/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
12*77c1e3ccSAndroid Build Coastguard Worker
13*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
14*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
15*77c1e3ccSAndroid Build Coastguard Worker
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
18*77c1e3ccSAndroid Build Coastguard Worker
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_dsp_common.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/transpose_neon.h"
22*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
23*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/convolve.h"
24*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/filter.h"
25*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/arm/convolve_neon.h"
26*77c1e3ccSAndroid Build Coastguard Worker
// Computes four outputs of a 12-tap horizontal filter.
//
// s0..s11      : twelve vectors of four 16-bit samples; vector s<k> is
//                multiplied by filter tap k and accumulated lane-wise.
// x_filter_0_7 : filter taps 0..7.
// x_filter_8_11: filter taps 8..11.
// horiz_const  : initial value of the 32-bit accumulator (the caller uses it
//                to inject a rounding offset).
//
// Returns the four 32-bit sums after a rounding right shift by FILTER_BITS,
// saturated and narrowed to 16 bits.
static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x4_t s8, const int16x4_t s9,
                                       const int16x4_t s10, const int16x4_t s11,
                                       const int16x8_t x_filter_0_7,
                                       const int16x4_t x_filter_8_11,
                                       const int32x4_t horiz_const) {
  // The lane-indexed multiply-accumulate takes a 64-bit coefficient vector, so
  // split the 8-tap register into two halves.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  // Widening (16x16 -> 32 bit) multiply-accumulate, one tap per instruction.
  int32x4_t sum = horiz_const;
  sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);

  return vqrshrn_n_s32(sum, FILTER_BITS);
}
55*77c1e3ccSAndroid Build Coastguard Worker
// Horizontal 12-tap convolution of a w x h block of 8-bit pixels.
//
// src_ptr     : first source sample of the first filter window of row 0 (the
//               window starts at src_ptr itself; no offset is applied here).
// src_stride  : distance in bytes between consecutive source rows.
// dst_ptr     : destination of the first output pixel.
// dst_stride  : distance in bytes between consecutive destination rows.
// w, h        : block width and height. Both loops exit on `!= 0`, so on
//               AArch64 w and h must be positive multiples of 4; on the
//               32-bit path w must be a positive multiple of 4 and h must be
//               positive.
// x_filter_ptr: twelve 16-bit filter taps.
//
// Results are saturated to 8 bits before being stored.
static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
                                            int src_stride, uint8_t *dst_ptr,
                                            const int dst_stride, int w, int h,
                                            const int16_t *x_filter_ptr) {
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
  // shift by FILTER_BITS - instead of a first rounding right shift by
  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
  // ROUND0_BITS.
  const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));

#if AOM_ARCH_AARCH64
  // Process four rows per outer iteration. Source data is loaded and
  // transposed by project helpers so that each s<k> vector holds the sample at
  // horizontal position k for the four rows being processed.
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int width = w;

    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    // After the transpose the low halves supply positions 0..3 and the high
    // halves supply positions 4..7.
    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

    load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    // Positions 8..10 complete the eleven samples carried between iterations.
    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    s += 11;

    do {
      // Fetch the next four positions (11..14 relative to the current window).
      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      // d<k> holds output column k for the four rows.
      int16x4_t d0 =
          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d1 =
          convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d2 =
          convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d3 =
          convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                         x_filter_0_7, x_filter_8_11, horiz_const);

      // Transpose back so that each vector holds one row of four pixels.
      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
      uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));

      store_u8x4_strided_x2(d, dst_stride, d01);
      store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);

      // Slide the window four positions to the right.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    h -= 4;
  } while (h != 0);

#else   // !AOM_ARCH_AARCH64
  // Process one row at a time, four output pixels per inner iteration.
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int width = w;

    do {
      // A full 16-byte load per group of four outputs; samples 0..14 are used.
      uint8x16_t t0 = vld1q_u8(s);
      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
      int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));

      int16x4_t s0 = vget_low_s16(tt0);
      int16x4_t s4 = vget_high_s16(tt0);
      int16x4_t s8 = vget_low_s16(tt8);
      int16x4_t s12 = vget_high_s16(tt8);

      // Build the shifted windows by extracting across adjacent vectors.
      int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
      int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
      int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
      int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
      int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
      int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
      int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14

      int16x4_t d0 =
          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                         x_filter_0_7, x_filter_8_11, horiz_const);

      // Only the low four lanes carry results; the upper half is zero padding.
      uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));

      store_u8_4x1(d, dd0);

      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--h != 0);
#endif  // AOM_ARCH_AARCH64
}
190*77c1e3ccSAndroid Build Coastguard Worker
// Computes eight outputs of a 4-tap horizontal filter using 16-bit
// accumulation.
//
// s0..s3     : four vectors of eight 16-bit samples; vector s<k> is multiplied
//              by lane k of `filter`.
// filter     : the four filter taps, already halved by the caller.
// horiz_const: initial value of the accumulator (rounding offset supplied by
//              the caller).
//
// Returns the sums after a rounding right shift by FILTER_BITS - 1, saturated
// to unsigned 8 bits.
static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x4_t filter,
                                      int16x8_t horiz_const) {
  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter, 0);
  sum = vmlaq_lane_s16(sum, s1, filter, 1);
  sum = vmlaq_lane_s16(sum, s2, filter, 2);
  sum = vmlaq_lane_s16(sum, s3, filter, 3);
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
203*77c1e3ccSAndroid Build Coastguard Worker
// Horizontal 4-tap convolution of a w x h block of 8-bit pixels.
//
// src_ptr     : first source sample of the first 4-sample filter window of
//               row 0.
// src_stride  : distance in bytes between consecutive source rows.
// dst_ptr     : destination of the first output pixel.
// dst_stride  : distance in bytes between consecutive destination rows.
// w, h        : block width and height. Two rows are produced per iteration
//               and the loops exit on `!= 0`, so h must be a positive multiple
//               of 2; when w != 4, w must be a positive multiple of 8.
// x_filter_ptr: filter kernel; only the four taps at indices 2..5 are read.
static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16_t *x_filter_ptr) {
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
  // rounding right shift by FILTER_BITS - instead of a first rounding right
  // shift by ROUND0_BITS, followed by second rounding right shift by
  // FILTER_BITS - ROUND0_BITS.
  // The outermost -1 is needed because we will halve the filter values.
  const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));

  if (w == 4) {
    do {
      // Each load starts one sample further right, giving the four filter
      // windows. NOTE(review): load_unaligned_u8 is a project helper;
      // presumably it packs 4 bytes from each of two consecutive rows into
      // one vector - confirm against mem_neon.h.
      uint8x8_t t01[4];
      t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
      t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
      t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
      t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);

      int16x8_t s01[4];
      s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
      s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
      s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
      s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));

      uint8x8_t d01 =
          convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);

      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      int width = w;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      do {
        // A "stride" of 1 is passed to the 8x4 loader so that the four
        // vectors start at s, s + 1, s + 2 and s + 3 within the same row,
        // i.e. the four overlapping filter windows for eight outputs.
        uint8x8_t t0[4], t1[4];
        load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
        load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);

        int16x8_t s0[4], s1[4];
        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));

        s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
        s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
        s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
        s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));

        uint8x8_t d0 =
            convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
        uint8x8_t d1 =
            convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);

        store_u8_8x2(d, dst_stride, d0, d1);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  }
}
281*77c1e3ccSAndroid Build Coastguard Worker
// Computes eight outputs of an 8-tap horizontal filter using 16-bit
// accumulation.
//
// s0..s7     : eight vectors of eight 16-bit samples; vector s<k> is
//              multiplied by lane k of `filter`.
// filter     : the eight filter taps, already halved by the caller.
// horiz_const: initial value of the accumulator (rounding offset supplied by
//              the caller).
//
// Returns the sums after a rounding right shift by FILTER_BITS - 1, saturated
// to unsigned 8 bits.
static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter,
                                      const int16x8_t horiz_const) {
  // The lane-indexed multiply-accumulate takes a 64-bit coefficient vector, so
  // split the taps into two halves.
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  // We halved the convolution filter values so - 1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
304*77c1e3ccSAndroid Build Coastguard Worker
av1_convolve_x_sr_neon(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params)305*77c1e3ccSAndroid Build Coastguard Worker void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
306*77c1e3ccSAndroid Build Coastguard Worker int dst_stride, int w, int h,
307*77c1e3ccSAndroid Build Coastguard Worker const InterpFilterParams *filter_params_x,
308*77c1e3ccSAndroid Build Coastguard Worker const int subpel_x_qn,
309*77c1e3ccSAndroid Build Coastguard Worker ConvolveParams *conv_params) {
310*77c1e3ccSAndroid Build Coastguard Worker if (w == 2 || h == 2) {
311*77c1e3ccSAndroid Build Coastguard Worker av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
312*77c1e3ccSAndroid Build Coastguard Worker subpel_x_qn, conv_params);
313*77c1e3ccSAndroid Build Coastguard Worker return;
314*77c1e3ccSAndroid Build Coastguard Worker }
315*77c1e3ccSAndroid Build Coastguard Worker
316*77c1e3ccSAndroid Build Coastguard Worker const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
317*77c1e3ccSAndroid Build Coastguard Worker src -= horiz_offset;
318*77c1e3ccSAndroid Build Coastguard Worker
319*77c1e3ccSAndroid Build Coastguard Worker const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
320*77c1e3ccSAndroid Build Coastguard Worker filter_params_x, subpel_x_qn & SUBPEL_MASK);
321*77c1e3ccSAndroid Build Coastguard Worker
322*77c1e3ccSAndroid Build Coastguard Worker int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
323*77c1e3ccSAndroid Build Coastguard Worker
324*77c1e3ccSAndroid Build Coastguard Worker if (filter_taps > 8) {
325*77c1e3ccSAndroid Build Coastguard Worker convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
326*77c1e3ccSAndroid Build Coastguard Worker x_filter_ptr);
327*77c1e3ccSAndroid Build Coastguard Worker return;
328*77c1e3ccSAndroid Build Coastguard Worker }
329*77c1e3ccSAndroid Build Coastguard Worker
330*77c1e3ccSAndroid Build Coastguard Worker if (filter_taps <= 4) {
331*77c1e3ccSAndroid Build Coastguard Worker convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
332*77c1e3ccSAndroid Build Coastguard Worker x_filter_ptr);
333*77c1e3ccSAndroid Build Coastguard Worker return;
334*77c1e3ccSAndroid Build Coastguard Worker }
335*77c1e3ccSAndroid Build Coastguard Worker
336*77c1e3ccSAndroid Build Coastguard Worker // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
337*77c1e3ccSAndroid Build Coastguard Worker // rounding right shift by FILTER_BITS - instead of a first rounding right
338*77c1e3ccSAndroid Build Coastguard Worker // shift by ROUND0_BITS, followed by second rounding right shift by
339*77c1e3ccSAndroid Build Coastguard Worker // FILTER_BITS - ROUND0_BITS.
340*77c1e3ccSAndroid Build Coastguard Worker // The outermost -1 is needed because we will halve the filter values.
341*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
342*77c1e3ccSAndroid Build Coastguard Worker
343*77c1e3ccSAndroid Build Coastguard Worker // Filter values are even so halve to reduce precision requirements.
344*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
345*77c1e3ccSAndroid Build Coastguard Worker
346*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
347*77c1e3ccSAndroid Build Coastguard Worker while (h >= 8) {
348*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
349*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
350*77c1e3ccSAndroid Build Coastguard Worker
351*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
352*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
353*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
354*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
355*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
356*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
357*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
358*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
359*77c1e3ccSAndroid Build Coastguard Worker
360*77c1e3ccSAndroid Build Coastguard Worker int width = w;
361*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src + 7;
362*77c1e3ccSAndroid Build Coastguard Worker uint8_t *d = dst;
363*77c1e3ccSAndroid Build Coastguard Worker
364*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 0 * dst_stride);
365*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 1 * dst_stride);
366*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 2 * dst_stride);
367*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 3 * dst_stride);
368*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 4 * dst_stride);
369*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 5 * dst_stride);
370*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 6 * dst_stride);
371*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d + 7 * dst_stride);
372*77c1e3ccSAndroid Build Coastguard Worker
373*77c1e3ccSAndroid Build Coastguard Worker do {
374*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t8, t9, t10, t11, t12, t13, t14;
375*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
376*77c1e3ccSAndroid Build Coastguard Worker
377*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
378*77c1e3ccSAndroid Build Coastguard Worker &t14);
379*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
380*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
381*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
382*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
383*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
384*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
385*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
386*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
387*77c1e3ccSAndroid Build Coastguard Worker
388*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d0 =
389*77c1e3ccSAndroid Build Coastguard Worker convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
390*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d1 =
391*77c1e3ccSAndroid Build Coastguard Worker convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
392*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d2 =
393*77c1e3ccSAndroid Build Coastguard Worker convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
394*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d3 =
395*77c1e3ccSAndroid Build Coastguard Worker convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
396*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
397*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
398*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
399*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
400*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
401*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
402*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
403*77c1e3ccSAndroid Build Coastguard Worker x_filter, horiz_const);
404*77c1e3ccSAndroid Build Coastguard Worker
405*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
406*77c1e3ccSAndroid Build Coastguard Worker
407*77c1e3ccSAndroid Build Coastguard Worker store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
408*77c1e3ccSAndroid Build Coastguard Worker
409*77c1e3ccSAndroid Build Coastguard Worker s0 = s8;
410*77c1e3ccSAndroid Build Coastguard Worker s1 = s9;
411*77c1e3ccSAndroid Build Coastguard Worker s2 = s10;
412*77c1e3ccSAndroid Build Coastguard Worker s3 = s11;
413*77c1e3ccSAndroid Build Coastguard Worker s4 = s12;
414*77c1e3ccSAndroid Build Coastguard Worker s5 = s13;
415*77c1e3ccSAndroid Build Coastguard Worker s6 = s14;
416*77c1e3ccSAndroid Build Coastguard Worker s += 8;
417*77c1e3ccSAndroid Build Coastguard Worker d += 8;
418*77c1e3ccSAndroid Build Coastguard Worker width -= 8;
419*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
420*77c1e3ccSAndroid Build Coastguard Worker src += 8 * src_stride;
421*77c1e3ccSAndroid Build Coastguard Worker dst += 8 * dst_stride;
422*77c1e3ccSAndroid Build Coastguard Worker h -= 8;
423*77c1e3ccSAndroid Build Coastguard Worker }
424*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_ARCH_AARCH64
425*77c1e3ccSAndroid Build Coastguard Worker
426*77c1e3ccSAndroid Build Coastguard Worker while (h-- != 0) {
427*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
428*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
429*77c1e3ccSAndroid Build Coastguard Worker
430*77c1e3ccSAndroid Build Coastguard Worker int width = w;
431*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src + 8;
432*77c1e3ccSAndroid Build Coastguard Worker uint8_t *d = dst;
433*77c1e3ccSAndroid Build Coastguard Worker
434*77c1e3ccSAndroid Build Coastguard Worker __builtin_prefetch(d);
435*77c1e3ccSAndroid Build Coastguard Worker
436*77c1e3ccSAndroid Build Coastguard Worker do {
437*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
438*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
439*77c1e3ccSAndroid Build Coastguard Worker
440*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
441*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
442*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
443*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
444*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
445*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
446*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
447*77c1e3ccSAndroid Build Coastguard Worker
448*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d0 =
449*77c1e3ccSAndroid Build Coastguard Worker convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
450*77c1e3ccSAndroid Build Coastguard Worker
451*77c1e3ccSAndroid Build Coastguard Worker vst1_u8(d, d0);
452*77c1e3ccSAndroid Build Coastguard Worker
453*77c1e3ccSAndroid Build Coastguard Worker s0 = s8;
454*77c1e3ccSAndroid Build Coastguard Worker s += 8;
455*77c1e3ccSAndroid Build Coastguard Worker d += 8;
456*77c1e3ccSAndroid Build Coastguard Worker width -= 8;
457*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
458*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
459*77c1e3ccSAndroid Build Coastguard Worker dst += dst_stride;
460*77c1e3ccSAndroid Build Coastguard Worker }
461*77c1e3ccSAndroid Build Coastguard Worker }
462*77c1e3ccSAndroid Build Coastguard Worker
// Applies a 4-tap filter across eight 16-bit lanes. Lane k of `filter`
// multiplies input s<k>; the four products are accumulated in 16-bit lanes and
// the total is narrowed to unsigned 8-bit with rounding and saturation.
static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x4_t filter) {
  const int16x8_t acc0 = vmulq_lane_s16(s0, filter, 0);
  const int16x8_t acc1 = vmlaq_lane_s16(acc0, s1, filter, 1);
  const int16x8_t acc2 = vmlaq_lane_s16(acc1, s2, filter, 2);
  const int16x8_t acc3 = vmlaq_lane_s16(acc2, s3, filter, 3);

  // The taps arrive pre-halved, so shift by one bit less than FILTER_BITS.
  return vqrshrun_n_s16(acc3, FILTER_BITS - 1);
}
474*77c1e3ccSAndroid Build Coastguard Worker
// Vertical 4-tap convolution of a w x h block of 8-bit pixels from `src` into
// `dst`. The taps are read from filter_y[2..5]. Every inner iteration emits
// four output rows, so h is consumed in steps of 4; the wide path additionally
// consumes w in steps of 8.
static inline void convolve_y_sr_4tap_neon(const uint8_t *src,
                                           const int src_stride, uint8_t *dst,
                                           const int dst_stride, int w, int h,
                                           const int16_t *filter_y) {
  // Every filter value is even, so halving them is lossless and lowers the
  // precision the 16-bit accumulators have to hold.
  const int16x4_t taps = vshr_n_s16(vld1_s16(filter_y + 2), 1);

  if (w != 4) {
    // Wide path: process the block one 8-pixel column at a time, filtering
    // each column from top to bottom.
    do {
      uint8x8_t p0, p1, p2;
      load_u8_8x3(src, src_stride, &p0, &p1, &p2);

      // The three rows that seed the sliding window for this column.
      int16x8_t r0 = vreinterpretq_s16_u16(vmovl_u8(p0));
      int16x8_t r1 = vreinterpretq_s16_u16(vmovl_u8(p1));
      int16x8_t r2 = vreinterpretq_s16_u16(vmovl_u8(p2));

      const uint8_t *in = src + 3 * src_stride;
      uint8_t *out = dst;
      int rows_left = h;

      do {
        uint8x8_t p3, p4, p5, p6;
        load_u8_8x4(in, src_stride, &p3, &p4, &p5, &p6);

        const int16x8_t r3 = vreinterpretq_s16_u16(vmovl_u8(p3));
        const int16x8_t r4 = vreinterpretq_s16_u16(vmovl_u8(p4));
        const int16x8_t r5 = vreinterpretq_s16_u16(vmovl_u8(p5));
        const int16x8_t r6 = vreinterpretq_s16_u16(vmovl_u8(p6));

        const uint8x8_t o0 = convolve4_8_y(r0, r1, r2, r3, taps);
        const uint8x8_t o1 = convolve4_8_y(r1, r2, r3, r4, taps);
        const uint8x8_t o2 = convolve4_8_y(r2, r3, r4, r5, taps);
        const uint8x8_t o3 = convolve4_8_y(r3, r4, r5, r6, taps);

        store_u8_8x4(out, dst_stride, o0, o1, o2, o3);

        // Slide the window down by four rows.
        r0 = r4;
        r1 = r5;
        r2 = r6;

        in += 4 * src_stride;
        out += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
    return;
  }

  // 4-wide path. NOTE(review): load_unaligned_u8 presumably packs four bytes
  // from each of two consecutive rows into one vector, so "p01" would hold
  // rows 0 and 1 - confirm against mem_neon.h.
  uint8x8_t p01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
  uint8x8_t p12 = load_unaligned_u8(src + 1 * src_stride, src_stride);

  int16x8_t r01 = vreinterpretq_s16_u16(vmovl_u8(p01));
  int16x8_t r12 = vreinterpretq_s16_u16(vmovl_u8(p12));

  src += 2 * src_stride;

  do {
    const uint8x8_t p23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
    const uint8x8_t p34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
    const uint8x8_t p45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
    const uint8x8_t p56 = load_unaligned_u8(src + 3 * src_stride, src_stride);

    const int16x8_t r23 = vreinterpretq_s16_u16(vmovl_u8(p23));
    const int16x8_t r34 = vreinterpretq_s16_u16(vmovl_u8(p34));
    const int16x8_t r45 = vreinterpretq_s16_u16(vmovl_u8(p45));
    const int16x8_t r56 = vreinterpretq_s16_u16(vmovl_u8(p56));

    // Each call filters one packed row pair.
    const uint8x8_t o01 = convolve4_8_y(r01, r12, r23, r34, taps);
    const uint8x8_t o23 = convolve4_8_y(r23, r34, r45, r56, taps);

    store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, o01);
    store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, o23);

    // Carry the last two row pairs into the next iteration.
    r01 = r45;
    r12 = r56;

    src += 4 * src_stride;
    dst += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
559*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates a 6-tap filter over four 16-bit lanes and returns the raw 16-bit
// sum; no rounding or narrowing happens here. Taps are taken from lanes 1..6
// of `y_filter_0_7`, with lane k + 1 weighting input s<k>.
static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x8_t y_filter_0_7) {
  const int16x4_t taps_lo = vget_low_s16(y_filter_0_7);
  const int16x4_t taps_hi = vget_high_s16(y_filter_0_7);

  // Lanes 0 and 7 of the filter are zero, so they are skipped.
  int16x4_t acc = vmul_lane_s16(s0, taps_lo, 1);
  acc = vmla_lane_s16(acc, s1, taps_lo, 2);
  acc = vmla_lane_s16(acc, s2, taps_lo, 3);
  acc = vmla_lane_s16(acc, s3, taps_hi, 0);
  acc = vmla_lane_s16(acc, s4, taps_hi, 1);
  acc = vmla_lane_s16(acc, s5, taps_hi, 2);
  return acc;
}
577*77c1e3ccSAndroid Build Coastguard Worker
// Applies a 6-tap filter across eight 16-bit lanes and narrows the sum to
// unsigned 8-bit with rounding and saturation. Taps are taken from lanes 1..6
// of `y_filters`, with lane k + 1 weighting input s<k>.
static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t y_filters) {
  const int16x4_t taps_lo = vget_low_s16(y_filters);
  const int16x4_t taps_hi = vget_high_s16(y_filters);

  // Lanes 0 and 7 of the filter are zero, so they are skipped.
  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 1);
  acc = vmlaq_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlaq_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlaq_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlaq_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlaq_lane_s16(acc, s5, taps_hi, 2);

  // The filter values were halved beforehand, hence the shift of
  // FILTER_BITS - 1 rather than FILTER_BITS.
  return vqrshrun_n_s16(acc, FILTER_BITS - 1);
}
595*77c1e3ccSAndroid Build Coastguard Worker
// Vertical 6-tap convolution of a w x h block of 8-bit pixels from `src_ptr`
// into `dst_ptr`. `y_filter` supplies the taps in lanes 1..6 (the convolve6
// helpers never read lanes 0 and 7). On AArch64 each iteration produces four
// output rows; on other targets it produces one.
static inline void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16x8_t y_filter) {
  if (w <= 4) {
    // Narrow path: only the low four lanes of every row are filtered.
    const uint8x8_t p0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
    const uint8x8_t p1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
    const uint8x8_t p2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
    const uint8x8_t p3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
    const uint8x8_t p4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);

    // Five rows seed the sliding window; the sixth is loaded inside the loop.
    int16x4_t r0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p0)));
    int16x4_t r1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p1)));
    int16x4_t r2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p2)));
    int16x4_t r3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p3)));
    int16x4_t r4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p4)));

    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      const uint8x8_t p5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
      const uint8x8_t p6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
      const uint8x8_t p7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
      const uint8x8_t p8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);

      const int16x4_t r5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p5)));
      const int16x4_t r6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p6)));
      const int16x4_t r7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p7)));
      const int16x4_t r8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p8)));

      const int16x4_t o0 = convolve6_4_y(r0, r1, r2, r3, r4, r5, y_filter);
      const int16x4_t o1 = convolve6_4_y(r1, r2, r3, r4, r5, r6, y_filter);
      const int16x4_t o2 = convolve6_4_y(r2, r3, r4, r5, r6, r7, y_filter);
      const int16x4_t o3 = convolve6_4_y(r3, r4, r5, r6, r7, r8, y_filter);

      // Pair up the rows and narrow; the filter values were halved, so the
      // shift is FILTER_BITS - 1.
      const uint8x8_t o01 =
          vqrshrun_n_s16(vcombine_s16(o0, o1), FILTER_BITS - 1);
      const uint8x8_t o23 =
          vqrshrun_n_s16(vcombine_s16(o2, o3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr, dst_stride, o01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, o23);

      // Slide the window down by four rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      const uint8x8_t p5 = load_unaligned_u8_4x1(src_ptr);
      const int16x4_t r5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(p5)));

      const int16x4_t o0 = convolve6_4_y(r0, r1, r2, r3, r4, r5, y_filter);
      // Pad the upper half with zeros for the narrowing shift; the filter
      // values were halved, so the shift is FILTER_BITS - 1.
      const uint8x8_t o01 =
          vqrshrun_n_s16(vcombine_s16(o0, vdup_n_s16(0)), FILTER_BITS - 1);

      store_u8_4x1(dst_ptr, o01);

      // Slide the window down by one row.
      r0 = r1;
      r1 = r2;
      r2 = r3;
      r3 = r4;
      r4 = r5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);

  } else {
    // Wide path: process the block one 8-pixel column at a time.
    do {
      const uint8_t *in = src_ptr;
      uint8_t *out = dst_ptr;
      int rows_left = h;

      uint8x8_t p0, p1, p2, p3, p4;
      load_u8_8x5(in, src_stride, &p0, &p1, &p2, &p3, &p4);

      // Five rows seed the sliding window for this column.
      int16x8_t r0 = vreinterpretq_s16_u16(vmovl_u8(p0));
      int16x8_t r1 = vreinterpretq_s16_u16(vmovl_u8(p1));
      int16x8_t r2 = vreinterpretq_s16_u16(vmovl_u8(p2));
      int16x8_t r3 = vreinterpretq_s16_u16(vmovl_u8(p3));
      int16x8_t r4 = vreinterpretq_s16_u16(vmovl_u8(p4));

      in += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t p5, p6, p7, p8;
        load_u8_8x4(in, src_stride, &p5, &p6, &p7, &p8);

        const int16x8_t r5 = vreinterpretq_s16_u16(vmovl_u8(p5));
        const int16x8_t r6 = vreinterpretq_s16_u16(vmovl_u8(p6));
        const int16x8_t r7 = vreinterpretq_s16_u16(vmovl_u8(p7));
        const int16x8_t r8 = vreinterpretq_s16_u16(vmovl_u8(p8));

        const uint8x8_t o0 = convolve6_8_y(r0, r1, r2, r3, r4, r5, y_filter);
        const uint8x8_t o1 = convolve6_8_y(r1, r2, r3, r4, r5, r6, y_filter);
        const uint8x8_t o2 = convolve6_8_y(r2, r3, r4, r5, r6, r7, y_filter);
        const uint8x8_t o3 = convolve6_8_y(r3, r4, r5, r6, r7, r8, y_filter);

        store_u8_8x4(out, dst_stride, o0, o1, o2, o3);

        // Slide the window down by four rows.
        r0 = r4;
        r1 = r5;
        r2 = r6;
        r3 = r7;
        r4 = r8;
        in += 4 * src_stride;
        out += 4 * dst_stride;
        rows_left -= 4;
#else   // !AOM_ARCH_AARCH64
        const int16x8_t r5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(in)));

        const uint8x8_t o0 = convolve6_8_y(r0, r1, r2, r3, r4, r5, y_filter);

        vst1_u8(out, o0);

        // Slide the window down by one row.
        r0 = r1;
        r1 = r2;
        r2 = r3;
        r3 = r4;
        r4 = r5;
        in += src_stride;
        out += dst_stride;
        rows_left--;
#endif  // AOM_ARCH_AARCH64
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
734*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates an 8-tap filter over four 16-bit lanes and returns the raw
// 16-bit sum; no rounding or narrowing happens here. Lane k of `filter`
// weights input s<k>.
static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16x8_t filter) {
  // The lane-indexed multiplies take a 4-lane vector, so split the taps.
  const int16x4_t taps_0_3 = vget_low_s16(filter);
  const int16x4_t taps_4_7 = vget_high_s16(filter);

  int16x4_t acc = vmul_lane_s16(s0, taps_0_3, 0);
  acc = vmla_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmla_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmla_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmla_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmla_lane_s16(acc, s5, taps_4_7, 1);
  acc = vmla_lane_s16(acc, s6, taps_4_7, 2);
  acc = vmla_lane_s16(acc, s7, taps_4_7, 3);
  return acc;
}
754*77c1e3ccSAndroid Build Coastguard Worker
// Applies an 8-tap filter across eight 16-bit lanes and narrows the sum to
// unsigned 8-bit with rounding and saturation. Lane k of `filter` weights
// input s<k>.
static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter) {
  // The lane-indexed multiplies take a 4-lane vector, so split the taps.
  const int16x4_t taps_0_3 = vget_low_s16(filter);
  const int16x4_t taps_4_7 = vget_high_s16(filter);

  int16x8_t acc = vmulq_lane_s16(s0, taps_0_3, 0);
  acc = vmlaq_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmlaq_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmlaq_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmlaq_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmlaq_lane_s16(acc, s5, taps_4_7, 1);
  acc = vmlaq_lane_s16(acc, s6, taps_4_7, 2);
  acc = vmlaq_lane_s16(acc, s7, taps_4_7, 3);

  // The filter values were halved beforehand, hence the shift of
  // FILTER_BITS - 1 rather than FILTER_BITS.
  return vqrshrun_n_s16(acc, FILTER_BITS - 1);
}
775*77c1e3ccSAndroid Build Coastguard Worker
convolve_y_sr_8tap_neon(const uint8_t * src_ptr,int src_stride,uint8_t * dst_ptr,const int dst_stride,int w,int h,const int16x8_t y_filter)776*77c1e3ccSAndroid Build Coastguard Worker static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
777*77c1e3ccSAndroid Build Coastguard Worker int src_stride, uint8_t *dst_ptr,
778*77c1e3ccSAndroid Build Coastguard Worker const int dst_stride, int w, int h,
779*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t y_filter) {
780*77c1e3ccSAndroid Build Coastguard Worker if (w <= 4) {
781*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
782*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
783*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
784*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
785*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
786*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
787*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);
788*77c1e3ccSAndroid Build Coastguard Worker
789*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
790*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
791*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
792*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
793*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
794*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
795*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
796*77c1e3ccSAndroid Build Coastguard Worker
797*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 7 * src_stride;
798*77c1e3ccSAndroid Build Coastguard Worker
799*77c1e3ccSAndroid Build Coastguard Worker do {
800*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
801*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
802*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
803*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
804*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
805*77c1e3ccSAndroid Build Coastguard Worker
806*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
807*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
808*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
809*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
810*77c1e3ccSAndroid Build Coastguard Worker
811*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
812*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
813*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
814*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
815*77c1e3ccSAndroid Build Coastguard Worker
816*77c1e3ccSAndroid Build Coastguard Worker // We halved the convolution filter values so -1 from the right shift.
817*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
818*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
819*77c1e3ccSAndroid Build Coastguard Worker
820*77c1e3ccSAndroid Build Coastguard Worker store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
821*77c1e3ccSAndroid Build Coastguard Worker store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
822*77c1e3ccSAndroid Build Coastguard Worker
823*77c1e3ccSAndroid Build Coastguard Worker s0 = s4;
824*77c1e3ccSAndroid Build Coastguard Worker s1 = s5;
825*77c1e3ccSAndroid Build Coastguard Worker s2 = s6;
826*77c1e3ccSAndroid Build Coastguard Worker s3 = s7;
827*77c1e3ccSAndroid Build Coastguard Worker s4 = s8;
828*77c1e3ccSAndroid Build Coastguard Worker s5 = s9;
829*77c1e3ccSAndroid Build Coastguard Worker s6 = s10;
830*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 4 * src_stride;
831*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += 4 * dst_stride;
832*77c1e3ccSAndroid Build Coastguard Worker h -= 4;
833*77c1e3ccSAndroid Build Coastguard Worker #else // !AOM_ARCH_AARCH64
834*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
835*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
836*77c1e3ccSAndroid Build Coastguard Worker
837*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
838*77c1e3ccSAndroid Build Coastguard Worker // We halved the convolution filter values so -1 from the right shift.
839*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d01 =
840*77c1e3ccSAndroid Build Coastguard Worker vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
841*77c1e3ccSAndroid Build Coastguard Worker
842*77c1e3ccSAndroid Build Coastguard Worker store_u8_4x1(dst_ptr, d01);
843*77c1e3ccSAndroid Build Coastguard Worker
844*77c1e3ccSAndroid Build Coastguard Worker s0 = s1;
845*77c1e3ccSAndroid Build Coastguard Worker s1 = s2;
846*77c1e3ccSAndroid Build Coastguard Worker s2 = s3;
847*77c1e3ccSAndroid Build Coastguard Worker s3 = s4;
848*77c1e3ccSAndroid Build Coastguard Worker s4 = s5;
849*77c1e3ccSAndroid Build Coastguard Worker s5 = s6;
850*77c1e3ccSAndroid Build Coastguard Worker s6 = s7;
851*77c1e3ccSAndroid Build Coastguard Worker src_ptr += src_stride;
852*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += dst_stride;
853*77c1e3ccSAndroid Build Coastguard Worker h--;
854*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_ARCH_AARCH64
855*77c1e3ccSAndroid Build Coastguard Worker } while (h != 0);
856*77c1e3ccSAndroid Build Coastguard Worker } else {
857*77c1e3ccSAndroid Build Coastguard Worker do {
858*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src_ptr;
859*77c1e3ccSAndroid Build Coastguard Worker uint8_t *d = dst_ptr;
860*77c1e3ccSAndroid Build Coastguard Worker int height = h;
861*77c1e3ccSAndroid Build Coastguard Worker
862*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0, t1, t2, t3, t4, t5, t6;
863*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
864*77c1e3ccSAndroid Build Coastguard Worker
865*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
866*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
867*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
868*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
869*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
870*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
871*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
872*77c1e3ccSAndroid Build Coastguard Worker
873*77c1e3ccSAndroid Build Coastguard Worker s += 7 * src_stride;
874*77c1e3ccSAndroid Build Coastguard Worker
875*77c1e3ccSAndroid Build Coastguard Worker do {
876*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
877*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t7, t8, t9, t10;
878*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
879*77c1e3ccSAndroid Build Coastguard Worker
880*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
881*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
882*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
883*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
884*77c1e3ccSAndroid Build Coastguard Worker
885*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
886*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
887*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
888*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
889*77c1e3ccSAndroid Build Coastguard Worker
890*77c1e3ccSAndroid Build Coastguard Worker store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
891*77c1e3ccSAndroid Build Coastguard Worker
892*77c1e3ccSAndroid Build Coastguard Worker s0 = s4;
893*77c1e3ccSAndroid Build Coastguard Worker s1 = s5;
894*77c1e3ccSAndroid Build Coastguard Worker s2 = s6;
895*77c1e3ccSAndroid Build Coastguard Worker s3 = s7;
896*77c1e3ccSAndroid Build Coastguard Worker s4 = s8;
897*77c1e3ccSAndroid Build Coastguard Worker s5 = s9;
898*77c1e3ccSAndroid Build Coastguard Worker s6 = s10;
899*77c1e3ccSAndroid Build Coastguard Worker s += 4 * src_stride;
900*77c1e3ccSAndroid Build Coastguard Worker d += 4 * dst_stride;
901*77c1e3ccSAndroid Build Coastguard Worker height -= 4;
902*77c1e3ccSAndroid Build Coastguard Worker #else // !AOM_ARCH_AARCH64
903*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
904*77c1e3ccSAndroid Build Coastguard Worker
905*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
906*77c1e3ccSAndroid Build Coastguard Worker
907*77c1e3ccSAndroid Build Coastguard Worker vst1_u8(d, d0);
908*77c1e3ccSAndroid Build Coastguard Worker
909*77c1e3ccSAndroid Build Coastguard Worker s0 = s1;
910*77c1e3ccSAndroid Build Coastguard Worker s1 = s2;
911*77c1e3ccSAndroid Build Coastguard Worker s2 = s3;
912*77c1e3ccSAndroid Build Coastguard Worker s3 = s4;
913*77c1e3ccSAndroid Build Coastguard Worker s4 = s5;
914*77c1e3ccSAndroid Build Coastguard Worker s5 = s6;
915*77c1e3ccSAndroid Build Coastguard Worker s6 = s7;
916*77c1e3ccSAndroid Build Coastguard Worker s += src_stride;
917*77c1e3ccSAndroid Build Coastguard Worker d += dst_stride;
918*77c1e3ccSAndroid Build Coastguard Worker height--;
919*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_ARCH_AARCH64
920*77c1e3ccSAndroid Build Coastguard Worker } while (height != 0);
921*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 8;
922*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += 8;
923*77c1e3ccSAndroid Build Coastguard Worker w -= 8;
924*77c1e3ccSAndroid Build Coastguard Worker } while (w != 0);
925*77c1e3ccSAndroid Build Coastguard Worker }
926*77c1e3ccSAndroid Build Coastguard Worker }
927*77c1e3ccSAndroid Build Coastguard Worker
// 12-tap vertical filter kernel operating on four 16-bit lanes.
//
// s0..s11 are the twelve input sample vectors, one per filter tap. Taps 0-7
// are taken from y_filter_0_7 and taps 8-11 from y_filter_8_11.
//
// Returns the raw 16-bit accumulator; no rounding shift or narrowing is
// applied here, that is left to the caller.
static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x4_t s8, const int16x4_t s9,
                                       const int16x4_t s10, const int16x4_t s11,
                                       const int16x8_t y_filter_0_7,
                                       const int16x4_t y_filter_8_11) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);

  // Products for taps 5 and 6 (the largest filter taps) are formed on their
  // own so they can be folded in with saturating additions at the very end.
  const int16x4_t centre_5 = vmul_lane_s16(s5, taps_4_7, 1);
  const int16x4_t centre_6 = vmul_lane_s16(s6, taps_4_7, 2);

  // All remaining taps are accumulated with plain multiply-accumulates.
  int16x4_t acc = vmul_lane_s16(s0, taps_0_3, 0);
  acc = vmla_lane_s16(acc, s1, taps_0_3, 1);
  acc = vmla_lane_s16(acc, s2, taps_0_3, 2);
  acc = vmla_lane_s16(acc, s3, taps_0_3, 3);
  acc = vmla_lane_s16(acc, s4, taps_4_7, 0);
  acc = vmla_lane_s16(acc, s7, taps_4_7, 3);
  acc = vmla_lane_s16(acc, s8, y_filter_8_11, 0);
  acc = vmla_lane_s16(acc, s9, y_filter_8_11, 1);
  acc = vmla_lane_s16(acc, s10, y_filter_8_11, 2);
  acc = vmla_lane_s16(acc, s11, y_filter_8_11, 3);

  // Saturating adds keep the result inside 16-bit elements without overflow.
  // Tap 5 must be added before tap 6 to keep the saturation order unchanged.
  acc = vqadd_s16(acc, centre_5);
  return vqadd_s16(acc, centre_6);
}
959*77c1e3ccSAndroid Build Coastguard Worker
convolve12_8_y(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t s8,const int16x8_t s9,const int16x8_t s10,const int16x8_t s11,const int16x8_t y_filter_0_7,const int16x4_t y_filter_8_11)960*77c1e3ccSAndroid Build Coastguard Worker static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
961*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s2, const int16x8_t s3,
962*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s4, const int16x8_t s5,
963*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s6, const int16x8_t s7,
964*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s8, const int16x8_t s9,
965*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s10, const int16x8_t s11,
966*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t y_filter_0_7,
967*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t y_filter_8_11) {
968*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
969*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
970*77c1e3ccSAndroid Build Coastguard Worker int16x8_t sum;
971*77c1e3ccSAndroid Build Coastguard Worker
972*77c1e3ccSAndroid Build Coastguard Worker sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
973*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
974*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
975*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
976*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
977*77c1e3ccSAndroid Build Coastguard Worker
978*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
979*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0);
980*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1);
981*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
982*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);
983*77c1e3ccSAndroid Build Coastguard Worker
984*77c1e3ccSAndroid Build Coastguard Worker // Saturating addition is required for the largest filter taps to avoid
985*77c1e3ccSAndroid Build Coastguard Worker // overflow (while staying in 16-bit elements.)
986*77c1e3ccSAndroid Build Coastguard Worker sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
987*77c1e3ccSAndroid Build Coastguard Worker sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));
988*77c1e3ccSAndroid Build Coastguard Worker
989*77c1e3ccSAndroid Build Coastguard Worker return vqrshrun_n_s16(sum, FILTER_BITS);
990*77c1e3ccSAndroid Build Coastguard Worker }
991*77c1e3ccSAndroid Build Coastguard Worker
convolve_y_sr_12tap_neon(const uint8_t * src_ptr,int src_stride,uint8_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr)992*77c1e3ccSAndroid Build Coastguard Worker static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
993*77c1e3ccSAndroid Build Coastguard Worker int src_stride, uint8_t *dst_ptr,
994*77c1e3ccSAndroid Build Coastguard Worker int dst_stride, int w, int h,
995*77c1e3ccSAndroid Build Coastguard Worker const int16_t *y_filter_ptr) {
996*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
997*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
998*77c1e3ccSAndroid Build Coastguard Worker
999*77c1e3ccSAndroid Build Coastguard Worker if (w <= 4) {
1000*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
1001*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
1002*77c1e3ccSAndroid Build Coastguard Worker &t8, &t9, &t10);
1003*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1004*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1005*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1006*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1007*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
1008*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
1009*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
1010*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
1011*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
1012*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
1013*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
1014*77c1e3ccSAndroid Build Coastguard Worker
1015*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 11 * src_stride;
1016*77c1e3ccSAndroid Build Coastguard Worker
1017*77c1e3ccSAndroid Build Coastguard Worker do {
1018*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t11, t12, t13, t14;
1019*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);
1020*77c1e3ccSAndroid Build Coastguard Worker
1021*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
1022*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
1023*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
1024*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
1025*77c1e3ccSAndroid Build Coastguard Worker
1026*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1027*77c1e3ccSAndroid Build Coastguard Worker s11, y_filter_0_7, y_filter_8_11);
1028*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1029*77c1e3ccSAndroid Build Coastguard Worker s11, s12, y_filter_0_7, y_filter_8_11);
1030*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1031*77c1e3ccSAndroid Build Coastguard Worker s12, s13, y_filter_0_7, y_filter_8_11);
1032*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1033*77c1e3ccSAndroid Build Coastguard Worker s13, s14, y_filter_0_7, y_filter_8_11);
1034*77c1e3ccSAndroid Build Coastguard Worker
1035*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
1036*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
1037*77c1e3ccSAndroid Build Coastguard Worker
1038*77c1e3ccSAndroid Build Coastguard Worker store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
1039*77c1e3ccSAndroid Build Coastguard Worker store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
1040*77c1e3ccSAndroid Build Coastguard Worker
1041*77c1e3ccSAndroid Build Coastguard Worker s0 = s4;
1042*77c1e3ccSAndroid Build Coastguard Worker s1 = s5;
1043*77c1e3ccSAndroid Build Coastguard Worker s2 = s6;
1044*77c1e3ccSAndroid Build Coastguard Worker s3 = s7;
1045*77c1e3ccSAndroid Build Coastguard Worker s4 = s8;
1046*77c1e3ccSAndroid Build Coastguard Worker s5 = s9;
1047*77c1e3ccSAndroid Build Coastguard Worker s6 = s10;
1048*77c1e3ccSAndroid Build Coastguard Worker s7 = s11;
1049*77c1e3ccSAndroid Build Coastguard Worker s8 = s12;
1050*77c1e3ccSAndroid Build Coastguard Worker s9 = s13;
1051*77c1e3ccSAndroid Build Coastguard Worker s10 = s14;
1052*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 4 * src_stride;
1053*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += 4 * dst_stride;
1054*77c1e3ccSAndroid Build Coastguard Worker h -= 4;
1055*77c1e3ccSAndroid Build Coastguard Worker } while (h != 0);
1056*77c1e3ccSAndroid Build Coastguard Worker
1057*77c1e3ccSAndroid Build Coastguard Worker } else {
1058*77c1e3ccSAndroid Build Coastguard Worker do {
1059*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src_ptr;
1060*77c1e3ccSAndroid Build Coastguard Worker uint8_t *d = dst_ptr;
1061*77c1e3ccSAndroid Build Coastguard Worker int height = h;
1062*77c1e3ccSAndroid Build Coastguard Worker
1063*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
1064*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
1065*77c1e3ccSAndroid Build Coastguard Worker &t9, &t10);
1066*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1067*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1068*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1069*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1070*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1071*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1072*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1073*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
1074*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
1075*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
1076*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
1077*77c1e3ccSAndroid Build Coastguard Worker
1078*77c1e3ccSAndroid Build Coastguard Worker s += 11 * src_stride;
1079*77c1e3ccSAndroid Build Coastguard Worker
1080*77c1e3ccSAndroid Build Coastguard Worker do {
1081*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t11, t12, t13, t14;
1082*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);
1083*77c1e3ccSAndroid Build Coastguard Worker
1084*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
1085*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
1086*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
1087*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
1088*77c1e3ccSAndroid Build Coastguard Worker
1089*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
1090*77c1e3ccSAndroid Build Coastguard Worker s10, s11, y_filter_0_7, y_filter_8_11);
1091*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
1092*77c1e3ccSAndroid Build Coastguard Worker s11, s12, y_filter_0_7, y_filter_8_11);
1093*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1094*77c1e3ccSAndroid Build Coastguard Worker s12, s13, y_filter_0_7, y_filter_8_11);
1095*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1096*77c1e3ccSAndroid Build Coastguard Worker s13, s14, y_filter_0_7, y_filter_8_11);
1097*77c1e3ccSAndroid Build Coastguard Worker
1098*77c1e3ccSAndroid Build Coastguard Worker store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1099*77c1e3ccSAndroid Build Coastguard Worker
1100*77c1e3ccSAndroid Build Coastguard Worker s0 = s4;
1101*77c1e3ccSAndroid Build Coastguard Worker s1 = s5;
1102*77c1e3ccSAndroid Build Coastguard Worker s2 = s6;
1103*77c1e3ccSAndroid Build Coastguard Worker s3 = s7;
1104*77c1e3ccSAndroid Build Coastguard Worker s4 = s8;
1105*77c1e3ccSAndroid Build Coastguard Worker s5 = s9;
1106*77c1e3ccSAndroid Build Coastguard Worker s6 = s10;
1107*77c1e3ccSAndroid Build Coastguard Worker s7 = s11;
1108*77c1e3ccSAndroid Build Coastguard Worker s8 = s12;
1109*77c1e3ccSAndroid Build Coastguard Worker s9 = s13;
1110*77c1e3ccSAndroid Build Coastguard Worker s10 = s14;
1111*77c1e3ccSAndroid Build Coastguard Worker s += 4 * src_stride;
1112*77c1e3ccSAndroid Build Coastguard Worker d += 4 * dst_stride;
1113*77c1e3ccSAndroid Build Coastguard Worker height -= 4;
1114*77c1e3ccSAndroid Build Coastguard Worker } while (height != 0);
1115*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 8;
1116*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += 8;
1117*77c1e3ccSAndroid Build Coastguard Worker w -= 8;
1118*77c1e3ccSAndroid Build Coastguard Worker } while (w != 0);
1119*77c1e3ccSAndroid Build Coastguard Worker }
1120*77c1e3ccSAndroid Build Coastguard Worker }
1121*77c1e3ccSAndroid Build Coastguard Worker
av1_convolve_y_sr_neon(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)1122*77c1e3ccSAndroid Build Coastguard Worker void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
1123*77c1e3ccSAndroid Build Coastguard Worker int dst_stride, int w, int h,
1124*77c1e3ccSAndroid Build Coastguard Worker const InterpFilterParams *filter_params_y,
1125*77c1e3ccSAndroid Build Coastguard Worker const int subpel_y_qn) {
1126*77c1e3ccSAndroid Build Coastguard Worker if (w == 2 || h == 2) {
1127*77c1e3ccSAndroid Build Coastguard Worker av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
1128*77c1e3ccSAndroid Build Coastguard Worker subpel_y_qn);
1129*77c1e3ccSAndroid Build Coastguard Worker return;
1130*77c1e3ccSAndroid Build Coastguard Worker }
1131*77c1e3ccSAndroid Build Coastguard Worker
1132*77c1e3ccSAndroid Build Coastguard Worker const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1133*77c1e3ccSAndroid Build Coastguard Worker const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1134*77c1e3ccSAndroid Build Coastguard Worker const int vert_offset = clamped_y_taps / 2 - 1;
1135*77c1e3ccSAndroid Build Coastguard Worker
1136*77c1e3ccSAndroid Build Coastguard Worker src -= vert_offset * src_stride;
1137*77c1e3ccSAndroid Build Coastguard Worker
1138*77c1e3ccSAndroid Build Coastguard Worker const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1139*77c1e3ccSAndroid Build Coastguard Worker filter_params_y, subpel_y_qn & SUBPEL_MASK);
1140*77c1e3ccSAndroid Build Coastguard Worker
1141*77c1e3ccSAndroid Build Coastguard Worker if (y_filter_taps > 8) {
1142*77c1e3ccSAndroid Build Coastguard Worker convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
1143*77c1e3ccSAndroid Build Coastguard Worker y_filter_ptr);
1144*77c1e3ccSAndroid Build Coastguard Worker return;
1145*77c1e3ccSAndroid Build Coastguard Worker }
1146*77c1e3ccSAndroid Build Coastguard Worker
1147*77c1e3ccSAndroid Build Coastguard Worker // Filter values are even so halve to reduce precision requirements.
1148*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
1149*77c1e3ccSAndroid Build Coastguard Worker
1150*77c1e3ccSAndroid Build Coastguard Worker if (y_filter_taps <= 4) {
1151*77c1e3ccSAndroid Build Coastguard Worker convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
1152*77c1e3ccSAndroid Build Coastguard Worker y_filter_ptr);
1153*77c1e3ccSAndroid Build Coastguard Worker } else if (y_filter_taps == 6) {
1154*77c1e3ccSAndroid Build Coastguard Worker convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
1155*77c1e3ccSAndroid Build Coastguard Worker } else {
1156*77c1e3ccSAndroid Build Coastguard Worker convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
1157*77c1e3ccSAndroid Build Coastguard Worker }
1158*77c1e3ccSAndroid Build Coastguard Worker }
1159*77c1e3ccSAndroid Build Coastguard Worker
convolve12_4_2d_h(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x4_t s8,const int16x4_t s9,const int16x4_t s10,const int16x4_t s11,const int16x8_t x_filter_0_7,const int16x4_t x_filter_8_11,const int32x4_t horiz_const)1160*77c1e3ccSAndroid Build Coastguard Worker static inline int16x4_t convolve12_4_2d_h(
1161*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
1162*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
1163*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
1164*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
1165*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
1166*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t horiz_const) {
1167*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
1168*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
1169*77c1e3ccSAndroid Build Coastguard Worker
1170*77c1e3ccSAndroid Build Coastguard Worker int32x4_t sum = horiz_const;
1171*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
1172*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
1173*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
1174*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
1175*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
1176*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
1177*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
1178*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
1179*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
1180*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
1181*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
1182*77c1e3ccSAndroid Build Coastguard Worker sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
1183*77c1e3ccSAndroid Build Coastguard Worker
1184*77c1e3ccSAndroid Build Coastguard Worker return vshrn_n_s32(sum, ROUND0_BITS);
1185*77c1e3ccSAndroid Build Coastguard Worker }
1186*77c1e3ccSAndroid Build Coastguard Worker
convolve_2d_sr_horiz_12tap_neon(const uint8_t * src_ptr,int src_stride,int16_t * dst_ptr,const int dst_stride,int w,int h,const int16x8_t x_filter_0_7,const int16x4_t x_filter_8_11)1187*77c1e3ccSAndroid Build Coastguard Worker static inline void convolve_2d_sr_horiz_12tap_neon(
1188*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
1189*77c1e3ccSAndroid Build Coastguard Worker const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
1190*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t x_filter_8_11) {
1191*77c1e3ccSAndroid Build Coastguard Worker const int bd = 8;
1192*77c1e3ccSAndroid Build Coastguard Worker // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
1193*77c1e3ccSAndroid Build Coastguard Worker // which are generally faster than rounding shifts on modern CPUs.
1194*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t horiz_const =
1195*77c1e3ccSAndroid Build Coastguard Worker vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
1196*77c1e3ccSAndroid Build Coastguard Worker
1197*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
1198*77c1e3ccSAndroid Build Coastguard Worker do {
1199*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src_ptr;
1200*77c1e3ccSAndroid Build Coastguard Worker int16_t *d = dst_ptr;
1201*77c1e3ccSAndroid Build Coastguard Worker int width = w;
1202*77c1e3ccSAndroid Build Coastguard Worker
1203*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0, t1, t2, t3;
1204*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1205*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1206*77c1e3ccSAndroid Build Coastguard Worker
1207*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1208*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1209*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1210*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1211*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1212*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1213*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1214*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1215*77c1e3ccSAndroid Build Coastguard Worker
1216*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
1217*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1218*77c1e3ccSAndroid Build Coastguard Worker
1219*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1220*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1221*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1222*77c1e3ccSAndroid Build Coastguard Worker
1223*77c1e3ccSAndroid Build Coastguard Worker s += 11;
1224*77c1e3ccSAndroid Build Coastguard Worker
1225*77c1e3ccSAndroid Build Coastguard Worker do {
1226*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1227*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
1228*77c1e3ccSAndroid Build Coastguard Worker
1229*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1230*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1231*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1232*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1233*77c1e3ccSAndroid Build Coastguard Worker
1234*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d0 =
1235*77c1e3ccSAndroid Build Coastguard Worker convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1236*77c1e3ccSAndroid Build Coastguard Worker x_filter_0_7, x_filter_8_11, horiz_const);
1237*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d1 =
1238*77c1e3ccSAndroid Build Coastguard Worker convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1239*77c1e3ccSAndroid Build Coastguard Worker x_filter_0_7, x_filter_8_11, horiz_const);
1240*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d2 =
1241*77c1e3ccSAndroid Build Coastguard Worker convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
1242*77c1e3ccSAndroid Build Coastguard Worker x_filter_0_7, x_filter_8_11, horiz_const);
1243*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d3 =
1244*77c1e3ccSAndroid Build Coastguard Worker convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
1245*77c1e3ccSAndroid Build Coastguard Worker x_filter_0_7, x_filter_8_11, horiz_const);
1246*77c1e3ccSAndroid Build Coastguard Worker
1247*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
1248*77c1e3ccSAndroid Build Coastguard Worker store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
1249*77c1e3ccSAndroid Build Coastguard Worker
1250*77c1e3ccSAndroid Build Coastguard Worker s0 = s4;
1251*77c1e3ccSAndroid Build Coastguard Worker s1 = s5;
1252*77c1e3ccSAndroid Build Coastguard Worker s2 = s6;
1253*77c1e3ccSAndroid Build Coastguard Worker s3 = s7;
1254*77c1e3ccSAndroid Build Coastguard Worker s4 = s8;
1255*77c1e3ccSAndroid Build Coastguard Worker s5 = s9;
1256*77c1e3ccSAndroid Build Coastguard Worker s6 = s10;
1257*77c1e3ccSAndroid Build Coastguard Worker s7 = s11;
1258*77c1e3ccSAndroid Build Coastguard Worker s8 = s12;
1259*77c1e3ccSAndroid Build Coastguard Worker s9 = s13;
1260*77c1e3ccSAndroid Build Coastguard Worker s10 = s14;
1261*77c1e3ccSAndroid Build Coastguard Worker s += 4;
1262*77c1e3ccSAndroid Build Coastguard Worker d += 4;
1263*77c1e3ccSAndroid Build Coastguard Worker width -= 4;
1264*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
1265*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 4 * src_stride;
1266*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += 4 * dst_stride;
1267*77c1e3ccSAndroid Build Coastguard Worker h -= 4;
1268*77c1e3ccSAndroid Build Coastguard Worker } while (h > 4);
1269*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_ARCH_AARCH64
1270*77c1e3ccSAndroid Build Coastguard Worker
1271*77c1e3ccSAndroid Build Coastguard Worker do {
1272*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src_ptr;
1273*77c1e3ccSAndroid Build Coastguard Worker int16_t *d = dst_ptr;
1274*77c1e3ccSAndroid Build Coastguard Worker int width = w;
1275*77c1e3ccSAndroid Build Coastguard Worker
1276*77c1e3ccSAndroid Build Coastguard Worker do {
1277*77c1e3ccSAndroid Build Coastguard Worker uint8x16_t t0 = vld1q_u8(s);
1278*77c1e3ccSAndroid Build Coastguard Worker int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
1279*77c1e3ccSAndroid Build Coastguard Worker int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
1280*77c1e3ccSAndroid Build Coastguard Worker
1281*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s0 = vget_low_s16(tt0);
1282*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s4 = vget_high_s16(tt0);
1283*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s8 = vget_low_s16(tt1);
1284*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s12 = vget_high_s16(tt1);
1285*77c1e3ccSAndroid Build Coastguard Worker
1286*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
1287*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
1288*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
1289*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
1290*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
1291*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
1292*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
1293*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
1294*77c1e3ccSAndroid Build Coastguard Worker int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
1295*77c1e3ccSAndroid Build Coastguard Worker
1296*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d0 =
1297*77c1e3ccSAndroid Build Coastguard Worker convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
1298*77c1e3ccSAndroid Build Coastguard Worker x_filter_0_7, x_filter_8_11, horiz_const);
1299*77c1e3ccSAndroid Build Coastguard Worker vst1_s16(d, d0);
1300*77c1e3ccSAndroid Build Coastguard Worker
1301*77c1e3ccSAndroid Build Coastguard Worker s += 4;
1302*77c1e3ccSAndroid Build Coastguard Worker d += 4;
1303*77c1e3ccSAndroid Build Coastguard Worker width -= 4;
1304*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
1305*77c1e3ccSAndroid Build Coastguard Worker src_ptr += src_stride;
1306*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += dst_stride;
1307*77c1e3ccSAndroid Build Coastguard Worker } while (--h != 0);
1308*77c1e3ccSAndroid Build Coastguard Worker }
1309*77c1e3ccSAndroid Build Coastguard Worker
convolve4_8_2d_h(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x4_t filter,const int16x8_t horiz_const)1310*77c1e3ccSAndroid Build Coastguard Worker static inline int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
1311*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s2, const int16x8_t s3,
1312*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t filter,
1313*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t horiz_const) {
1314*77c1e3ccSAndroid Build Coastguard Worker int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
1315*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s1, filter, 1);
1316*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s2, filter, 2);
1317*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s3, filter, 3);
1318*77c1e3ccSAndroid Build Coastguard Worker // We halved the filter values so -1 from right shift.
1319*77c1e3ccSAndroid Build Coastguard Worker return vshrq_n_s16(sum, ROUND0_BITS - 1);
1320*77c1e3ccSAndroid Build Coastguard Worker }
1321*77c1e3ccSAndroid Build Coastguard Worker
convolve_2d_sr_horiz_4tap_neon(const uint8_t * src,ptrdiff_t src_stride,int16_t * dst,ptrdiff_t dst_stride,int w,int h,const int16_t * filter_x)1322*77c1e3ccSAndroid Build Coastguard Worker static inline void convolve_2d_sr_horiz_4tap_neon(
1323*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
1324*77c1e3ccSAndroid Build Coastguard Worker ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
1325*77c1e3ccSAndroid Build Coastguard Worker const int bd = 8;
1326*77c1e3ccSAndroid Build Coastguard Worker // All filter values are even, halve to reduce intermediate precision
1327*77c1e3ccSAndroid Build Coastguard Worker // requirements.
1328*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
1329*77c1e3ccSAndroid Build Coastguard Worker
1330*77c1e3ccSAndroid Build Coastguard Worker // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1331*77c1e3ccSAndroid Build Coastguard Worker // shifts - which are generally faster than rounding shifts on modern CPUs.
1332*77c1e3ccSAndroid Build Coastguard Worker // (The extra -1 is needed because we halved the filter values.)
1333*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
1334*77c1e3ccSAndroid Build Coastguard Worker (1 << ((ROUND0_BITS - 1) - 1)));
1335*77c1e3ccSAndroid Build Coastguard Worker
1336*77c1e3ccSAndroid Build Coastguard Worker if (w == 4) {
1337*77c1e3ccSAndroid Build Coastguard Worker do {
1338*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t01[4];
1339*77c1e3ccSAndroid Build Coastguard Worker t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
1340*77c1e3ccSAndroid Build Coastguard Worker t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
1341*77c1e3ccSAndroid Build Coastguard Worker t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
1342*77c1e3ccSAndroid Build Coastguard Worker t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
1343*77c1e3ccSAndroid Build Coastguard Worker
1344*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s01[4];
1345*77c1e3ccSAndroid Build Coastguard Worker s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
1346*77c1e3ccSAndroid Build Coastguard Worker s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
1347*77c1e3ccSAndroid Build Coastguard Worker s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
1348*77c1e3ccSAndroid Build Coastguard Worker s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
1349*77c1e3ccSAndroid Build Coastguard Worker
1350*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d01 =
1351*77c1e3ccSAndroid Build Coastguard Worker convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
1352*77c1e3ccSAndroid Build Coastguard Worker
1353*77c1e3ccSAndroid Build Coastguard Worker store_s16x4_strided_x2(dst, (int)dst_stride, d01);
1354*77c1e3ccSAndroid Build Coastguard Worker
1355*77c1e3ccSAndroid Build Coastguard Worker src += 2 * src_stride;
1356*77c1e3ccSAndroid Build Coastguard Worker dst += 2 * dst_stride;
1357*77c1e3ccSAndroid Build Coastguard Worker h -= 2;
1358*77c1e3ccSAndroid Build Coastguard Worker } while (h > 0);
1359*77c1e3ccSAndroid Build Coastguard Worker } else {
1360*77c1e3ccSAndroid Build Coastguard Worker do {
1361*77c1e3ccSAndroid Build Coastguard Worker int width = w;
1362*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src;
1363*77c1e3ccSAndroid Build Coastguard Worker int16_t *d = dst;
1364*77c1e3ccSAndroid Build Coastguard Worker
1365*77c1e3ccSAndroid Build Coastguard Worker do {
1366*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0[4], t1[4];
1367*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
1368*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
1369*77c1e3ccSAndroid Build Coastguard Worker
1370*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0[4];
1371*77c1e3ccSAndroid Build Coastguard Worker s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
1372*77c1e3ccSAndroid Build Coastguard Worker s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
1373*77c1e3ccSAndroid Build Coastguard Worker s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
1374*77c1e3ccSAndroid Build Coastguard Worker s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
1375*77c1e3ccSAndroid Build Coastguard Worker
1376*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1[4];
1377*77c1e3ccSAndroid Build Coastguard Worker s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
1378*77c1e3ccSAndroid Build Coastguard Worker s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
1379*77c1e3ccSAndroid Build Coastguard Worker s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
1380*77c1e3ccSAndroid Build Coastguard Worker s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
1381*77c1e3ccSAndroid Build Coastguard Worker
1382*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d0 =
1383*77c1e3ccSAndroid Build Coastguard Worker convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
1384*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d1 =
1385*77c1e3ccSAndroid Build Coastguard Worker convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
1386*77c1e3ccSAndroid Build Coastguard Worker
1387*77c1e3ccSAndroid Build Coastguard Worker store_s16_8x2(d, dst_stride, d0, d1);
1388*77c1e3ccSAndroid Build Coastguard Worker
1389*77c1e3ccSAndroid Build Coastguard Worker s += 8;
1390*77c1e3ccSAndroid Build Coastguard Worker d += 8;
1391*77c1e3ccSAndroid Build Coastguard Worker width -= 8;
1392*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
1393*77c1e3ccSAndroid Build Coastguard Worker src += 2 * src_stride;
1394*77c1e3ccSAndroid Build Coastguard Worker dst += 2 * dst_stride;
1395*77c1e3ccSAndroid Build Coastguard Worker h -= 2;
1396*77c1e3ccSAndroid Build Coastguard Worker } while (h > 2);
1397*77c1e3ccSAndroid Build Coastguard Worker
1398*77c1e3ccSAndroid Build Coastguard Worker do {
1399*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src;
1400*77c1e3ccSAndroid Build Coastguard Worker int16_t *d = dst;
1401*77c1e3ccSAndroid Build Coastguard Worker int width = w;
1402*77c1e3ccSAndroid Build Coastguard Worker
1403*77c1e3ccSAndroid Build Coastguard Worker do {
1404*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0[4];
1405*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
1406*77c1e3ccSAndroid Build Coastguard Worker
1407*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0[4];
1408*77c1e3ccSAndroid Build Coastguard Worker s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
1409*77c1e3ccSAndroid Build Coastguard Worker s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
1410*77c1e3ccSAndroid Build Coastguard Worker s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
1411*77c1e3ccSAndroid Build Coastguard Worker s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
1412*77c1e3ccSAndroid Build Coastguard Worker
1413*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d0 =
1414*77c1e3ccSAndroid Build Coastguard Worker convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
1415*77c1e3ccSAndroid Build Coastguard Worker
1416*77c1e3ccSAndroid Build Coastguard Worker vst1q_s16(d, d0);
1417*77c1e3ccSAndroid Build Coastguard Worker
1418*77c1e3ccSAndroid Build Coastguard Worker s += 8;
1419*77c1e3ccSAndroid Build Coastguard Worker d += 8;
1420*77c1e3ccSAndroid Build Coastguard Worker width -= 8;
1421*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
1422*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
1423*77c1e3ccSAndroid Build Coastguard Worker dst += dst_stride;
1424*77c1e3ccSAndroid Build Coastguard Worker } while (--h != 0);
1425*77c1e3ccSAndroid Build Coastguard Worker }
1426*77c1e3ccSAndroid Build Coastguard Worker }
1427*77c1e3ccSAndroid Build Coastguard Worker
convolve8_8_2d_h(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t filter,const int16x8_t horiz_const)1428*77c1e3ccSAndroid Build Coastguard Worker static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
1429*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s2, const int16x8_t s3,
1430*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s4, const int16x8_t s5,
1431*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t s6, const int16x8_t s7,
1432*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t filter,
1433*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t horiz_const) {
1434*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t filter_lo = vget_low_s16(filter);
1435*77c1e3ccSAndroid Build Coastguard Worker const int16x4_t filter_hi = vget_high_s16(filter);
1436*77c1e3ccSAndroid Build Coastguard Worker
1437*77c1e3ccSAndroid Build Coastguard Worker int16x8_t sum = horiz_const;
1438*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
1439*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
1440*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
1441*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
1442*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
1443*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
1444*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
1445*77c1e3ccSAndroid Build Coastguard Worker sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
1446*77c1e3ccSAndroid Build Coastguard Worker
1447*77c1e3ccSAndroid Build Coastguard Worker // We halved the convolution filter values so -1 from the right shift.
1448*77c1e3ccSAndroid Build Coastguard Worker return vshrq_n_s16(sum, ROUND0_BITS - 1);
1449*77c1e3ccSAndroid Build Coastguard Worker }
1450*77c1e3ccSAndroid Build Coastguard Worker
convolve_2d_sr_horiz_8tap_neon(const uint8_t * src,int src_stride,int16_t * im_block,int im_stride,int w,int im_h,const int16_t * x_filter_ptr)1451*77c1e3ccSAndroid Build Coastguard Worker static inline void convolve_2d_sr_horiz_8tap_neon(
1452*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
1453*77c1e3ccSAndroid Build Coastguard Worker int im_h, const int16_t *x_filter_ptr) {
1454*77c1e3ccSAndroid Build Coastguard Worker const int bd = 8;
1455*77c1e3ccSAndroid Build Coastguard Worker
1456*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src_ptr = src;
1457*77c1e3ccSAndroid Build Coastguard Worker int16_t *dst_ptr = im_block;
1458*77c1e3ccSAndroid Build Coastguard Worker int dst_stride = im_stride;
1459*77c1e3ccSAndroid Build Coastguard Worker int height = im_h;
1460*77c1e3ccSAndroid Build Coastguard Worker
1461*77c1e3ccSAndroid Build Coastguard Worker // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
1462*77c1e3ccSAndroid Build Coastguard Worker // shifts - which are generally faster than rounding shifts on modern CPUs.
1463*77c1e3ccSAndroid Build Coastguard Worker // (The extra -1 is needed because we halved the filter values.)
1464*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
1465*77c1e3ccSAndroid Build Coastguard Worker (1 << ((ROUND0_BITS - 1) - 1)));
1466*77c1e3ccSAndroid Build Coastguard Worker // Filter values are even, so halve to reduce intermediate precision reqs.
1467*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
1468*77c1e3ccSAndroid Build Coastguard Worker
1469*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
1470*77c1e3ccSAndroid Build Coastguard Worker while (height > 8) {
1471*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src_ptr;
1472*77c1e3ccSAndroid Build Coastguard Worker int16_t *d = dst_ptr;
1473*77c1e3ccSAndroid Build Coastguard Worker int width = w;
1474*77c1e3ccSAndroid Build Coastguard Worker
1475*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
1476*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1477*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1478*77c1e3ccSAndroid Build Coastguard Worker
1479*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1480*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1481*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1482*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1483*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1484*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1485*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1486*77c1e3ccSAndroid Build Coastguard Worker
1487*77c1e3ccSAndroid Build Coastguard Worker s += 7;
1488*77c1e3ccSAndroid Build Coastguard Worker
1489*77c1e3ccSAndroid Build Coastguard Worker do {
1490*77c1e3ccSAndroid Build Coastguard Worker load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1491*77c1e3ccSAndroid Build Coastguard Worker
1492*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1493*77c1e3ccSAndroid Build Coastguard Worker
1494*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1495*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1496*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1497*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1498*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1499*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1500*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1501*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1502*77c1e3ccSAndroid Build Coastguard Worker
1503*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1504*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
1505*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
1506*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
1507*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
1508*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
1509*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
1510*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
1511*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
1512*77c1e3ccSAndroid Build Coastguard Worker x_filter, horiz_const);
1513*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
1514*77c1e3ccSAndroid Build Coastguard Worker x_filter, horiz_const);
1515*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
1516*77c1e3ccSAndroid Build Coastguard Worker x_filter, horiz_const);
1517*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
1518*77c1e3ccSAndroid Build Coastguard Worker x_filter, horiz_const);
1519*77c1e3ccSAndroid Build Coastguard Worker
1520*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
1521*77c1e3ccSAndroid Build Coastguard Worker
1522*77c1e3ccSAndroid Build Coastguard Worker store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
1523*77c1e3ccSAndroid Build Coastguard Worker
1524*77c1e3ccSAndroid Build Coastguard Worker s0 = s8;
1525*77c1e3ccSAndroid Build Coastguard Worker s1 = s9;
1526*77c1e3ccSAndroid Build Coastguard Worker s2 = s10;
1527*77c1e3ccSAndroid Build Coastguard Worker s3 = s11;
1528*77c1e3ccSAndroid Build Coastguard Worker s4 = s12;
1529*77c1e3ccSAndroid Build Coastguard Worker s5 = s13;
1530*77c1e3ccSAndroid Build Coastguard Worker s6 = s14;
1531*77c1e3ccSAndroid Build Coastguard Worker s += 8;
1532*77c1e3ccSAndroid Build Coastguard Worker d += 8;
1533*77c1e3ccSAndroid Build Coastguard Worker width -= 8;
1534*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
1535*77c1e3ccSAndroid Build Coastguard Worker src_ptr += 8 * src_stride;
1536*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += 8 * dst_stride;
1537*77c1e3ccSAndroid Build Coastguard Worker height -= 8;
1538*77c1e3ccSAndroid Build Coastguard Worker }
1539*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_ARCH_AARCH64
1540*77c1e3ccSAndroid Build Coastguard Worker
1541*77c1e3ccSAndroid Build Coastguard Worker do {
1542*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *s = src_ptr;
1543*77c1e3ccSAndroid Build Coastguard Worker int16_t *d = dst_ptr;
1544*77c1e3ccSAndroid Build Coastguard Worker int width = w;
1545*77c1e3ccSAndroid Build Coastguard Worker
1546*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
1547*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1548*77c1e3ccSAndroid Build Coastguard Worker
1549*77c1e3ccSAndroid Build Coastguard Worker do {
1550*77c1e3ccSAndroid Build Coastguard Worker uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
1551*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1552*77c1e3ccSAndroid Build Coastguard Worker
1553*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
1554*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
1555*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
1556*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
1557*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
1558*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
1559*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
1560*77c1e3ccSAndroid Build Coastguard Worker
1561*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
1562*77c1e3ccSAndroid Build Coastguard Worker horiz_const);
1563*77c1e3ccSAndroid Build Coastguard Worker
1564*77c1e3ccSAndroid Build Coastguard Worker vst1q_s16(d, d0);
1565*77c1e3ccSAndroid Build Coastguard Worker
1566*77c1e3ccSAndroid Build Coastguard Worker s0 = s8;
1567*77c1e3ccSAndroid Build Coastguard Worker s += 8;
1568*77c1e3ccSAndroid Build Coastguard Worker d += 8;
1569*77c1e3ccSAndroid Build Coastguard Worker width -= 8;
1570*77c1e3ccSAndroid Build Coastguard Worker } while (width != 0);
1571*77c1e3ccSAndroid Build Coastguard Worker src_ptr += src_stride;
1572*77c1e3ccSAndroid Build Coastguard Worker dst_ptr += dst_stride;
1573*77c1e3ccSAndroid Build Coastguard Worker } while (--height != 0);
1574*77c1e3ccSAndroid Build Coastguard Worker }
1575*77c1e3ccSAndroid Build Coastguard Worker
// Two-pass (horizontal, then vertical) single-reference convolution of a
// w x h block from src into dst.
//
// Blocks with w == 2 or h == 2 are forwarded unchanged to
// av1_convolve_2d_sr_c(). For every other size the horizontal helper is given
// an int16_t stack buffer (row stride MAX_SB_SIZE) to fill, and that same
// buffer is then passed to the vertical helper together with dst. Which
// helpers run depends on the tap counts:
//   - filter_params_x->taps > 8 : 12-tap horizontal and 12-tap vertical.
//   - otherwise                 : 4- or 8-tap horizontal, then 4-, 6- or 8-tap
//                                 vertical.
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
  if (w == 2 || h == 2) {
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
    return;
  }

  const int taps_y = get_filter_tap(filter_params_y, subpel_y_qn);
  const int taps_x = get_filter_tap(filter_params_x, subpel_x_qn);
  // The vertical pass is never run with fewer than 4 taps.
  const int taps_v = (taps_y >= 4) ? taps_y : 4;

  const int mid_stride = MAX_SB_SIZE;
  // Row count handed to the horizontal pass: the h output rows plus the
  // (taps_v - 1) extra rows the vertical filter consumes.
  const int mid_rows = h + taps_v - 1;

  // Move the source pointer up and to the left so that it addresses the first
  // sample covered by the filter windows.
  const int rows_back = taps_v / 2 - 1;
  const int cols_back = filter_params_x->taps / 2 - 1;
  const uint8_t *const src_origin = src - rows_back * src_stride - cols_back;

  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (filter_params_x->taps <= 8) {
    DECLARE_ALIGNED(16, int16_t,
                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);

    // Horizontal pass. The 4-tap helper is called with the source pointer
    // advanced by two columns relative to the 8-tap helper.
    if (taps_x > 4) {
      convolve_2d_sr_horiz_8tap_neon(src_origin, src_stride, im_block,
                                     mid_stride, w, mid_rows, kernel_x);
    } else {
      convolve_2d_sr_horiz_4tap_neon(src_origin + 2, src_stride, im_block,
                                     mid_stride, w, mid_rows, kernel_x);
    }

    const int16x8_t kernel_y_vec = vld1q_s16(kernel_y);

    // Vertical pass. The 4-tap helper takes the kernel pointer; the 6- and
    // 8-tap helpers take the preloaded vector.
    if (taps_v == 6) {
      convolve_2d_sr_vert_6tap_neon(im_block, mid_stride, dst, dst_stride, w, h,
                                    kernel_y_vec);
    } else if (taps_v > 4) {
      convolve_2d_sr_vert_8tap_neon(im_block, mid_stride, dst, dst_stride, w, h,
                                    kernel_y_vec);
    } else {
      convolve_2d_sr_vert_4tap_neon(im_block, mid_stride, dst, dst_stride, w, h,
                                    kernel_y);
    }
    return;
  }

  // More than 8 horizontal taps: 12-tap helpers for both passes, each kernel
  // loaded as an 8-lane vector followed by a 4-lane vector.
  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);

  const int16x8_t kernel_x_lo = vld1q_s16(kernel_x);
  const int16x4_t kernel_x_hi = vld1_s16(kernel_x + 8);
  convolve_2d_sr_horiz_12tap_neon(src_origin, src_stride, im_block, mid_stride,
                                  w, mid_rows, kernel_x_lo, kernel_x_hi);

  const int16x8_t kernel_y_lo = vld1q_s16(kernel_y);
  const int16x4_t kernel_y_hi = vld1_s16(kernel_y + 8);
  convolve_2d_sr_vert_12tap_neon(im_block, mid_stride, dst, dst_stride, w, h,
                                 kernel_y_lo, kernel_y_hi);
}
1643*77c1e3ccSAndroid Build Coastguard Worker
// Horizontal IntraBC filter: every output pixel is the rounded average
// (rounding halving add) of a source pixel and its right-hand neighbour.
//
// The asserts record the conditions this routine relies on: subpel_x_qn == 8,
// a 2-tap filter, and round_0 + round_1 == 2 * FILTER_BITS. The three
// parameters used only by those asserts are cast to void so they are not
// reported as unused when asserts are compiled out.
//
// Blocks with w <= 4 or w == 8 are processed two rows per iteration; all other
// widths are processed one row at a time in steps of 16 pixels.
void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;

  if (w > 4 && w != 8) {
    // Wide path: 16 output pixels per inner step, one row per outer step.
    do {
      int x = 0;
      do {
        const uint8x16_t left = vld1q_u8(src + x);
        const uint8x16_t right = vld1q_u8(src + x + 1);

        vst1q_u8(dst + x, vrhaddq_u8(left, right));

        x += 16;
      } while (x != w);

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
    return;
  }

  // Narrow path (w <= 4 or w == 8): the loads and the averaging are the same
  // for every width; only the number of bytes stored per row differs.
  do {
    const uint8_t *const row0 = src;
    const uint8_t *const row1 = src + src_stride;
    uint8_t *const out0 = dst;
    uint8_t *const out1 = dst + dst_stride;

    const uint8x8_t avg0 = vrhadd_u8(vld1_u8(row0), vld1_u8(row0 + 1));
    const uint8x8_t avg1 = vrhadd_u8(vld1_u8(row1), vld1_u8(row1 + 1));

    if (w == 8) {
      vst1_u8(out0, avg0);
      vst1_u8(out1, avg1);
    } else if (w == 2) {
      store_u8_2x1(out0, avg0);
      store_u8_2x1(out1, avg1);
    } else {
      store_u8_4x1(out0, avg0);
      store_u8_4x1(out1, avg1);
    }

    src += 2 * src_stride;
    dst += 2 * dst_stride;
    h -= 2;
  } while (h != 0);
}
1718*77c1e3ccSAndroid Build Coastguard Worker
// Vertical IntraBC filter: every output pixel is the rounded average
// (rounding halving add) of a source pixel and the pixel directly below it.
//
// The asserts record the conditions this routine relies on: subpel_y_qn == 8
// and a 2-tap filter. Both parameters are cast to void so they are not
// reported as unused when asserts are compiled out.
//
// Blocks with w <= 4 or w == 8 produce two output rows per iteration from
// three source rows; all other widths are processed as 16-pixel-wide column
// strips, one row at a time.
void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn) {
  assert(subpel_y_qn == 8);
  assert(filter_params_y->taps == 2);
  (void)filter_params_y;
  (void)subpel_y_qn;

  if (w <= 4) {
    const int two_wide = (w == 2);

    do {
      const uint8x8_t top = load_unaligned_u8_4x1(src);
      const uint8x8_t mid = load_unaligned_u8_4x1(src + src_stride);
      const uint8x8_t bot = load_unaligned_u8_4x1(src + 2 * src_stride);

      const uint8x8_t avg0 = vrhadd_u8(top, mid);
      const uint8x8_t avg1 = vrhadd_u8(mid, bot);

      if (two_wide) {
        store_u8_2x1(dst, avg0);
        store_u8_2x1(dst + dst_stride, avg1);
      } else {
        store_u8_4x1(dst, avg0);
        store_u8_4x1(dst + dst_stride, avg1);
      }

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  if (w == 8) {
    do {
      const uint8x8_t top = vld1_u8(src);
      const uint8x8_t mid = vld1_u8(src + src_stride);
      const uint8x8_t bot = vld1_u8(src + 2 * src_stride);

      vst1_u8(dst, vrhadd_u8(top, mid));
      vst1_u8(dst + dst_stride, vrhadd_u8(mid, bot));

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  // Remaining widths: walk the block strip by strip, 16 columns at a time,
  // covering every row of a strip before moving to the next one.
  int col = 0;
  do {
    const uint8_t *in = src + col;
    uint8_t *out = dst + col;
    int rows_left = h;

    do {
      const uint8x16_t upper = vld1q_u8(in);
      const uint8x16_t lower = vld1q_u8(in + src_stride);

      vst1q_u8(out, vrhaddq_u8(upper, lower));

      in += src_stride;
      out += dst_stride;
    } while (--rows_left != 0);

    col += 16;
  } while (col != w);
}
1788*77c1e3ccSAndroid Build Coastguard Worker
// Two-dimensional IntraBC filter: every output pixel is the rounded average of
// a 2x2 neighbourhood of source pixels.
//
// The work is done in two passes over a 16-bit stack buffer whose row stride
// is w:
//   1. Horizontal: for h + 1 source rows, store src[x] + src[x + 1] widened to
//      16 bits (no rounding or shifting yet).
//   2. Vertical: add each buffered row to the row below it, then apply a
//      rounding right shift by 2 while narrowing to 8 bits.
//
// The asserts record the conditions this routine relies on: both sub-pixel
// positions equal 8, both filters are 2-tap, round_0 + round_1 ==
// 2 * FILTER_BITS, and the block is no larger than MAX_SB_SIZE in either
// dimension. Parameters used only by asserts are cast to void so they are not
// reported as unused when asserts are compiled out.
void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;

  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  if (w <= 4) {
    // Horizontal pass, narrow blocks: one 4-lane store per source row.
    uint16_t *row = im_block;
    int rows_left = h + 1;
    do {
      const uint16x8_t pair_sum = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));

      // All four lanes are written even when w < 4; the buffer has room for
      // the extra lanes, so the full-vector store is safe.
      vst1_u16(row, vget_low_u16(pair_sum));

      src += src_stride;
      row += im_stride;
    } while (--rows_left != 0);

    // Vertical pass, narrow blocks: three buffered rows in, two output rows
    // out per iteration.
    const int two_wide = (w == 2);
    const uint16x4_t zero = vdup_n_u16(0);
    row = im_block;
    do {
      const uint16x4_t r0 = vld1_u16(row);
      const uint16x4_t r1 = vld1_u16(row + im_stride);
      const uint16x4_t r2 = vld1_u16(row + 2 * im_stride);

      // Pad each 4-lane sum to 8 lanes so the narrowing shift can be applied.
      const uint16x8_t quad0 = vcombine_u16(vadd_u16(r0, r1), zero);
      const uint16x8_t quad1 = vcombine_u16(vadd_u16(r1, r2), zero);

      const uint8x8_t out0 = vqrshrn_n_u16(quad0, 2);
      const uint8x8_t out1 = vqrshrn_n_u16(quad1, 2);

      if (two_wide) {
        store_u8_2x1(dst, out0);
        store_u8_2x1(dst + dst_stride, out1);
      } else {
        store_u8_4x1(dst, out0);
        store_u8_4x1(dst + dst_stride, out1);
      }

      row += 2 * im_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  // Horizontal pass, wider blocks: 8 pixels per step across each of the
  // h + 1 source rows.
  {
    uint16_t *im_row = im_block;
    int rows_left = h + 1;
    do {
      int x = 0;
      do {
        const uint8x8_t left = vld1_u8(src + x);
        const uint8x8_t right = vld1_u8(src + x + 1);

        vst1q_u16(im_row + x, vaddl_u8(left, right));

        x += 8;
      } while (x != w);

      src += src_stride;
      im_row += im_stride;
    } while (--rows_left != 0);
  }

  // Vertical pass, wider blocks: 8-column strips, every row of a strip being
  // finished before the next strip is started.
  int col = 0;
  do {
    const uint16_t *im_col = im_block + col;
    uint8_t *dst_col = dst + col;
    int rows_left = h;

    do {
      const uint16x8_t upper = vld1q_u16(im_col);
      const uint16x8_t lower = vld1q_u16(im_col + im_stride);

      const uint16x8_t quad = vaddq_u16(upper, lower);
      vst1_u8(dst_col, vqrshrn_n_u16(quad, 2));

      im_col += im_stride;
      dst_col += dst_stride;
    } while (--rows_left != 0);

    col += 8;
  } while (col != w);
}
1901