/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_

#include <arm_neon.h>

#include "config/aom_config.h"

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"

// Applies a 12-tap filter to four 16-bit lanes.
//
// s0..s11 are the twelve input vectors, one per tap. Tap k is taken from lane
// k of y_filter_0_7 for k = 0..7 and from lane (k - 8) of y_filter_8_11 for
// k = 8..11.
//
// Returns the widened 32-bit sums of products. No rounding, shifting or
// offset correction is applied here; the caller narrows the result.
static inline int32x4_t convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  // The by-lane multiply intrinsics take a 64-bit coefficient vector, so the
  // first eight taps are split into two halves.
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  return sum;
}

// Applies a 12-tap filter to eight 16-bit lanes and produces eight 8-bit
// output values.
//
// s0..s11 are the twelve input vectors, one per tap. Tap k is taken from lane
// k of y_filter_0_7 for k = 0..7 and from lane (k - 8) of y_filter_8_11 for
// k = 8..11. The low and high halves of the inputs are accumulated separately
// in 32-bit precision.
//
// Each sum is then shifted right by (2 * FILTER_BITS - ROUND0_BITS) with
// rounding and saturation to 16 bits, sub_const is subtracted, and the result
// is narrowed to unsigned 8 bits with saturation.
static inline uint8x8_t convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int16x8_t sub_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Lanes 0..3.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  // Lanes 4..7.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  // The shift amount must be a compile-time constant for the intrinsic, so it
  // is spelled out at each use rather than held in a variable.
  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

// Vertical 12-tap filtering pass over a block of 16-bit samples, writing
// 8-bit results.
//
//   src_ptr, src_stride  16-bit input; the stride is in int16_t elements.
//   dst_ptr, dst_stride  8-bit output; the stride is in bytes.
//   w, h                 block width and height in samples.
//   y_filter_0_7         filter taps 0..7.
//   y_filter_8_11        filter taps 8..11.
//
// Contract implied by the loops below: h must be a positive multiple of 4
// (each iteration emits four rows and the loop ends only when the counter
// reaches exactly 0), and when w > 4 it must be a positive multiple of 8.
// The source must provide 11 rows of history in addition to the h rows
// consumed by the main loop (assuming the load_s16_* helpers read consecutive
// rows src_stride apart - confirm against mem_neon.h).
static inline void convolve_2d_sr_vert_12tap_neon(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int bd = 8;
  // 1 << (bd - 1) == 128 is subtracted from every narrowed result.
  // NOTE(review): presumably this cancels an offset added by the preceding
  // horizontal pass - confirm against the caller.
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    // Prime the sliding window with the first 11 rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
                  &s8, &s9, &s10);
    src_ptr += 11 * src_stride;

    do {
      // Four new rows yield four output rows per iteration.
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);

      int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                       s10, s11, y_filter_0_7, y_filter_8_11);
      int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                       s11, s12, y_filter_0_7, y_filter_8_11);
      int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                       s12, s13, y_filter_0_7, y_filter_8_11);
      int32x4_t d3 =
          convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                            y_filter_0_7, y_filter_8_11);

      // Pack two 4-wide rows per 128-bit vector so the offset subtraction
      // and the narrowing to 8 bits each handle two rows at once.
      int16x8_t dd01 =
          vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
                       vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
      int16x8_t dd23 =
          vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
                       vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));

      dd01 = vsubq_s16(dd01, sub_const);
      dd23 = vsubq_s16(dd23, sub_const);

      uint8x8_t d01 = vqmovun_s16(dd01);
      uint8x8_t d23 = vqmovun_s16(dd23);

      // NOTE(review): store_u8x4_strided_x2 presumably writes the two 4-byte
      // halves to two rows dst_stride apart - confirm against mem_neon.h.
      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Slide the 11-row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

  } else {
    // Process the block in 8-sample-wide column strips.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime the sliding window with the first 11 rows of this strip.
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint8x8_t d0 =
            convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                              y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d1 =
            convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d2 =
            convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              s13, y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d3 =
            convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                              s14, y_filter_0_7, y_filter_8_11, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the 11-row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Applies an 8-tap filter to four 16-bit lanes.
//
// s0..s7 are the eight input vectors, one per tap; tap k is lane k of
// y_filter. Products are accumulated in 32-bit precision, then shifted right
// by (2 * FILTER_BITS - ROUND0_BITS) with rounding and saturated to 16 bits.
// No offset correction or narrowing to 8 bits is done here.
static inline int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t s4, const int16x4_t s5,
                                         const int16x4_t s6, const int16x4_t s7,
                                         const int16x8_t y_filter) {
  // The by-lane multiply intrinsics take a 64-bit coefficient vector.
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

// Applies an 8-tap filter to eight 16-bit lanes and produces eight 8-bit
// output values.
//
// s0..s7 are the eight input vectors, one per tap; tap k is lane k of
// y_filter. The low and high halves of the inputs are accumulated separately
// in 32-bit precision. Each sum is shifted right by
// (2 * FILTER_BITS - ROUND0_BITS) with rounding and saturation to 16 bits,
// sub_const is subtracted, and the result is narrowed to unsigned 8 bits with
// saturation.
static inline uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         const int16x8_t y_filter,
                                         const int16x8_t sub_const) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  // Lanes 0..3.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);

  // Lanes 4..7.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);

  // The shift amount must be a compile-time constant for the intrinsic, so it
  // is spelled out at each use rather than held in a variable.
  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

// Vertical 8-tap filtering pass over a block of 16-bit samples, writing
// 8-bit results.
//
//   src_ptr, src_stride  16-bit input; the stride is in int16_t elements.
//   dst_ptr, dst_stride  8-bit output; the stride is in bytes.
//   w, h                 block width and height in samples.
//   y_filter             the eight filter taps.
//
// On AArch64 four output rows are produced per loop iteration, so h must be a
// positive multiple of 4; on other targets one row is produced per iteration.
// When w > 4 it must be a positive multiple of 8, because the column loop
// ends only when the counter reaches exactly 0. The source must provide 7
// rows of history in addition to the h rows consumed by the main loop
// (assuming the load_s16_* helpers read consecutive rows src_stride apart -
// confirm against mem_neon.h).
static inline void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16x8_t y_filter) {
  const int bd = 8;
  // 1 << (bd - 1) == 128 is subtracted from every narrowed result.
  // NOTE(review): presumably this cancels an offset added by the preceding
  // horizontal pass - confirm against the caller.
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    // Prime the sliding window with the first 7 rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      // Four new rows yield four output rows per iteration.
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);

      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      int16x4_t d3 =
          convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      // Pack two 4-wide rows per 128-bit vector before removing the offset
      // and narrowing to 8 bits.
      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      // NOTE(review): store_u8x4_strided_x2 presumably writes the two 4-byte
      // halves to two rows dst_stride apart - confirm against mem_neon.h.
      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Slide the 7-row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      // One new row yields one output row per iteration.
      int16x4_t s7 = vld1_s16(src_ptr);
      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      // The upper half is zero padding; only the lower four results matter.
      uint8x8_t d01 =
          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));

      store_u8_4x1(dst_ptr, d01);

      // Slide the 7-row window down by one row.
      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime the sliding window with the first 7 rows of this 8-wide strip.
      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, sub_const);
        uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                        y_filter, sub_const);
        uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                        y_filter, sub_const);
        uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                        y_filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the 7-row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vld1q_s16(s);
        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, sub_const);
        vst1_u8(d, d0);

        // Slide the 7-row window down by one row.
        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Applies a 6-tap filter to four 16-bit lanes.
//
// s0..s5 are the six input vectors, one per tap. The taps are read from lanes
// 1..6 of the 8-lane y_filter vector; lanes 0 and 7 are not used here.
// NOTE(review): presumably those outer lanes are zero for 6-tap kernels -
// confirm against the caller.
//
// Products are accumulated in 32-bit precision, then shifted right by
// (2 * FILTER_BITS - ROUND0_BITS) with rounding and saturated to 16 bits.
static inline int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t s4, const int16x4_t s5,
                                         const int16x8_t y_filter) {
  // The by-lane multiply intrinsics take a 64-bit coefficient vector.
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

// 6-tap vertical filter over eight columns of 16-bit samples, producing eight
// unsigned 8-bit results.
//
// s0..s5 are the six input rows of the filter window. The taps are taken from
// lanes 1..6 of y_filter; lanes 0 and 7 are never read here. The low and high
// four columns are accumulated separately in 32 bits, narrowed to 16 bits with
// a rounding, saturating right shift by (2 * FILTER_BITS - ROUND0_BITS), and
// recombined. sub_const is then subtracted from every lane and the result is
// saturated to the unsigned 8-bit range.
static inline uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t y_filter,
                                         const int16x8_t sub_const) {
  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
  const int16x4_t taps_4_7 = vget_high_s16(y_filter);

  // The two accumulators are independent: acc_lo covers columns 0..3 and
  // acc_hi covers columns 4..7.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), taps_0_3, 1);
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), taps_0_3, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_4_7, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 2);

  const int16x4_t narrowed_lo =
      vqrshrn_n_s32(acc_lo, 2 * FILTER_BITS - ROUND0_BITS);
  const int16x4_t narrowed_hi =
      vqrshrn_n_s32(acc_hi, 2 * FILTER_BITS - ROUND0_BITS);
  const int16x8_t filtered = vcombine_s16(narrowed_lo, narrowed_hi);

  return vqmovun_s16(vsubq_s16(filtered, sub_const));
}
425*77c1e3ccSAndroid Build Coastguard Worker
// Vertical pass of the 8-bit 2D convolution using a 6-tap filter.
//
// Reads rows of 16-bit samples from src_ptr (src_stride elements apart),
// filters them vertically with y_filter and writes 8-bit results to dst_ptr
// (dst_stride bytes apart). h output rows are produced from h + 5 input rows.
//
// Every filtered value has 1 << (bd - 1), with bd fixed at 8, subtracted from
// it before being saturated to the unsigned 8-bit range.
// NOTE(review): this presumably cancels an offset introduced by the pass that
// produced src_ptr - confirm against the callers.
//
// w <= 4 is handled four columns at a time; any other width is processed in
// 8-column strips and must be a multiple of 8. On AArch64 four rows are
// emitted per loop iteration, so h must be a multiple of 4 there; the other
// path emits one row per iteration. All loops exit only on an exact zero
// count.
static inline void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16x8_t y_filter) {
  const int bit_depth = 8;
  const int16x8_t offset = vdupq_n_s16(1 << (bit_depth - 1));

  if (w <= 4) {
    // Prime the sliding window with the first five rows.
    int16x4_t r0, r1, r2, r3, r4;
    load_s16_4x5(src_ptr, src_stride, &r0, &r1, &r2, &r3, &r4);
    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t r5, r6, r7, r8;
      load_s16_4x4(src_ptr, src_stride, &r5, &r6, &r7, &r8);

      const int16x4_t f0 = convolve6_4_2d_v(r0, r1, r2, r3, r4, r5, y_filter);
      const int16x4_t f1 = convolve6_4_2d_v(r1, r2, r3, r4, r5, r6, y_filter);
      const int16x4_t f2 = convolve6_4_2d_v(r2, r3, r4, r5, r6, r7, y_filter);
      const int16x4_t f3 = convolve6_4_2d_v(r3, r4, r5, r6, r7, r8, y_filter);

      // Pack two rows per vector, remove the offset and saturate to 8 bits.
      const int16x8_t f01 = vsubq_s16(vcombine_s16(f0, f1), offset);
      const int16x8_t f23 = vsubq_s16(vcombine_s16(f2, f3), offset);
      const uint8x8_t out01 = vqmovun_s16(f01);
      const uint8x8_t out23 = vqmovun_s16(f23);

      store_u8x4_strided_x2(dst_ptr, dst_stride, out01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, out23);

      // The last five rows seed the window for the next four output rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;

      h -= 4;
      dst_ptr += 4 * dst_stride;
      src_ptr += 4 * src_stride;
#else   // !AOM_ARCH_AARCH64
      const int16x4_t r5 = vld1_s16(src_ptr);
      const int16x4_t f0 = convolve6_4_2d_v(r0, r1, r2, r3, r4, r5, y_filter);

      // Only the low four lanes carry data; the upper half is zero padding.
      const int16x8_t padded = vcombine_s16(f0, vdup_n_s16(0));
      const uint8x8_t out = vqmovun_s16(vsubq_s16(padded, offset));

      store_u8_4x1(dst_ptr, out);

      // Slide the window down by one row.
      r0 = r1;
      r1 = r2;
      r2 = r3;
      r3 = r4;
      r4 = r5;

      h--;
      dst_ptr += dst_stride;
      src_ptr += src_stride;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);

    return;
  }

  // Width is a multiple of 8 and height is a multiple of 4: walk the block in
  // 8-column strips, filtering each strip from top to bottom.
  do {
    int16_t *strip_src = src_ptr;
    uint8_t *strip_dst = dst_ptr;
    int rows_left = h;

    // Prime the sliding window with the first five rows of this strip.
    int16x8_t r0, r1, r2, r3, r4;
    load_s16_8x5(strip_src, src_stride, &r0, &r1, &r2, &r3, &r4);
    strip_src += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x8_t r5, r6, r7, r8;
      load_s16_8x4(strip_src, src_stride, &r5, &r6, &r7, &r8);

      const uint8x8_t out0 =
          convolve6_8_2d_v(r0, r1, r2, r3, r4, r5, y_filter, offset);
      const uint8x8_t out1 =
          convolve6_8_2d_v(r1, r2, r3, r4, r5, r6, y_filter, offset);
      const uint8x8_t out2 =
          convolve6_8_2d_v(r2, r3, r4, r5, r6, r7, y_filter, offset);
      const uint8x8_t out3 =
          convolve6_8_2d_v(r3, r4, r5, r6, r7, r8, y_filter, offset);

      store_u8_8x4(strip_dst, dst_stride, out0, out1, out2, out3);

      // The last five rows seed the window for the next four output rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;
      r3 = r7;
      r4 = r8;

      rows_left -= 4;
      strip_dst += 4 * dst_stride;
      strip_src += 4 * src_stride;
#else   // !AOM_ARCH_AARCH64
      const int16x8_t r5 = vld1q_s16(strip_src);
      const uint8x8_t out0 =
          convolve6_8_2d_v(r0, r1, r2, r3, r4, r5, y_filter, offset);
      vst1_u8(strip_dst, out0);

      // Slide the window down by one row.
      r0 = r1;
      r1 = r2;
      r2 = r3;
      r3 = r4;
      r4 = r5;

      rows_left--;
      strip_dst += dst_stride;
      strip_src += src_stride;
#endif  // AOM_ARCH_AARCH64
    } while (rows_left != 0);

    // Advance to the next 8-column strip.
    w -= 8;
    dst_ptr += 8;
    src_ptr += 8;
  } while (w != 0);
}
538*77c1e3ccSAndroid Build Coastguard Worker
// 4-tap vertical filter over four columns of 16-bit samples.
//
// s0..s3 are the four input rows of the filter window and y_filter holds the
// four taps in lanes 0..3. Products are accumulated in 32 bits and then
// narrowed to 16 bits with a rounding, saturating right shift by
// (2 * FILTER_BITS - ROUND0_BITS).
static inline int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t y_filter) {
  int32x4_t acc = vmull_lane_s16(s0, y_filter, 0);
  acc = vmlal_lane_s16(acc, s1, y_filter, 1);
  acc = vmlal_lane_s16(acc, s2, y_filter, 2);
  acc = vmlal_lane_s16(acc, s3, y_filter, 3);

  const int16x4_t narrowed = vqrshrn_n_s32(acc, 2 * FILTER_BITS - ROUND0_BITS);
  return narrowed;
}
549*77c1e3ccSAndroid Build Coastguard Worker
// 4-tap vertical filter over eight columns of 16-bit samples, producing eight
// unsigned 8-bit results.
//
// s0..s3 are the four input rows of the filter window and y_filter holds the
// four taps in lanes 0..3. The low and high four columns are accumulated
// separately in 32 bits, narrowed to 16 bits with a rounding, saturating right
// shift by (2 * FILTER_BITS - ROUND0_BITS), and recombined. sub_const is then
// subtracted from every lane and the result is saturated to the unsigned
// 8-bit range.
static inline uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x4_t y_filter,
                                         const int16x8_t sub_const) {
  // The two accumulators are independent: acc_lo covers columns 0..3 and
  // acc_hi covers columns 4..7.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), y_filter, 0);
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), y_filter, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), y_filter, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), y_filter, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), y_filter, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), y_filter, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), y_filter, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), y_filter, 3);

  const int16x4_t narrowed_lo =
      vqrshrn_n_s32(acc_lo, 2 * FILTER_BITS - ROUND0_BITS);
  const int16x4_t narrowed_hi =
      vqrshrn_n_s32(acc_hi, 2 * FILTER_BITS - ROUND0_BITS);
  const int16x8_t filtered = vcombine_s16(narrowed_lo, narrowed_hi);

  return vqmovun_s16(vsubq_s16(filtered, sub_const));
}
571*77c1e3ccSAndroid Build Coastguard Worker
// Vertical pass of the 8-bit 2D convolution using a 4-tap filter.
//
// Reads rows of 16-bit samples from src_ptr (src_stride elements apart),
// filters them vertically and writes 8-bit results to dst_ptr (dst_stride
// bytes apart). The four taps are loaded from y_filter[2..5]. h output rows
// are produced from h + 3 input rows.
//
// Every filtered value has 1 << (bd - 1), with bd fixed at 8, subtracted from
// it before being saturated to the unsigned 8-bit range.
// NOTE(review): this presumably cancels an offset introduced by the pass that
// produced src_ptr - confirm against the callers.
//
// w == 4 is handled four columns at a time; any other width is processed in
// 8-column strips and must be a multiple of 8. Four rows are emitted per loop
// iteration, so h must be a multiple of 4. All loops exit only on an exact
// zero count.
static inline void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *y_filter) {
  const int bit_depth = 8;
  const int16x8_t offset = vdupq_n_s16(1 << (bit_depth - 1));

  // Skip the first two coefficients of the filter array.
  const int16x4_t taps = vld1_s16(y_filter + 2);

  if (w == 4) {
    // Prime the sliding window with the first three rows.
    int16x4_t r0, r1, r2;
    load_s16_4x3(src_ptr, src_stride, &r0, &r1, &r2);
    src_ptr += 3 * src_stride;

    do {
      int16x4_t r3, r4, r5, r6;
      load_s16_4x4(src_ptr, src_stride, &r3, &r4, &r5, &r6);

      const int16x4_t f0 = convolve4_4_2d_v(r0, r1, r2, r3, taps);
      const int16x4_t f1 = convolve4_4_2d_v(r1, r2, r3, r4, taps);
      const int16x4_t f2 = convolve4_4_2d_v(r2, r3, r4, r5, taps);
      const int16x4_t f3 = convolve4_4_2d_v(r3, r4, r5, r6, taps);

      // Pack two rows per vector, remove the offset and saturate to 8 bits.
      const int16x8_t f01 = vsubq_s16(vcombine_s16(f0, f1), offset);
      const int16x8_t f23 = vsubq_s16(vcombine_s16(f2, f3), offset);
      const uint8x8_t out01 = vqmovun_s16(f01);
      const uint8x8_t out23 = vqmovun_s16(f23);

      store_u8x4_strided_x2(dst_ptr, dst_stride, out01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, out23);

      // The last three rows seed the window for the next four output rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;

      h -= 4;
      dst_ptr += 4 * dst_stride;
      src_ptr += 4 * src_stride;
    } while (h != 0);

    return;
  }

  // Width is a multiple of 8 and height is a multiple of 4: walk the block in
  // 8-column strips, filtering each strip from top to bottom.
  do {
    int16_t *strip_src = src_ptr;
    uint8_t *strip_dst = dst_ptr;
    int rows_left = h;

    // Prime the sliding window with the first three rows of this strip.
    int16x8_t r0, r1, r2;
    load_s16_8x3(strip_src, src_stride, &r0, &r1, &r2);
    strip_src += 3 * src_stride;

    do {
      int16x8_t r3, r4, r5, r6;
      load_s16_8x4(strip_src, src_stride, &r3, &r4, &r5, &r6);

      const uint8x8_t out0 = convolve4_8_2d_v(r0, r1, r2, r3, taps, offset);
      const uint8x8_t out1 = convolve4_8_2d_v(r1, r2, r3, r4, taps, offset);
      const uint8x8_t out2 = convolve4_8_2d_v(r2, r3, r4, r5, taps, offset);
      const uint8x8_t out3 = convolve4_8_2d_v(r3, r4, r5, r6, taps, offset);

      store_u8_8x4(strip_dst, dst_stride, out0, out1, out2, out3);

      // The last three rows seed the window for the next four output rows.
      r0 = r4;
      r1 = r5;
      r2 = r6;

      rows_left -= 4;
      strip_dst += 4 * dst_stride;
      strip_src += 4 * src_stride;
    } while (rows_left != 0);

    // Advance to the next 8-column strip.
    w -= 8;
    dst_ptr += 8;
    src_ptr += 8;
  } while (w != 0);
}
646*77c1e3ccSAndroid Build Coastguard Worker
647*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
648