/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/highbd_compound_convolve_neon.h"
#include "av1/common/arm/highbd_convolve_neon.h"
25
// 6-tap convolution of four 12-bit pixels. The 6 non-zero taps live in
// lanes 1-6 of the 8-lane `filter` vector (lanes 0 and 7 are zero).
static inline uint16x4_t highbd_12_convolve6_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Fold `offset` in on the first multiply-accumulate so no separate add
  // is needed.
  int32x4_t acc = vmlal_lane_s16(offset, s0, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);

  // Saturating narrow; 12-bit paths shift by 2 extra bits versus 8/10-bit.
  return vqshrun_n_s32(acc, ROUND0_BITS + 2);
}
43
// 6-tap convolution of four 8/10-bit pixels. The 6 non-zero taps live in
// lanes 1-6 of the 8-lane `filter` vector (lanes 0 and 7 are zero).
static inline uint16x4_t highbd_convolve6_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Seed the accumulator with `offset` via the first multiply-accumulate.
  int32x4_t acc = vmlal_lane_s16(offset, s0, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);

  // Saturating narrow to the intermediate compound precision.
  return vqshrun_n_s32(acc, ROUND0_BITS);
}
61
// 6-tap convolution of eight 12-bit pixels, processed as two independent
// 4-lane accumulations. Lanes 0 and 7 of `filter` are zero.
static inline uint16x8_t highbd_12_convolve6_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Low half of the 8 output lanes.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);

  // High half of the 8 output lanes.
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);

  // Saturating narrow both halves; 12-bit paths use 2 extra shift bits.
  return vcombine_u16(vqshrun_n_s32(acc_lo, ROUND0_BITS + 2),
                      vqshrun_n_s32(acc_hi, ROUND0_BITS + 2));
}
87
// 6-tap convolution of eight 8/10-bit pixels, processed as two independent
// 4-lane accumulations. Lanes 0 and 7 of `filter` are zero.
static inline uint16x8_t highbd_convolve6_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);

  // Both halves shift by ROUND0_BITS (the low half previously used the
  // literal 3, which equals ROUND0_BITS but hid the relationship).
  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
                      vqshrun_n_s32(sum1, ROUND0_BITS));
}
112
// Horizontal 6-tap dist-wtd convolution for 12-bit content. Writes the
// intermediate (pre-average) values to dst_ptr. Processes the block in
// 8x4 tiles, so callers must guarantee w % 8 == 0 and h % 4 == 0 (this
// path is only selected when w != 4).
static inline void highbd_12_dist_wtd_convolve_x_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // For each of 4 rows, load 6 overlapping 8-wide windows (stride 1
      // between taps) covering the 6 filter positions.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                            s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                            s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                            s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
158
// Horizontal 6-tap dist-wtd convolution for 8/10-bit content. Identical
// structure to the 12-bit variant above but with the smaller narrowing
// shift inside highbd_convolve6_8. Requires w % 8 == 0 and h % 4 == 0.
static inline void highbd_dist_wtd_convolve_x_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Load 6 overlapping 8-wide windows per row, one per filter tap.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
204
// Full 8-tap convolution of four 12-bit pixels.
static inline uint16x4_t highbd_12_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // `offset` is folded into the first multiply-accumulate.
  int32x4_t acc = vmlal_lane_s16(offset, s0, taps_lo, 0);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);

  // Saturating narrow; 12-bit paths shift by 2 extra bits.
  return vqshrun_n_s32(acc, ROUND0_BITS + 2);
}
224
// Full 8-tap convolution of four 8/10-bit pixels.
static inline uint16x4_t highbd_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // `offset` is folded into the first multiply-accumulate.
  int32x4_t acc = vmlal_lane_s16(offset, s0, taps_lo, 0);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);

  // Saturating narrow to the intermediate compound precision.
  return vqshrun_n_s32(acc, ROUND0_BITS);
}
244
// Full 8-tap convolution of eight 12-bit pixels, processed as two
// independent 4-lane accumulations.
static inline uint16x8_t highbd_12_convolve8_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Low half of the 8 output lanes.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_lo, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);

  // High half of the 8 output lanes.
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  // Saturating narrow both halves; 12-bit paths use 2 extra shift bits.
  return vcombine_u16(vqshrun_n_s32(acc_lo, ROUND0_BITS + 2),
                      vqshrun_n_s32(acc_hi, ROUND0_BITS + 2));
}
274
// Full 8-tap convolution of eight 8/10-bit pixels, processed as two
// independent 4-lane accumulations.
static inline uint16x8_t highbd_convolve8_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(filter);
  const int16x4_t taps_hi = vget_high_s16(filter);

  // Low half of the 8 output lanes.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_lo, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);

  // High half of the 8 output lanes.
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  // Saturating narrow both halves.
  return vcombine_u16(vqshrun_n_s32(acc_lo, ROUND0_BITS),
                      vqshrun_n_s32(acc_hi, ROUND0_BITS));
}
304
// Horizontal 4-tap convolution of four 12-bit pixels.
static inline uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
                                                 const int16x4_t x_filter,
                                                 const int32x4_t offset) {
  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);

  // 12-bit paths shift by 2 extra bits. Spell the shift as
  // ROUND0_BITS + 2 (== 5) to match the other 12-bit helpers instead of
  // using a bare magic constant.
  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
}
315
// Horizontal 4-tap convolution of four 8/10-bit pixels.
static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                              const int16x4_t x_filter,
                                              const int32x4_t offset) {
  // `offset` is folded into the first multiply-accumulate.
  int32x4_t acc = vmlal_lane_s16(offset, s[0], x_filter, 0);
  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);

  return vqshrun_n_s32(acc, ROUND0_BITS);
}
326
// Horizontal dist-wtd convolution for 12-bit content: 4-tap path for
// width-4 blocks, full 8-tap path otherwise. Assumes h % 4 == 0 and, on
// the wide path, w % 8 == 0.
static inline void highbd_12_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    // Skip the two zero outer taps: use kernel entries [2..5] and start
    // reading 2 samples further right.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      // 4 overlapping 4-wide windows per row, one per filter tap.
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // 8 overlapping 8-wide windows per row, one per filter tap.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
                                  s1[6], s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
                                  s2[6], s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
                                  s3[6], s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
401
// Horizontal dist-wtd convolution for 8/10-bit content: 4-tap path for
// width-4 blocks, full 8-tap path otherwise. Assumes h % 4 == 0 and, on
// the wide path, w % 8 == 0.
static inline void highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    // Skip the two zero outer taps: use kernel entries [2..5] and start
    // reading 2 samples further right.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      // 4 overlapping 4-wide windows per row, one per filter tap.
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // 8 overlapping 8-wide windows per row, one per filter tap.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
                               s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
                               s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
                               s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
476
// Public entry point for the high-bitdepth horizontal dist-wtd convolution.
// Dispatches on bitdepth (12 vs 8/10), filter length (<=6 taps vs full
// 8/4 taps) and whether a compound average is required. When averaging,
// the convolution result is staged in im_block and then combined into
// dst by the (dist-wtd) comp-avg helpers; otherwise it is written
// straight to conv_params->dst.
void av1_highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  // Center the filter window on the source pixel.
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
  // Rounding term for round_0 plus the compound-output offset terms.
  const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
                              (1 << (bd + FILTER_BITS)) +
                              (1 << (bd + FILTER_BITS - 1));

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  src -= horiz_offset;

  // horizontal filter
  if (bd == 12) {
    if (conv_params->do_average) {
      if (x_filter_taps <= 6 && w != 4) {
        // 6-tap kernels have a zero tap 0, so the window starts at src + 1.
        highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
                                                im_stride, w, h, x_filter_ptr,
                                                offset_convolve);
      } else {
        highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride,
                                           w, h, x_filter_ptr, offset_convolve);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                conv_params);
      }
    } else {
      if (x_filter_taps <= 6 && w != 4) {
        highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
                                                dst16_stride, w, h,
                                                x_filter_ptr, offset_convolve);
      } else {
        highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride,
                                           w, h, x_filter_ptr, offset_convolve);
      }
    }
  } else {
    if (conv_params->do_average) {
      if (x_filter_taps <= 6 && w != 4) {
        // 6-tap kernels have a zero tap 0, so the window starts at src + 1.
        highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
                                             im_stride, w, h, x_filter_ptr,
                                             offset_convolve);
      } else {
        highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w,
                                        h, x_filter_ptr, offset_convolve);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      } else {
        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    } else {
      if (x_filter_taps <= 6 && w != 4) {
        highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
                                             dst16_stride, w, h, x_filter_ptr,
                                             offset_convolve);
      } else {
        highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w,
                                        h, x_filter_ptr, offset_convolve);
      }
    }
  }
}
555
// Vertical 6-tap dist-wtd convolution for 12-bit content. Rows are kept
// in a rolling window of registers: after each 4-row tile, s0..s4 are
// shifted down so only 4 new rows are loaded per iteration. Assumes
// h % 4 == 0 and (for w != 4) w % 8 == 0.
static inline void highbd_12_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the rolling window with the first 5 rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Shift the window down by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide columns, each run top-to-bottom.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Shift the window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
634
// Vertical 6-tap dist-wtd convolution for 8/10-bit content. Same rolling
// register-window structure as the 12-bit variant above. Assumes
// h % 4 == 0 and (for w != 4) w % 8 == 0.
static inline void highbd_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the rolling window with the first 5 rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Shift the window down by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide columns, each run top-to-bottom.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Shift the window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
713
// Vertical 4-tap convolution of four 12-bit pixels.
static inline uint16x4_t highbd_12_convolve4_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
  // `offset` is folded into the first multiply-accumulate.
  int32x4_t acc = vmlal_lane_s16(offset, s0, filter, 0);
  acc = vmlal_lane_s16(acc, s1, filter, 1);
  acc = vmlal_lane_s16(acc, s2, filter, 2);
  acc = vmlal_lane_s16(acc, s3, filter, 3);

  // 12-bit paths shift by 2 extra bits.
  return vqshrun_n_s32(acc, ROUND0_BITS + 2);
}
724
// Vertical 4-tap convolution of eight 12-bit pixels, processed as two
// independent 4-lane accumulations.
static inline uint16x8_t highbd_12_convolve4_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
  // Low half of the 8 output lanes.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filter, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filter, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filter, 3);

  // High half of the 8 output lanes.
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filter, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filter, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filter, 3);

  // Saturating narrow both halves; 12-bit paths use 2 extra shift bits.
  return vcombine_u16(vqshrun_n_s32(acc_lo, ROUND0_BITS + 2),
                      vqshrun_n_s32(acc_hi, ROUND0_BITS + 2));
}
741
// Vertical 4-tap dist-wtd convolution for 12-bit content. Uses kernel
// entries [2..5] (the 4 non-zero taps of the 8-tap layout) and keeps a
// rolling 3-row register window, loading 4 new rows per 4-row tile.
// Assumes h % 4 == 0 and (for w != 4) w % 8 == 0.
static inline void highbd_12_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the rolling window with the first 3 rows.
    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      uint16x4_t d0 =
          highbd_12_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_12_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_12_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_12_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Shift the window down by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide columns, each run top-to-bottom.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint16x8_t d0 =
            highbd_12_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Shift the window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
818
static inline uint16x4_t highbd_convolve4_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
  // Two independent accumulator chains shorten the dependency chain;
  // integer addition is associative so the result is unchanged.
  int32x4_t acc0 = vmlal_lane_s16(offset, s0, filter, 0);
  int32x4_t acc1 = vmull_lane_s16(s1, filter, 1);
  acc0 = vmlal_lane_s16(acc0, s2, filter, 2);
  acc1 = vmlal_lane_s16(acc1, s3, filter, 3);

  // Saturating narrow with a ROUND0_BITS right shift (truncating, no round).
  return vqshrun_n_s32(vaddq_s32(acc0, acc1), ROUND0_BITS);
}
829
static inline uint16x8_t highbd_convolve4_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
  // Run the low and high halves of each 8-lane row through two parallel
  // 32-bit accumulators; accumulation order does not affect the result.
  int32x4_t sum_lo = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
  int32x4_t sum_hi = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0);

  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), filter, 1);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), filter, 1);

  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), filter, 2);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), filter, 2);

  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), filter, 3);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), filter, 3);

  // Saturating narrow back to 16 bits, dropping ROUND0_BITS.
  return vcombine_u16(vqshrun_n_s32(sum_lo, ROUND0_BITS),
                      vqshrun_n_s32(sum_hi, ROUND0_BITS));
}
846
// Vertical 4-tap compound convolution for 8/10-bit sources (same structure
// as the 12-bit variant but uses the ROUND0_BITS shift helpers).
// NOTE(review): assumes h is a multiple of 4 and w is 4 or a multiple of
// 8 — confirm against callers.
static inline void highbd_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // The 4 non-zero coefficients occupy indices 2..5 of the 8-tap kernel.
  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime with the first 3 rows; 4 new rows in, 4 output rows out.
    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the sliding window forward by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: 8-column strips, 4 rows per iteration.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint16x8_t d0 =
            highbd_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the sliding window forward by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
919
// Vertical 8-tap compound convolution for 12-bit sources.
// NOTE(review): assumes h is a multiple of 4 and w is 4 or a multiple of
// 8 — confirm against callers.
static inline void highbd_12_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 7-row sliding window; each iteration loads 4 new rows and
    // emits 4 output rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
                                            y_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
                                            y_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
                                            y_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
                                            y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the 7-row window forward by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: 8-column strips, 4 rows per iteration.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
                                              y_filter, offset_vec);
        uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
                                              y_filter, offset_vec);
        uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
                                              y_filter, offset_vec);
        uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
                                              y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the 7-row window forward by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
// Vertical 8-tap compound convolution for 8/10-bit sources.
// NOTE(review): assumes h is a multiple of 4 and w is 4 or a multiple of
// 8 — confirm against callers.
static inline void highbd_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 7-row sliding window; 4 new rows in, 4 output rows out.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
                                         y_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
                                         y_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
                                         y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the 7-row window forward by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: 8-column strips, 4 rows per iteration.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
                                           y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
                                           y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
                                           y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
                                           y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the 7-row window forward by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1084
// Entry point for the high bit-depth distance-weighted (compound) vertical
// convolution. Dispatches on bit depth (12 vs 8/10), filter tap count
// (<=4 / 6 / 8) and whether the result is averaged against the compound
// accumulator in conv_params->dst.
void av1_highbd_dist_wtd_convolve_y_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    ConvolveParams *conv_params, int bd) {
  // Scratch buffer holding the convolution output when averaging is needed.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  // Start taps/2 - 1 rows above the output row to center the 8-tap kernel.
  const int vert_offset = filter_params_y->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
  // round_0 rounding term plus the two compound-prediction offset terms.
  const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
                                (1 << (bd + FILTER_BITS)) +
                                (1 << (bd + FILTER_BITS - 1));

  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  src -= vert_offset * src_stride;

  if (bd == 12) {
    if (conv_params->do_average) {
      // Convolve into im_block, then average into dst below.
      if (y_filter_taps <= 4) {
        // 4-tap kernels occupy taps 2..5, so skip the 2 leading zero rows.
        highbd_12_dist_wtd_convolve_y_4tap_neon(
            src + 2 * src_stride, src_stride, im_block, im_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        // 6-tap kernels occupy taps 1..6, so skip the 1 leading zero row.
        highbd_12_dist_wtd_convolve_y_6tap_neon(
            src + src_stride, src_stride, im_block, im_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else {
        highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
                                                im_stride, w, h, y_filter_ptr,
                                                round_offset_conv);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                conv_params);
      }
    } else {
      // No averaging: convolve straight into the compound buffer.
      if (y_filter_taps <= 4) {
        highbd_12_dist_wtd_convolve_y_4tap_neon(
            src + 2 * src_stride, src_stride, dst16, dst16_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_12_dist_wtd_convolve_y_6tap_neon(
            src + src_stride, src_stride, dst16, dst16_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else {
        highbd_12_dist_wtd_convolve_y_8tap_neon(
            src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
            round_offset_conv);
      }
    }
  } else {
    if (conv_params->do_average) {
      // Convolve into im_block, then average into dst below.
      if (y_filter_taps <= 4) {
        highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride,
                                             im_block, im_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
                                             im_block, im_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else {
        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
                                             im_stride, w, h, y_filter_ptr,
                                             round_offset_conv);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      } else {
        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    } else {
      // No averaging: convolve straight into the compound buffer.
      if (y_filter_taps <= 4) {
        highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride,
                                             dst16, dst16_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
                                             dst16, dst16_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else {
        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
                                             dst16_stride, w, h, y_filter_ptr,
                                             round_offset_conv);
      }
    }
  }
}
1182
static inline void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
                                       uint16_t *dst_ptr, int dst_stride, int w,
                                       int h, const int round_bits,
                                       const int offset) {
  // Shift each sample left by round_bits and add the compound offset.
  if (w <= 4) {
    const int16x4_t shift_vec = vdup_n_s16(round_bits);
    const uint16x4_t offset_vec = vdup_n_u16(offset);

    for (int y = 0; y < h; ++y) {
      // Always load 4 samples; only the first 2 are stored when w == 2.
      const uint16x4_t in = vld1_u16(src_ptr + y * src_stride);
      const uint16x4_t out = vadd_u16(vshl_u16(in, shift_vec), offset_vec);
      if (w == 2) {
        store_u16_2x1(dst_ptr + y * dst_stride, out);
      } else {
        vst1_u16(dst_ptr + y * dst_stride, out);
      }
    }
  } else {
    const int16x8_t shift_vec = vdupq_n_s16(round_bits);
    const uint16x8_t offset_vec = vdupq_n_u16(offset);

    for (int y = 0; y < h; ++y) {
      const uint16_t *s_row = src_ptr + y * src_stride;
      uint16_t *d_row = dst_ptr + y * dst_stride;
      // w > 4 implies at least one full 8-lane chunk per row.
      int x = w;
      do {
        const uint16x8_t in = vld1q_u16(s_row);
        vst1q_u16(d_row, vaddq_u16(vshlq_u16(in, shift_vec), offset_vec));
        s_row += 8;
        d_row += 8;
        x -= 8;
      } while (x > 0);
    }
  }
}
1215
void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
                                               int src_stride, uint16_t *dst,
                                               int dst_stride, int w, int h,
                                               ConvolveParams *conv_params,
                                               int bd) {
  // Compound "copy" path: shift the source up and add the compound offset,
  // then optionally average against the accumulator in conv_params->dst.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  const int im_stride = MAX_SB_SIZE;
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  assert(round_bits >= 0);

  if (!conv_params->do_average) {
    // No averaging: write the offset samples straight to the compound
    // destination buffer and return.
    highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
                        round_offset);
    return;
  }

  // Averaging path: stage the offset samples in im_block, then blend them
  // with the accumulator into dst.
  highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
                      round_offset);

  if (conv_params->use_dist_wtd_comp_avg) {
    if (bd == 12) {
      highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                       h, conv_params);
    } else {
      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    conv_params, bd);
    }
  } else {
    if (bd == 12) {
      highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                              conv_params);
    } else {
      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                           conv_params, bd);
    }
  }
}
1262
static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
  // 6-tap kernel stored in an 8-tap register: taps 0 and 7 are zero, so
  // only coefficients 1..6 are applied.
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  // Two independent accumulator chains; integer addition order does not
  // change the result.
  int32x4_t acc0 = vmlal_lane_s16(offset, s0, taps_lo, 1);
  acc0 = vmlal_lane_s16(acc0, s2, taps_lo, 3);
  acc0 = vmlal_lane_s16(acc0, s4, taps_hi, 1);

  int32x4_t acc1 = vmull_lane_s16(s1, taps_lo, 2);
  acc1 = vmlal_lane_s16(acc1, s3, taps_hi, 0);
  acc1 = vmlal_lane_s16(acc1, s5, taps_hi, 2);

  // Rounding shift and saturating narrow to 16 bits.
  return vqrshrun_n_s32(vaddq_s32(acc0, acc1), COMPOUND_ROUND1_BITS);
}
1280
static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
  // 6-tap kernel stored in an 8-tap register: taps 0 and 7 are zero, so
  // only coefficients 1..6 are applied.
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  // Interleave low-half and high-half accumulation; order of the integer
  // additions does not change the result.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_lo, 1);
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_lo, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);

  // Rounding shift and saturating narrow to 16 bits.
  return vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
}
1306
// Vertical pass of the 2-D compound convolution, 6-tap filter. Input is the
// intermediate buffer produced by the horizontal pass.
// NOTE(review): assumes h is a multiple of 4 and w is 4 or a multiple of
// 8 — confirm against callers.
static inline void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 5-row sliding window; each iteration loads 4 new rows and
    // emits 4 output rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the 5-row window forward by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: 8-column strips, 4 rows per iteration.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5,
                                                y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6,
                                                y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7,
                                                y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8,
                                                y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the 5-row window forward by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1385
static inline uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t offset) {
  // Split the 8-tap kernel into halves for lane-indexed multiplies.
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  // Even/odd accumulator chains; integer addition order does not change
  // the result.
  int32x4_t even = vmlal_lane_s16(offset, s0, taps_lo, 0);
  even = vmlal_lane_s16(even, s2, taps_lo, 2);
  even = vmlal_lane_s16(even, s4, taps_hi, 0);
  even = vmlal_lane_s16(even, s6, taps_hi, 2);

  int32x4_t odd = vmull_lane_s16(s1, taps_lo, 1);
  odd = vmlal_lane_s16(odd, s3, taps_lo, 3);
  odd = vmlal_lane_s16(odd, s5, taps_hi, 1);
  odd = vmlal_lane_s16(odd, s7, taps_hi, 3);

  // Rounding shift and saturating narrow to 16 bits.
  return vqrshrun_n_s32(vaddq_s32(even, odd), COMPOUND_ROUND1_BITS);
}
1405
static inline uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t offset) {
  // Split the 8-tap kernel into halves for lane-indexed multiplies.
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  // Interleave low-half and high-half accumulation; the order of integer
  // additions does not change the result.
  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), taps_lo, 0);
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), taps_lo, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  // Rounding shift and saturating narrow to 16 bits.
  return vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
}
1435
// Vertical pass of the 2-D compound convolution, 8-tap filter. Input is the
// intermediate buffer produced by the horizontal pass. Note this variant
// takes w <= 4 (not w == 4) in the narrow path.
// NOTE(review): assumes h is a multiple of 4 and w is <= 4 or a multiple
// of 8 — confirm against callers.
static inline void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w <= 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 7-row sliding window; 4 new rows in, 4 output rows out.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                              y_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                              y_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                              y_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                              y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the 7-row window forward by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: 8-column strips, 4 rows per iteration.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                                y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                                y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                                y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                                y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the 7-row window forward by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1518
// Horizontal 6-tap pass of the 2-D compound convolution for 12-bit sources.
// Processes 4 rows per iteration while more than 4 rows remain, then the
// final 1..4 rows one at a time (h >= 5 guarantees both loops run).
static inline void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  // Main loop: 4 rows x 8 columns per inner iteration.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // For each row, load 6 overlapping 8-lane vectors at unit stride —
      // one per filter tap position.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                            s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                            s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                            s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: remaining rows, one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
1588
// Horizontal 6-tap pass of the 2-D compound convolution for 8/10-bit
// sources. Processes 4 rows per iteration while more than 4 rows remain,
// then the final 1..4 rows one at a time (h >= 5 guarantees both loops run).
static inline void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  // Main loop: 4 rows x 8 columns per inner iteration.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // For each row, load 6 overlapping 8-lane vectors at unit stride —
      // one per filter tap position.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: remaining rows, one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
1658
// Horizontal pass of the 12-bit dist-wtd 2D convolution. Width-4 blocks
// use a 4-tap filter (taken from the middle of the 8-tap kernel, source
// shifted right by one); wider blocks use the full 8-tap filter. Output
// is the offset-biased intermediate for the subsequent vertical pass.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *src = (const int16_t *)(src_ptr + 1);
    uint16_t *dst = dst_ptr;

    // Main loop: 4 rows per iteration.
    do {
      int16x4_t in[4][4];
      uint16x4_t out[4];
      for (int r = 0; r < 4; ++r) {
        load_s16_4x4(src + r * src_stride, 1, &in[r][0], &in[r][1], &in[r][2],
                     &in[r][3]);
        out[r] = highbd_12_convolve4_4_x(in[r], x_filter, offset_vec);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Tail loop: remaining 1-4 rows.
    do {
      int16x4_t in[4];
      load_s16_4x4(src, 1, &in[0], &in[1], &in[2], &in[3]);

      uint16x4_t out = highbd_12_convolve4_4_x(in, x_filter, offset_vec);
      vst1_u16(dst, out);

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int rows_left = h;

    // Main loop: blocks of 4 rows x 8 columns.
    do {
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;
      int cols_left = w;

      do {
        int16x8_t in[4][8];
        uint16x8_t out[4];
        for (int r = 0; r < 4; ++r) {
          load_s16_8x8(src + r * src_stride, 1, &in[r][0], &in[r][1],
                       &in[r][2], &in[r][3], &in[r][4], &in[r][5], &in[r][6],
                       &in[r][7]);
          out[r] = highbd_12_convolve8_8(in[r][0], in[r][1], in[r][2],
                                         in[r][3], in[r][4], in[r][5],
                                         in[r][6], in[r][7], x_filter,
                                         offset_vec);
        }

        store_u16_8x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

        src += 8;
        dst += 8;
        cols_left -= 8;
      } while (cols_left != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left > 4);

    // Tail loop: remaining 1-4 rows, one at a time.
    do {
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;
      int cols_left = w;

      do {
        int16x8_t in[8];
        load_s16_8x8(src, 1, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                     &in[6], &in[7]);

        uint16x8_t out =
            highbd_12_convolve8_8(in[0], in[1], in[2], in[3], in[4], in[5],
                                  in[6], in[7], x_filter, offset_vec);
        vst1q_u16(dst, out);

        src += 8;
        dst += 8;
        cols_left -= 8;
      } while (cols_left != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--rows_left != 0);
  }
}
1770
// Horizontal pass of the high-bitdepth dist-wtd 2D convolution
// (non-12-bit path). Width-4 blocks use a 4-tap filter (middle of the
// 8-tap kernel, source shifted right by one); wider blocks use the full
// 8-tap filter. Output is the offset-biased intermediate for the
// subsequent vertical pass.
static inline void highbd_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *src = (const int16_t *)(src_ptr + 1);
    uint16_t *dst = dst_ptr;

    // Main loop: 4 rows per iteration.
    do {
      int16x4_t in[4][4];
      uint16x4_t out[4];
      for (int r = 0; r < 4; ++r) {
        load_s16_4x4(src + r * src_stride, 1, &in[r][0], &in[r][1], &in[r][2],
                     &in[r][3]);
        out[r] = highbd_convolve4_4_x(in[r], x_filter, offset_vec);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Tail loop: remaining 1-4 rows.
    do {
      int16x4_t in[4];
      load_s16_4x4(src, 1, &in[0], &in[1], &in[2], &in[3]);

      uint16x4_t out = highbd_convolve4_4_x(in, x_filter, offset_vec);
      vst1_u16(dst, out);

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int rows_left = h;

    // Main loop: blocks of 4 rows x 8 columns.
    do {
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;
      int cols_left = w;

      do {
        int16x8_t in[4][8];
        uint16x8_t out[4];
        for (int r = 0; r < 4; ++r) {
          load_s16_8x8(src + r * src_stride, 1, &in[r][0], &in[r][1],
                       &in[r][2], &in[r][3], &in[r][4], &in[r][5], &in[r][6],
                       &in[r][7]);
          out[r] = highbd_convolve8_8(in[r][0], in[r][1], in[r][2], in[r][3],
                                      in[r][4], in[r][5], in[r][6], in[r][7],
                                      x_filter, offset_vec);
        }

        store_u16_8x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

        src += 8;
        dst += 8;
        cols_left -= 8;
      } while (cols_left != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left > 4);

    // Tail loop: remaining 1-4 rows, one at a time.
    do {
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;
      int cols_left = w;

      do {
        int16x8_t in[8];
        load_s16_8x8(src, 1, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                     &in[6], &in[7]);

        uint16x8_t out =
            highbd_convolve8_8(in[0], in[1], in[2], in[3], in[4], in[5],
                               in[6], in[7], x_filter, offset_vec);
        vst1q_u16(dst, out);

        src += 8;
        dst += 8;
        cols_left -= 8;
      } while (cols_left != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--rows_left != 0);
  }
}
1882
// High-bitdepth dist-wtd (compound) 2D convolution entry point.
// Pipeline: horizontal filter (src -> im_block), vertical filter
// (im_block -> im_block2 when averaging, else directly to the compound
// buffer conv_params->dst), then an optional compound-averaging step
// into dst. Filter taps below 6 are clamped up to 6 for the NEON
// kernels; dispatch is on bit depth (12 vs. lower) and tap count.
void av1_highbd_dist_wtd_convolve_2d_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  // The NEON kernels handle a minimum of 6 taps.
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;

  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a
  // faster non-rounding non-saturating left shift.
  const int round_offset_conv_x =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);

  // Back the source pointer up so the filters are centered on the output.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Horizontal filter: the 6-tap kernels are only usable when the block is
  // wider than 4 (width-4 blocks take the 4-tap path inside the generic
  // functions).
  if (x_filter_taps <= 6 && w != 4) {
    if (bd == 12) {
      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    }
  } else {
    if (bd == 12) {
      highbd_12_dist_wtd_convolve_2d_horiz_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
                                             im_stride, w, im_h, x_filter_ptr,
                                             round_offset_conv_x);
    }
  }

  // Vertical filter. When averaging, route the output through im_block2 so
  // the compound average can be done in a separate pass; otherwise write
  // straight to the compound destination buffer.
  uint16_t *const vert_dst = conv_params->do_average ? im_block2 : dst16;
  const int vert_dst_stride =
      conv_params->do_average ? im_stride : dst16_stride;

  if (y_filter_taps <= 6) {
    highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, vert_dst,
                                               vert_dst_stride, w, h,
                                               y_filter_ptr,
                                               round_offset_conv_y);
  } else {
    highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, vert_dst,
                                               vert_dst_stride, w, h,
                                               y_filter_ptr,
                                               round_offset_conv_y);
  }

  // Do the compound averaging outside the loop, avoids branching within the
  // main loop
  if (conv_params->do_average) {
    if (conv_params->use_dist_wtd_comp_avg) {
      if (bd == 12) {
        highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      }
    } else {
      if (bd == 12) {
        highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                                conv_params);
      } else {
        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    }
  }
}
1986