/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12 #include <assert.h>
13 #include <arm_neon.h>
14
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_ports/mem.h"
21 #include "av1/common/convolve.h"
22 #include "av1/common/filter.h"
23
// Apply a 6-tap vertical filter to four pixels, round/narrow by
// COMPOUND_ROUND1_BITS and clamp to the bitdepth maximum.
static inline uint16x4_t highbd_convolve6_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const uint16x4_t max) {
  // Taps 0 and 7 of y_filter are zero, so only taps 1..6 are applied.
  const int16x4_t filt03 = vget_low_s16(y_filter);
  const int16x4_t filt47 = vget_high_s16(y_filter);

  int32x4_t acc = vmull_lane_s16(s0, filt03, 1);
  acc = vmlal_lane_s16(acc, s1, filt03, 2);
  acc = vmlal_lane_s16(acc, s2, filt03, 3);
  acc = vmlal_lane_s16(acc, s3, filt47, 0);
  acc = vmlal_lane_s16(acc, s4, filt47, 1);
  acc = vmlal_lane_s16(acc, s5, filt47, 2);

  // Saturating rounded narrow to u16, then clamp to the pixel range.
  return vmin_u16(vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS), max);
}
42
// Apply a 6-tap vertical filter to eight pixels, round/narrow by
// COMPOUND_ROUND1_BITS and clamp to the bitdepth maximum.
static inline uint16x8_t highbd_convolve6_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const uint16x8_t max) {
  // Taps 0 and 7 of y_filter are zero, so only taps 1..6 are applied.
  const int16x4_t filt03 = vget_low_s16(y_filter);
  const int16x4_t filt47 = vget_high_s16(y_filter);

  // Accumulate the lower four lanes.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), filt03, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filt03, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filt03, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filt47, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), filt47, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), filt47, 2);

  // Accumulate the upper four lanes.
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), filt03, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filt03, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filt03, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filt47, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), filt47, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), filt47, 2);

  const uint16x8_t res =
      vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                   vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
  return vminq_u16(res, max);
}
69
// Vertical 6-tap high-bitdepth convolution. Width 4 uses 4-lane vectors;
// otherwise width is a multiple of 8 and height a multiple of 4.
static inline void highbd_convolve_y_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int bd) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    // Skip the first input row: with taps 0 and 7 zero, the 6-tap kernel
    // starts one row below the 8-tap-aligned pointer supplied by the caller.
    const int16_t *src = (const int16_t *)(src_ptr + src_stride);
    uint16_t *dst = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(src, src_stride, &s0, &s1, &s2, &s3, &s4);
    src += 5 * src_stride;

    do {
      // Load four new rows and emit four output rows per iteration.
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(src, src_stride, &s5, &s6, &s7, &s8);

      const uint16x4_t d0 =
          highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max);
      const uint16x4_t d1 =
          highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max);
      const uint16x4_t d2 =
          highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max);
      const uint16x4_t d3 =
          highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Slide the five-row context window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    // Process the block in 8-wide columns, each column top to bottom.
    do {
      int rows_left = h;
      const int16_t *src = (const int16_t *)(src_ptr + src_stride);
      uint16_t *dst = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(src, src_stride, &s0, &s1, &s2, &s3, &s4);
      src += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(src, src_stride, &s5, &s6, &s7, &s8);

        const uint16x8_t d0 =
            highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max);
        const uint16x8_t d1 =
            highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max);
        const uint16x8_t d2 =
            highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max);
        const uint16x8_t d3 =
            highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max);

        store_u16_8x4(dst, dst_stride, d0, d1, d2, d3);

        // Slide the context window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
151
// Apply an 8-tap vertical filter to four pixels, round/narrow by
// COMPOUND_ROUND1_BITS and clamp to the bitdepth maximum.
static inline uint16x4_t highbd_convolve8_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const uint16x4_t max) {
  const int16x4_t filt03 = vget_low_s16(y_filter);
  const int16x4_t filt47 = vget_high_s16(y_filter);

  int32x4_t acc = vmull_lane_s16(s0, filt03, 0);
  acc = vmlal_lane_s16(acc, s1, filt03, 1);
  acc = vmlal_lane_s16(acc, s2, filt03, 2);
  acc = vmlal_lane_s16(acc, s3, filt03, 3);
  acc = vmlal_lane_s16(acc, s4, filt47, 0);
  acc = vmlal_lane_s16(acc, s5, filt47, 1);
  acc = vmlal_lane_s16(acc, s6, filt47, 2);
  acc = vmlal_lane_s16(acc, s7, filt47, 3);

  // Saturating rounded narrow to u16, then clamp to the pixel range.
  return vmin_u16(vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS), max);
}
172
// Apply an 8-tap vertical filter to eight pixels, round/narrow by
// COMPOUND_ROUND1_BITS and clamp to the bitdepth maximum.
static inline uint16x8_t highbd_convolve8_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const uint16x8_t max) {
  const int16x4_t filt03 = vget_low_s16(y_filter);
  const int16x4_t filt47 = vget_high_s16(y_filter);

  // Accumulate the lower four lanes.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), filt03, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filt03, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filt03, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filt03, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), filt47, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), filt47, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), filt47, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), filt47, 3);

  // Accumulate the upper four lanes.
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), filt03, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filt03, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filt03, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filt03, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), filt47, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), filt47, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), filt47, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), filt47, 3);

  const uint16x8_t res =
      vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                   vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
  return vminq_u16(res, max);
}
203
// Vertical 8-tap high-bitdepth convolution. Width 4 uses 4-lane vectors;
// otherwise width is a multiple of 8 and height a multiple of 4.
static inline void highbd_convolve_y_sr_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *src = (const int16_t *)src_ptr;
    uint16_t *dst = dst_ptr;

    // Prime the seven-row context window.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src += 7 * src_stride;

    do {
      // Load four new rows and emit four output rows per iteration.
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src, src_stride, &s7, &s8, &s9, &s10);

      const uint16x4_t d0 =
          highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
      const uint16x4_t d1 =
          highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
      const uint16x4_t d2 =
          highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
      const uint16x4_t d3 =
          highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Slide the context window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-wide columns, each column top to bottom.
    do {
      int rows_left = h;
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      src += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(src, src_stride, &s7, &s8, &s9, &s10);

        const uint16x8_t d0 = highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6,
                                                   s7, y_filter, max);
        const uint16x8_t d1 = highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7,
                                                   s8, y_filter, max);
        const uint16x8_t d2 = highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8,
                                                   s9, y_filter, max);
        const uint16x8_t d3 = highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9,
                                                   s10, y_filter, max);

        store_u16_8x4(dst, dst_stride, d0, d1, d2, d3);

        // Slide the context window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
288
// Apply a 12-tap vertical filter to four pixels, round/narrow by
// COMPOUND_ROUND1_BITS and clamp to the bitdepth maximum.
static inline uint16x4_t highbd_convolve12_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const uint16x4_t max) {
  const int16x4_t filt03 = vget_low_s16(y_filter_0_7);
  const int16x4_t filt47 = vget_high_s16(y_filter_0_7);

  int32x4_t acc = vmull_lane_s16(s0, filt03, 0);
  acc = vmlal_lane_s16(acc, s1, filt03, 1);
  acc = vmlal_lane_s16(acc, s2, filt03, 2);
  acc = vmlal_lane_s16(acc, s3, filt03, 3);
  acc = vmlal_lane_s16(acc, s4, filt47, 0);
  acc = vmlal_lane_s16(acc, s5, filt47, 1);
  acc = vmlal_lane_s16(acc, s6, filt47, 2);
  acc = vmlal_lane_s16(acc, s7, filt47, 3);
  acc = vmlal_lane_s16(acc, s8, y_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s9, y_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s10, y_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s11, y_filter_8_11, 3);

  // Saturating rounded narrow to u16, then clamp to the pixel range.
  return vmin_u16(vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS), max);
}
315
// Apply a 12-tap vertical filter to eight pixels, round/narrow by
// COMPOUND_ROUND1_BITS and clamp to the bitdepth maximum.
static inline uint16x8_t highbd_convolve12_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const uint16x8_t max) {
  const int16x4_t filt03 = vget_low_s16(y_filter_0_7);
  const int16x4_t filt47 = vget_high_s16(y_filter_0_7);

  // Accumulate the lower four lanes.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), filt03, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filt03, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filt03, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filt03, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), filt47, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), filt47, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), filt47, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), filt47, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s8), y_filter_8_11, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s9), y_filter_8_11, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s10), y_filter_8_11, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s11), y_filter_8_11, 3);

  // Accumulate the upper four lanes.
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), filt03, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filt03, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filt03, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filt03, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), filt47, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), filt47, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), filt47, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), filt47, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s8), y_filter_8_11, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s9), y_filter_8_11, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s10), y_filter_8_11, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s11), y_filter_8_11, 3);

  const uint16x8_t res =
      vcombine_u16(vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS),
                   vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS));
  return vminq_u16(res, max);
}
356
// Vertical 12-tap high-bitdepth convolution. Width 4 uses 4-lane vectors;
// otherwise width is a multiple of 8 and height a multiple of 4.
static inline void highbd_convolve_y_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  // The 12 taps are split across one 8-lane and one 4-lane vector.
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *src = (const int16_t *)src_ptr;
    uint16_t *dst = dst_ptr;

    // Prime the eleven-row context window.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
                  &s8, &s9, &s10);
    src += 11 * src_stride;

    do {
      // Load four new rows and emit four output rows per iteration.
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(src, src_stride, &s11, &s12, &s13, &s14);

      const uint16x4_t d0 =
          highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                s11, y_filter_0_7, y_filter_8_11, max);
      const uint16x4_t d1 =
          highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                s12, y_filter_0_7, y_filter_8_11, max);
      const uint16x4_t d2 =
          highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                s13, y_filter_0_7, y_filter_8_11, max);
      const uint16x4_t d3 =
          highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                                s14, y_filter_0_7, y_filter_8_11, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Slide the context window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-wide columns, each column top to bottom.
    do {
      int rows_left = h;
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
                    &s8, &s9, &s10);
      src += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(src, src_stride, &s11, &s12, &s13, &s14);

        const uint16x8_t d0 =
            highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                  s11, y_filter_0_7, y_filter_8_11, max);
        const uint16x8_t d1 =
            highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                  s12, y_filter_0_7, y_filter_8_11, max);
        const uint16x8_t d2 =
            highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                  s12, s13, y_filter_0_7, y_filter_8_11, max);
        const uint16x8_t d3 =
            highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                  s13, s14, y_filter_0_7, y_filter_8_11, max);

        store_u16_8x4(dst, dst_stride, d0, d1, d2, d3);

        // Slide the context window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
461
// Top-level vertical single-reference high-bitdepth convolve: dispatches to
// the 6/8/12-tap NEON kernels, falling back to C for 2-wide/2-tall blocks.
void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  // The SIMD kernels assume dimensions of at least 4; defer tiny blocks to C.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn, bd);
    return;
  }

  const int taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Rewind to the top of the filter's vertical support region.
  src -= (filter_params_y->taps / 2 - 1) * src_stride;

  if (taps > 8) {
    highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                                    filter, bd);
  } else if (taps < 8) {
    highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h,
                                   filter, bd);
  } else {
    highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
                                   filter, bd);
  }
}
492
// Apply a 6-tap horizontal filter to eight pixels. `offset` carries the
// pre-added rounding constant so a single FILTER_BITS shift suffices.
static inline uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x8_t max) {
  // Taps 0 and 7 of x_filter are zero, so only taps 1..6 are applied.
  const int16x4_t filt03 = vget_low_s16(x_filter);
  const int16x4_t filt47 = vget_high_s16(x_filter);

  // Accumulate the lower four lanes.
  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[0]), filt03, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), filt03, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), filt03, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), filt47, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), filt47, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), filt47, 2);

  // Accumulate the upper four lanes.
  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[0]), filt03, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), filt03, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), filt03, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), filt47, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), filt47, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), filt47, 2);

  const uint16x8_t res = vcombine_u16(vqrshrun_n_s32(acc_lo, FILTER_BITS),
                                      vqrshrun_n_s32(acc_hi, FILTER_BITS));
  return vminq_u16(res, max);
}
521
// Horizontal 6-tap high-bitdepth convolution over a block whose width is a
// multiple of 8 and height a multiple of 4.
static inline void highbd_convolve_x_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // Pre-add the round_0 rounding constant so only one final shift is needed.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  int rows_left = h;

  do {
    int cols_left = w;
    const int16_t *src = (const int16_t *)src_ptr;
    uint16_t *dst = dst_ptr;

    do {
      // Gather six horizontally-shifted vectors for each of four rows.
      int16x8_t r0[6], r1[6], r2[6], r3[6];
      load_s16_8x6(src + 0 * src_stride, 1, &r0[0], &r0[1], &r0[2], &r0[3],
                   &r0[4], &r0[5]);
      load_s16_8x6(src + 1 * src_stride, 1, &r1[0], &r1[1], &r1[2], &r1[3],
                   &r1[4], &r1[5]);
      load_s16_8x6(src + 2 * src_stride, 1, &r2[0], &r2[1], &r2[2], &r2[3],
                   &r2[4], &r2[5]);
      load_s16_8x6(src + 3 * src_stride, 1, &r3[0], &r3[1], &r3[2], &r3[3],
                   &r3[4], &r3[5]);

      const uint16x8_t d0 = highbd_convolve6_8_x(r0, x_filter, offset, max);
      const uint16x8_t d1 = highbd_convolve6_8_x(r1, x_filter, offset, max);
      const uint16x8_t d2 = highbd_convolve6_8_x(r2, x_filter, offset, max);
      const uint16x8_t d3 = highbd_convolve6_8_x(r3, x_filter, offset, max);

      store_u16_8x4(dst, dst_stride, d0, d1, d2, d3);

      src += 8;
      dst += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left != 0);
}
566
// Apply a 4-tap horizontal filter to four pixels. `offset` carries the
// pre-added rounding constant so a single FILTER_BITS shift suffices.
static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                              const int16x4_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x4_t max) {
  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s[0], x_filter, 0);
  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);

  // Saturating rounded narrow to u16, then clamp to the pixel range.
  return vmin_u16(vqrshrun_n_s32(acc, FILTER_BITS), max);
}
580
// Apply an 8-tap horizontal filter to eight pixels. `offset` carries the
// pre-added rounding constant so a single FILTER_BITS shift suffices.
static inline uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x8_t max) {
  const int16x4_t filt03 = vget_low_s16(x_filter);
  const int16x4_t filt47 = vget_high_s16(x_filter);

  // Accumulate the lower four lanes.
  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[0]), filt03, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), filt03, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), filt03, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), filt03, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), filt47, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), filt47, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), filt47, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), filt47, 3);

  // Accumulate the upper four lanes.
  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[0]), filt03, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), filt03, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), filt03, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), filt03, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), filt47, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), filt47, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), filt47, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), filt47, 3);

  const uint16x8_t res = vcombine_u16(vqrshrun_n_s32(acc_lo, FILTER_BITS),
                                      vqrshrun_n_s32(acc_hi, FILTER_BITS));
  return vminq_u16(res, max);
}
612
// Horizontal high-bitdepth convolution: 4-tap path for width 4, 8-tap path
// for widths that are multiples of 8 (height a multiple of 4 in both).
static inline void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
                                             int src_stride, uint16_t *dst_ptr,
                                             int dst_stride, int w, int h,
                                             const int16_t *x_filter_ptr,
                                             ConvolveParams *conv_params,
                                             int bd) {
  // Pre-add the round_0 rounding constant so only one final shift is needed.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    // Width-4 blocks use the middle four taps of the 8-tap kernel.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *src = (const int16_t *)(src_ptr + 2);
    uint16_t *dst = dst_ptr;

    do {
      // Gather four horizontally-shifted vectors for each of four rows.
      int16x4_t r0[4], r1[4], r2[4], r3[4];
      load_s16_4x4(src + 0 * src_stride, 1, &r0[0], &r0[1], &r0[2], &r0[3]);
      load_s16_4x4(src + 1 * src_stride, 1, &r1[0], &r1[1], &r1[2], &r1[3]);
      load_s16_4x4(src + 2 * src_stride, 1, &r2[0], &r2[1], &r2[2], &r2[3]);
      load_s16_4x4(src + 3 * src_stride, 1, &r3[0], &r3[1], &r3[2], &r3[3]);

      const uint16x4_t d0 = highbd_convolve4_4_x(r0, x_filter, offset, max);
      const uint16x4_t d1 = highbd_convolve4_4_x(r1, x_filter, offset, max);
      const uint16x4_t d2 = highbd_convolve4_4_x(r2, x_filter, offset, max);
      const uint16x4_t d3 = highbd_convolve4_4_x(r3, x_filter, offset, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int rows_left = h;

    do {
      int cols_left = w;
      const int16_t *src = (const int16_t *)src_ptr;
      uint16_t *dst = dst_ptr;

      do {
        // Gather eight horizontally-shifted vectors for each of four rows.
        int16x8_t r0[8], r1[8], r2[8], r3[8];
        load_s16_8x8(src + 0 * src_stride, 1, &r0[0], &r0[1], &r0[2], &r0[3],
                     &r0[4], &r0[5], &r0[6], &r0[7]);
        load_s16_8x8(src + 1 * src_stride, 1, &r1[0], &r1[1], &r1[2], &r1[3],
                     &r1[4], &r1[5], &r1[6], &r1[7]);
        load_s16_8x8(src + 2 * src_stride, 1, &r2[0], &r2[1], &r2[2], &r2[3],
                     &r2[4], &r2[5], &r2[6], &r2[7]);
        load_s16_8x8(src + 3 * src_stride, 1, &r3[0], &r3[1], &r3[2], &r3[3],
                     &r3[4], &r3[5], &r3[6], &r3[7]);

        const uint16x8_t d0 = highbd_convolve8_8_x(r0, x_filter, offset, max);
        const uint16x8_t d1 = highbd_convolve8_8_x(r1, x_filter, offset, max);
        const uint16x8_t d2 = highbd_convolve8_8_x(r2, x_filter, offset, max);
        const uint16x8_t d3 = highbd_convolve8_8_x(r3, x_filter, offset, max);

        store_u16_8x4(dst, dst_stride, d0, d1, d2, d3);

        src += 8;
        dst += 8;
        cols_left -= 8;
      } while (cols_left != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);
  }
}
685
// Apply a 12-tap horizontal filter to four pixels. `offset` carries the
// pre-added rounding constant so a single FILTER_BITS shift suffices.
static inline uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
                                               const int16x8_t x_filter_0_7,
                                               const int16x4_t x_filter_8_11,
                                               const int32x4_t offset,
                                               const uint16x4_t max) {
  const int16x4_t filt03 = vget_low_s16(x_filter_0_7);
  const int16x4_t filt47 = vget_high_s16(x_filter_0_7);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s[0], filt03, 0);
  acc = vmlal_lane_s16(acc, s[1], filt03, 1);
  acc = vmlal_lane_s16(acc, s[2], filt03, 2);
  acc = vmlal_lane_s16(acc, s[3], filt03, 3);
  acc = vmlal_lane_s16(acc, s[4], filt47, 0);
  acc = vmlal_lane_s16(acc, s[5], filt47, 1);
  acc = vmlal_lane_s16(acc, s[6], filt47, 2);
  acc = vmlal_lane_s16(acc, s[7], filt47, 3);
  acc = vmlal_lane_s16(acc, s[8], x_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s[9], x_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s[10], x_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s[11], x_filter_8_11, 3);

  // Saturating rounded narrow to u16, then clamp to the pixel range.
  return vmin_u16(vqrshrun_n_s32(acc, FILTER_BITS), max);
}
711
// Apply a 12-tap horizontal filter to eight pixels. `offset` carries the
// pre-added rounding constant so a single FILTER_BITS shift suffices.
static inline uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
                                               const int16x8_t x_filter_0_7,
                                               const int16x4_t x_filter_8_11,
                                               const int32x4_t offset,
                                               const uint16x8_t max) {
  const int16x4_t filt03 = vget_low_s16(x_filter_0_7);
  const int16x4_t filt47 = vget_high_s16(x_filter_0_7);

  // Accumulate the lower four lanes.
  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[0]), filt03, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), filt03, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), filt03, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), filt03, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), filt47, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), filt47, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), filt47, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), filt47, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[8]), x_filter_8_11, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[9]), x_filter_8_11, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[10]), x_filter_8_11, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[11]), x_filter_8_11, 3);

  // Accumulate the upper four lanes.
  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[0]), filt03, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), filt03, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), filt03, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), filt03, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), filt47, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), filt47, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), filt47, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), filt47, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[8]), x_filter_8_11, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[9]), x_filter_8_11, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[10]), x_filter_8_11, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[11]), x_filter_8_11, 3);

  const uint16x8_t res = vcombine_u16(vqrshrun_n_s32(acc_lo, FILTER_BITS),
                                      vqrshrun_n_s32(acc_hi, FILTER_BITS));
  return vminq_u16(res, max);
}
752
// 12-tap horizontal single-reference convolution. The block is processed in
// strips of four rows; `w == 4` uses the 4-wide helper, otherwise rows are
// covered eight pixels at a time. The loop structure assumes h % 4 == 0 and
// (for the wide path) w % 8 == 0 -- TODO confirm against callers.
static inline void highbd_convolve_x_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  // Seeding the accumulator with this bias lets the helpers do a single
  // rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Load 12 horizontally-consecutive samples for each of four rows.
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 =
          highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d1 =
          highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d2 =
          highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d3 =
          highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 =
            highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d1 =
            highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d2 =
            highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d3 =
            highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
842
// Public entry point for the high-bitdepth horizontal single-reference
// convolution. Dispatches to the 12-tap, 6-tap or generic NEON kernel based
// on the filter length, falling back to the C implementation for blocks too
// small to vectorise.
void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // 2-wide / 2-tall blocks are not worth vectorising.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int taps = get_filter_tap(filter_params_x, subpel_x_qn);
  // Back src up to the first sample covered by the filter window.
  const uint16_t *src_shifted = src - (filter_params_x->taps / 2 - 1);

  if (taps > 8) {
    highbd_convolve_x_sr_12tap_neon(src_shifted, src_stride, dst, dst_stride,
                                    w, h, x_filter_ptr, conv_params, bd);
  } else if (taps <= 6 && w != 4) {
    // The 6-tap kernel is stored with a leading zero tap; skip it.
    highbd_convolve_x_sr_6tap_neon(src_shifted + 1, src_stride, dst,
                                   dst_stride, w, h, x_filter_ptr, conv_params,
                                   bd);
  } else {
    highbd_convolve_x_sr_neon(src_shifted, src_stride, dst, dst_stride, w, h,
                              x_filter_ptr, conv_params, bd);
  }
}
874
// Vertical 6-tap filter producing four outputs for the 2D convolve second
// pass. The 6 taps occupy lanes 1-6 of y_filter (lanes 0 and 7 are zero).
// `offset` seeds the accumulator; `round_shift` holds the negated shift
// amount for vshlq. Results are clamped to `max`.
static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t filt_lo = vget_low_s16(y_filter);
  const int16x4_t filt_hi = vget_high_s16(y_filter);

  int32x4_t acc = vmlal_lane_s16(offset, s0, filt_lo, 1);
  acc = vmlal_lane_s16(acc, s1, filt_lo, 2);
  acc = vmlal_lane_s16(acc, s2, filt_lo, 3);
  acc = vmlal_lane_s16(acc, s3, filt_hi, 0);
  acc = vmlal_lane_s16(acc, s4, filt_hi, 1);
  acc = vmlal_lane_s16(acc, s5, filt_hi, 2);

  // Shift down, saturate-narrow to u16, then clamp to the pixel maximum.
  const uint16x4_t res = vqmovun_s32(vshlq_s32(acc, round_shift));
  return vmin_u16(res, max);
}
895
// Vertical 6-tap filter producing eight outputs for the 2D convolve second
// pass. The 6 taps occupy lanes 1-6 of y_filter (lanes 0 and 7 are zero).
// Low and high halves of the 8-wide vectors are accumulated in parallel.
static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t filt_lo = vget_low_s16(y_filter);
  const int16x4_t filt_hi = vget_high_s16(y_filter);

  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s0), filt_lo, 1);
  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s0), filt_lo, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filt_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filt_lo, 2);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filt_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filt_lo, 3);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filt_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filt_hi, 0);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), filt_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), filt_hi, 1);
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), filt_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), filt_hi, 2);

  // Shift down, saturate-narrow each half, recombine, clamp to max.
  acc_lo = vshlq_s32(acc_lo, round_shift);
  acc_hi = vshlq_s32(acc_hi, round_shift);
  const uint16x8_t res = vcombine_u16(vqmovun_s32(acc_lo), vqmovun_s32(acc_hi));
  return vminq_u16(res, max);
}
925
// Vertical 6-tap pass of the 2D single-reference convolution. Keeps a
// sliding window of source rows and produces four output rows per iteration.
// `offset` is the caller-supplied bias added before the round-1 shift.
// Assumes h % 4 == 0 and (for the wide path) w % 8 == 0 -- TODO confirm.
static inline void highbd_convolve_2d_sr_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so vshlq performs a right shift inside the helpers.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;
    // Prime the sliding window with the first five rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 = highbd_convolve6_4_2d_v(
          s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve6_4_2d_v(
          s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve6_4_2d_v(
          s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve6_4_2d_v(
          s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-pixel-wide columns, full height each time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1012
// Vertical 8-tap filter producing four outputs for the 2D convolve second
// pass. `offset` seeds the accumulator; `round_shift` holds the negated
// shift amount for vshlq. Results are clamped to `max`.
static inline uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t filt_0_3 = vget_low_s16(y_filter);
  const int16x4_t filt_4_7 = vget_high_s16(y_filter);

  int32x4_t acc = vmlal_lane_s16(offset, s0, filt_0_3, 0);
  acc = vmlal_lane_s16(acc, s1, filt_0_3, 1);
  acc = vmlal_lane_s16(acc, s2, filt_0_3, 2);
  acc = vmlal_lane_s16(acc, s3, filt_0_3, 3);
  acc = vmlal_lane_s16(acc, s4, filt_4_7, 0);
  acc = vmlal_lane_s16(acc, s5, filt_4_7, 1);
  acc = vmlal_lane_s16(acc, s6, filt_4_7, 2);
  acc = vmlal_lane_s16(acc, s7, filt_4_7, 3);

  // Shift down, saturate-narrow to u16, then clamp to the pixel maximum.
  return vmin_u16(vqmovun_s32(vshlq_s32(acc, round_shift)), max);
}
1034
// Vertical 8-tap filter producing eight outputs for the 2D convolve second
// pass. `offset` seeds both accumulators; `round_shift` holds the negated
// shift amount for vshlq. Results are clamped to `max`.
static inline uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  // Taps 0-7 for the low four lanes.
  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);

  // Taps 0-7 for the high four lanes.
  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);

  // Shift down, saturate-narrow each half, recombine, clamp to max.
  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
  return vminq_u16(res, max);
}
1067
// Vertical 8-tap pass of the 2D single-reference convolution. Keeps a
// sliding window of source rows and produces four output rows per iteration.
// `offset` is the caller-supplied bias added before the round-1 shift.
// Assumes h % 4 == 0 and (for the wide path) w % 8 == 0 -- TODO confirm.
static inline void highbd_convolve_2d_sr_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so vshlq performs a right shift inside the helpers.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first seven rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                  round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-pixel-wide columns, full height each time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1164
// Vertical 12-tap filter producing four outputs for the 2D convolve second
// pass. `offset` seeds the accumulator; `round_shift` holds the negated
// shift amount for vshlq. Results are clamped to `max`.
static inline uint16x4_t highbd_convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  // Shift down, saturate-narrow to u16, then clamp to the pixel maximum.
  sum = vshlq_s32(sum, round_shift);
  uint16x4_t res = vqmovun_s32(sum);
  return vmin_u16(res, max);
}
1192
// Vertical 12-tap filter producing eight outputs for the 2D convolve second
// pass. `offset` seeds both accumulators; `round_shift` holds the negated
// shift amount for vshlq. Results are clamped to `max`.
static inline uint16x8_t highbd_convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Taps 0-11 for the low four lanes.
  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  // Taps 0-11 for the high four lanes.
  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  // Shift down, saturate-narrow each half, recombine, clamp to max.
  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
  return vminq_u16(res, max);
}
1235
// Vertical 12-tap pass of the 2D single-reference convolution. Keeps a
// sliding window of source rows and produces four output rows per iteration.
// `offset` is the caller-supplied bias added before the round-1 shift.
// Assumes h % 4 == 0 and (for the wide path) w % 8 == 0 -- TODO confirm.
static inline void highbd_convolve_2d_sr_vert_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    const int bd, const int offset) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so vshlq performs a right shift inside the helpers.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first eleven rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 = highbd_convolve12_4_2d_v(
          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve12_4_2d_v(
          s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve12_4_2d_v(
          s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve12_4_2d_v(
          s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block in 8-pixel-wide columns, full height each time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 = highbd_convolve12_8_2d_v(
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d1 = highbd_convolve12_8_2d_v(
            s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d2 = highbd_convolve12_8_2d_v(
            s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d3 = highbd_convolve12_8_2d_v(
            s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
1344
// Horizontal 6-tap filter producing eight intermediate outputs for the 2D
// convolve first pass. `offset` seeds the accumulators and `shift_s32` holds
// the negated round-0 shift for vqrshlq. No clamping: the result feeds the
// vertical pass at intermediate precision.
static inline uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  // Values at indices 0 and 7 of x_filter are zero, so only lanes 1-6 are
  // used below.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  // Saturating rounding shift, then narrow each half and recombine.
  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}
1372
// Horizontal 6-tap pass of the 2D single-reference convolution. Processes
// rows in strips of four, then falls through to a row-at-a-time tail for the
// final (filter_taps/2 - 1) extra rows the vertical pass requires.
// Assumes w % 8 == 0 -- TODO confirm against callers.
static inline void highbd_convolve_2d_sr_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  // Negated so vqrshlq performs a rounding right shift.
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int height = h;

  // Main loop: four rows per iteration while more than four rows remain.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Six horizontally-consecutive samples per row, for four rows.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x8_t d1 =
          highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x8_t d2 =
          highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x8_t d3 =
          highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);
  // Tail loop: the remaining 1-4 rows, one row per iteration.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
1443
// Horizontal 4-tap filter producing four intermediate outputs for the 2D
// convolve first pass. `offset` seeds the accumulator and `shift_s32` holds
// the negated round-0 shift for vqrshlq.
static inline uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
                                                 const int16x4_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  int32x4_t acc = vmlal_lane_s16(offset, s[0], x_filter, 0);
  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);

  // Saturating rounding shift, then narrow to unsigned 16 bits.
  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
}
1456
// Horizontal 8-tap filter producing eight intermediate outputs for the 2D
// convolve first pass. `offset` seeds the accumulators and `shift_s32` holds
// the negated round-0 shift for vqrshlq.
static inline uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  // Taps 0-7 for the low four lanes.
  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);

  // Taps 0-7 for the high four lanes.
  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);

  // Saturating rounding shift, then narrow each half and recombine.
  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}
1487
// Horizontal (first) pass of the 2D high-bitdepth convolution: filters `h`
// rows of width `w` from src_ptr into the intermediate buffer dst_ptr. Each
// accumulator is pre-biased by `offset` and shifted down (with rounding) by
// conv_params->round_0.
static inline void highbd_convolve_2d_sr_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Only the middle four taps of the kernel are loaded; the +1 source
    // offset re-centres the shorter kernel on the same pixels.
    // NOTE(review): assumes the outer taps are zero for this path -- confirm
    // against the filter tables selected by the callers.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    // Process four rows per iteration while more than four rows remain.
    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    // 8-tap path for widths that are multiples of 8.
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Process four rows per iteration while more than four rows remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
1603
// Apply a 12-tap horizontal filter to four pixels and narrow the result to
// unsigned 16 bits with saturation. The first eight taps come from
// x_filter_0_7 and the last four from x_filter_8_11; `offset` pre-biases
// the accumulator and shift_s32 holds -round_0 for the rounding shift.
static inline uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t filt_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t filt_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s[0], filt_0_3, 0);
  acc = vmlal_lane_s16(acc, s[1], filt_0_3, 1);
  acc = vmlal_lane_s16(acc, s[2], filt_0_3, 2);
  acc = vmlal_lane_s16(acc, s[3], filt_0_3, 3);
  acc = vmlal_lane_s16(acc, s[4], filt_4_7, 0);
  acc = vmlal_lane_s16(acc, s[5], filt_4_7, 1);
  acc = vmlal_lane_s16(acc, s[6], filt_4_7, 2);
  acc = vmlal_lane_s16(acc, s[7], filt_4_7, 3);
  acc = vmlal_lane_s16(acc, s[8], x_filter_8_11, 0);
  acc = vmlal_lane_s16(acc, s[9], x_filter_8_11, 1);
  acc = vmlal_lane_s16(acc, s[10], x_filter_8_11, 2);
  acc = vmlal_lane_s16(acc, s[11], x_filter_8_11, 3);

  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
}
1628
// Apply a 12-tap horizontal filter to eight pixels. Each 8-wide source
// vector is split into its low/high halves once up front; the two halves
// are accumulated independently, rounded/shifted by -round_0 (saturating),
// and recombined after narrowing with unsigned saturation.
static inline uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t filt_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t filt_4_7 = vget_high_s16(x_filter_0_7);

  int16x4_t lo[12], hi[12];
  for (int i = 0; i < 12; ++i) {
    lo[i] = vget_low_s16(s[i]);
    hi[i] = vget_high_s16(s[i]);
  }

  int32x4_t acc_lo = offset;
  acc_lo = vmlal_lane_s16(acc_lo, lo[0], filt_0_3, 0);
  acc_lo = vmlal_lane_s16(acc_lo, lo[1], filt_0_3, 1);
  acc_lo = vmlal_lane_s16(acc_lo, lo[2], filt_0_3, 2);
  acc_lo = vmlal_lane_s16(acc_lo, lo[3], filt_0_3, 3);
  acc_lo = vmlal_lane_s16(acc_lo, lo[4], filt_4_7, 0);
  acc_lo = vmlal_lane_s16(acc_lo, lo[5], filt_4_7, 1);
  acc_lo = vmlal_lane_s16(acc_lo, lo[6], filt_4_7, 2);
  acc_lo = vmlal_lane_s16(acc_lo, lo[7], filt_4_7, 3);
  acc_lo = vmlal_lane_s16(acc_lo, lo[8], x_filter_8_11, 0);
  acc_lo = vmlal_lane_s16(acc_lo, lo[9], x_filter_8_11, 1);
  acc_lo = vmlal_lane_s16(acc_lo, lo[10], x_filter_8_11, 2);
  acc_lo = vmlal_lane_s16(acc_lo, lo[11], x_filter_8_11, 3);

  int32x4_t acc_hi = offset;
  acc_hi = vmlal_lane_s16(acc_hi, hi[0], filt_0_3, 0);
  acc_hi = vmlal_lane_s16(acc_hi, hi[1], filt_0_3, 1);
  acc_hi = vmlal_lane_s16(acc_hi, hi[2], filt_0_3, 2);
  acc_hi = vmlal_lane_s16(acc_hi, hi[3], filt_0_3, 3);
  acc_hi = vmlal_lane_s16(acc_hi, hi[4], filt_4_7, 0);
  acc_hi = vmlal_lane_s16(acc_hi, hi[5], filt_4_7, 1);
  acc_hi = vmlal_lane_s16(acc_hi, hi[6], filt_4_7, 2);
  acc_hi = vmlal_lane_s16(acc_hi, hi[7], filt_4_7, 3);
  acc_hi = vmlal_lane_s16(acc_hi, hi[8], x_filter_8_11, 0);
  acc_hi = vmlal_lane_s16(acc_hi, hi[9], x_filter_8_11, 1);
  acc_hi = vmlal_lane_s16(acc_hi, hi[10], x_filter_8_11, 2);
  acc_hi = vmlal_lane_s16(acc_hi, hi[11], x_filter_8_11, 3);

  acc_lo = vqrshlq_s32(acc_lo, shift_s32);
  acc_hi = vqrshlq_s32(acc_hi, shift_s32);

  return vcombine_u16(vqmovun_s32(acc_lo), vqmovun_s32(acc_hi));
}
1668
// Horizontal (first) pass of the 12-tap 2D high-bitdepth convolution.
// Filters `h` rows of width `w` from src_ptr into the intermediate buffer
// dst_ptr, pre-biasing each accumulator by `offset` and shifting down (with
// rounding) by conv_params->round_0.
static inline void highbd_convolve_2d_sr_horiz_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  // The 12 taps are split across one 8-lane and one 4-lane vector.
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Process four rows per iteration while more than four rows remain.
    do {
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int16x4_t s0[12];
      load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
                    &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    // Width is a multiple of 8: process 8 columns x 4 rows per iteration.
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d1 = highbd_convolve12_8_2d_h(
            s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d2 = highbd_convolve12_8_2d_h(
            s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d3 = highbd_convolve12_8_2d_h(
            s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process the remaining 1-4 rows one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
        // Equivalent to `width != 0` since width decreases in steps of 8.
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
1797
// 2D (horizontal then vertical) sub-pixel convolution for high bitdepth,
// single reference. The horizontal pass writes into an intermediate buffer,
// the vertical pass writes the final pixels into dst; the filter lengths
// select which specialised NEON kernels are used.
void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
  // The SIMD paths below assume block dimensions of at least 4; fall back to
  // the C implementation for 2-wide or 2-tall blocks.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }
  // Intermediate buffer holding the horizontally-filtered rows.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  // The specialised kernels here handle a minimum of 6 taps.
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
  // The horizontal pass must produce (taps - 1) extra rows for the vertical
  // pass.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  // Bias added in the horizontal pass so intermediate values stay
  // non-negative.
  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
  // simple shift left instead of a rounding saturating shift left.
  const int y_offset =
      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));

  // Start reading vert_offset rows above and horiz_offset pixels left of the
  // block so the filter windows are centred on the output pixels.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (x_filter_taps > 8) {
    // 12-tap path: both passes use dedicated 12-tap kernels.
    highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           conv_params, x_offset_initial);

    highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
                                          w, h, y_filter_ptr, conv_params, bd,
                                          y_offset);
    return;
  }
  if (x_filter_taps <= 6 && w != 4) {
    highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
                                          im_stride, w, im_h, x_filter_ptr,
                                          conv_params, x_offset_initial);
  } else {
    // Generic path (8-tap, or any tap count when w == 4).
    highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
                                     w, im_h, x_filter_ptr, conv_params,
                                     x_offset_initial);
  }

  if (y_filter_taps <= 6) {
    highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  } else {
    highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  }
}
1866
// Filter used is [64, 64]: each output pixel is the rounded average of two
// horizontally adjacent source pixels (half-pel intra block copy).
void av1_highbd_convolve_x_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;
  (void)bd;

  if (w <= 4) {
    for (int y = h; y != 0; --y) {
      // vrhadd computes (a + b + 1) >> 1: the [64, 64] filter plus the full
      // rounding shift in one instruction.
      uint16x4_t left = vld1_u16(src);
      uint16x4_t right = vld1_u16(src + 1);
      uint16x4_t avg = vrhadd_u16(left, right);

      if (w == 2) {
        store_u16_2x1(dst, avg);
      } else {
        vst1_u16(dst, avg);
      }

      src += src_stride;
      dst += dst_stride;
    }
  } else {
    for (int y = h; y != 0; --y) {
      const uint16_t *s = src;
      uint16_t *d = dst;
      int x = w;

      do {
        uint16x8_t left = vld1q_u16(s);
        uint16x8_t right = vld1q_u16(s + 1);

        vst1q_u16(d, vrhaddq_u16(left, right));

        s += 8;
        d += 8;
        x -= 8;
      } while (x != 0);
      src += src_stride;
      dst += dst_stride;
    }
  }
}
1919
// Filter used is [64, 64]: each output pixel is the rounded average of two
// vertically adjacent source pixels (half-pel intra block copy).
void av1_highbd_convolve_y_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    int bd) {
  assert(subpel_y_qn == 8);
  assert(filter_params_y->taps == 2);
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)bd;

  if (w <= 4) {
    for (int y = h; y != 0; --y) {
      // vrhadd computes (a + b + 1) >> 1: the [64, 64] filter plus the full
      // rounding shift in one instruction.
      uint16x4_t top = vld1_u16(src);
      uint16x4_t bottom = vld1_u16(src + src_stride);
      uint16x4_t avg = vrhadd_u16(top, bottom);

      if (w == 2) {
        store_u16_2x1(dst, avg);
      } else {
        vst1_u16(dst, avg);
      }

      src += src_stride;
      dst += dst_stride;
    }
  } else {
    // Walk 8-wide column strips; within each strip walk down the rows.
    do {
      const uint16_t *s = src;
      uint16_t *d = dst;
      int y = h;

      do {
        uint16x8_t top = vld1q_u16(s);
        uint16x8_t bottom = vld1q_u16(s + src_stride);

        vst1q_u16(d, vrhaddq_u16(top, bottom));

        s += src_stride;
        d += dst_stride;
      } while (--y != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
1970
// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
// The horizontal pass stores unrounded 2-pixel sums in an intermediate
// buffer; the vertical pass then combines two such sums and applies the
// full rounding via halving adds (see comments below).
void av1_highbd_convolve_2d_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;
  (void)bd;

  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  // One extra row so the vertical 2-tap filter has its second input.
  int im_h = h + 1;
  int im_stride = MAX_SB_SIZE;

  uint16x8_t vert_offset = vdupq_n_u16(1);

  uint16_t *im = im_block;

  // Horizontal filter: store the raw sum s0 + s1 (no shift yet).
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(src);
      uint16x4_t s1 = vld1_u16(src + 1);

      uint16x4_t d0 = vadd_u16(s0, s1);

      // Safe to store the whole vector, the im buffer is big enough.
      vst1_u16(im, d0);

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  } else {
    do {
      const uint16_t *src_ptr = src;
      uint16_t *im_ptr = im;
      int width = w;

      do {
        uint16x8_t s0 = vld1q_u16(src_ptr);
        uint16x8_t s1 = vld1q_u16(src_ptr + 1);

        uint16x8_t d0 = vaddq_u16(s0, s1);

        vst1q_u16(im_ptr, d0);

        src_ptr += 8;
        im_ptr += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  }

  im = im_block;

  // Vertical filter. Two halving adds with a +1 bias compute
  // ((a + b) >> 1 + 1) >> 1, which equals the rounded (a + b + 2) >> 2 for
  // all inputs, while keeping every intermediate within 16 bits.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(im);
      uint16x4_t s1 = vld1_u16(im + im_stride);

      uint16x4_t d0 = vhadd_u16(s0, s1);
      d0 = vhadd_u16(d0, vget_low_u16(vert_offset));

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      im += im_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    // Walk 8-wide column strips; within each strip walk down the rows.
    do {
      uint16_t *im_ptr = im;
      uint16_t *dst_ptr = dst;
      int height = h;

      do {
        uint16x8_t s0 = vld1q_u16(im_ptr);
        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);

        uint16x8_t d0 = vhaddq_u16(s0, s1);
        d0 = vhaddq_u16(d0, vert_offset);

        vst1q_u16(dst_ptr, d0);

        im_ptr += im_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      im += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
2079