/*
 *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/highbd_convolve8_neon.h"
#include "vpx_dsp/arm/highbd_convolve8_sve.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"

// clang-format off
DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 0, 5, 6, 7, 4,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 0, 1, 6, 7, 4, 5,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 0, 1, 2, 7, 4, 5, 6,
};
// clang-format on
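// These indices select 16-bit lanes from a pair of vectors treated as one
// concatenated lookup table (via vpx_tbl2_s16). Entries that must come from
// the second vector hold placeholder values here; they are offset by the
// actual SVE vector length at run time (see the svcnth() corrections below).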
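// Permutation selecting the even-indexed lanes followed by the odd-indexed
// lanes; assumed to be used by highbd_convolve4_8_sve() to restore output
// ordering after interleaved dot products.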
DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
                                                               1, 3, 5, 7 };

static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1,
                                        const int16x4_t s2, const int16x4_t s3,
                                        int16x8_t res[2]) {
  // Transpose 16-bit elements:
  // s0: 00, 01, 02, 03
  // s1: 10, 11, 12, 13
  // s2: 20, 21, 22, 23
  // s3: 30, 31, 32, 33
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33

  int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
  int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
  int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
  int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));

  int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
  int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));

  int32x4x2_t t0123 = vzipq_s32(s01, s23);

  res[0] = vreinterpretq_s16_s32(t0123.val[0]);
  res[1] = vreinterpretq_s16_s32(t0123.val[1]);
}

static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1,
                                        const int16x8_t s2, const int16x8_t s3,
                                        int16x8_t res[4]) {
  // Transpose 16-bit elements:
  // s0: 00, 01, 02, 03, 04, 05, 06, 07
  // s1: 10, 11, 12, 13, 14, 15, 16, 17
  // s2: 20, 21, 22, 23, 24, 25, 26, 27
  // s3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33
  // res[2]: 04 14 24 34 05 15 25 35
  // res[3]: 06 16 26 36 07 17 27 37

  int16x8x2_t s01 = vzipq_s16(s0, s1);
  int16x8x2_t s23 = vzipq_s16(s2, s3);

  int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]),
                                   vreinterpretq_s32_s16(s23.val[0]));
  int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]),
                                   vreinterpretq_s32_s16(s23.val[1]));

  res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]);
  res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]);
  res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]);
  res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]);
}

static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4],
                                  int16x8_t res[4], uint16x8_t idx) {
  res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
  res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
  res[2] = vpx_tbl2_s16(s0[2], s1[2], idx);
  res[3] = vpx_tbl2_s16(s0[3], s1[3], idx);
}

static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2],
                                  int16x8_t res[2], uint16x8_t idx) {
  res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
  res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
}
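// Vertical 8-tap filter for a block of four columns. Each
// vpx_dotq_lane_s16() call accumulates four s16 products into each 64-bit
// lane, so two calls per accumulator (filter lane 0 covering taps 0-3, lane
// 1 covering taps 4-7) sum all eight taps.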
static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2],
                                              int16x8_t s_hi[2],
                                              int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
  sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);

  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
  sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));

  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
  return vmin_u16(res, max);
}

static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4],
                                              const int16x8_t s_hi[4],
                                              const int16x8_t filter,
                                              const uint16x8_t max) {
  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
  sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);

  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
  sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);

  int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0);
  sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1);

  int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0);
  sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));
  return vminq_u16(res, max);
}

static INLINE void highbd_convolve8_8tap_vert_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
  assert(w >= 4 && h >= 4);
  uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);

  // Correct the table indices for the actual SVE vector length: entries that
  // must select from the second table vector are offset by svcnth(), the
  // number of 16-bit lanes in one SVE vector.
  merge_tbl_idx.val[0] = vaddq_u16(
      merge_tbl_idx.val[0],
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
  merge_tbl_idx.val[1] = vaddq_u16(
      merge_tbl_idx.val[1],
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
  merge_tbl_idx.val[2] = vaddq_u16(
      merge_tbl_idx.val[2],
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
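  // For example, with svcnth() == 8 (128-bit vectors) the first row of the
  // table becomes { 1, 2, 3, 8, 5, 6, 7, 12 }, so lanes 3 and 7 index into
  // the second table vector, whose lanes start at index svcnth().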

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_4x4(s3, s4, s5, s6, s3456);

    do {
      int16x4_t s7, s8, s9, sA;

      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);

      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
      transpose_concat_4x4(s7, s8, s9, sA, s789A);

      vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
      vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
      vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);

      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max);
      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max);
      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max);
      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
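      // Prepare the block for the next iteration, reusing as much of the
      // already-transposed data as possible.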
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_8x4(s0, s1, s2, s3, s0123);
      transpose_concat_8x4(s1, s2, s3, s4, s1234);
      transpose_concat_8x4(s2, s3, s4, s5, s2345);
      transpose_concat_8x4(s3, s4, s5, s6, s3456);

      do {
        int16x8_t s7, s8, s9, sA;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);

        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
        transpose_concat_8x4(s7, s8, s9, sA, s789A);

        vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
        vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
        vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);

        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max);
        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filter, max);
        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max);
        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
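        // Prepare the block for the next iteration, reusing as much of the
        // already-transposed data as possible.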
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];
        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];
        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];
        s3456[0] = s789A[0];
        s3456[1] = s789A[1];
        s3456[2] = s789A[2];
        s3456[3] = s789A[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
    vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter,
                                   x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                   bd);
  } else {
    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
    highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst,
                                    dst_stride, w, h, y_filter_8tap, bd);
  }
}

void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  src -= 3 * src_stride;

  uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);

  // Correct the table indices for the actual SVE vector length: entries that
  // must select from the second table vector are offset by svcnth(), the
  // number of 16-bit lanes in one SVE vector.
  merge_tbl_idx.val[0] = vaddq_u16(
      merge_tbl_idx.val[0],
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
  merge_tbl_idx.val[1] = vaddq_u16(
      merge_tbl_idx.val[1],
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
  merge_tbl_idx.val[2] = vaddq_u16(
      merge_tbl_idx.val[2],
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_4x4(s3, s4, s5, s6, s3456);

    do {
      int16x4_t s7, s8, s9, sA;

      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);

      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
      transpose_concat_4x4(s7, s8, s9, sA, s789A);

      vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
      vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
      vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);

      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max);
      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max);
      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max);
      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
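      // Prepare the block for the next iteration, reusing as much of the
      // already-transposed data as possible.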
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_8x4(s0, s1, s2, s3, s0123);
      transpose_concat_8x4(s1, s2, s3, s4, s1234);
      transpose_concat_8x4(s2, s3, s4, s5, s2345);
      transpose_concat_8x4(s3, s4, s5, s6, s3456);

      do {
        int16x8_t s7, s8, s9, sA;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);

        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
        transpose_concat_8x4(s7, s8, s9, sA, s789A);

        vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
        vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
        vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);

        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max);
        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max);
        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max);
        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max);

        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
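        // Prepare the block for the next iteration, reusing as much of the
        // already-transposed data as possible.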
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];
        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];
        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];
        s3456[0] = s789A[0];
        s3456[1] = s789A[1];
        s3456[2] = s789A[2];
        s3456[3] = s789A[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
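// 2D convolution for 4-tap filters: each row is filtered horizontally with
// the SVE 4-tap helpers, and the intermediate rows are kept in registers for
// the vertical pass, avoiding a full intermediate buffer.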
static INLINE void highbd_convolve_2d_4tap_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filters,
    const int16x4_t y_filters, int bd) {
  const int16x8_t x_filter = vcombine_s16(x_filters, vdup_n_s16(0));

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[4], h_s1[4], h_s2[4];
    load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
    load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
    load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);

    int16x4_t v_s0 =
        vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s0, x_filter, max));
    int16x4_t v_s1 =
        vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s1, x_filter, max));
    int16x4_t v_s2 =
        vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s2, x_filter, max));

    s += 3 * src_stride;

    do {
      int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
      load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
                   &h_s3[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
                   &h_s4[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
                   &h_s5[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
                   &h_s6[3]);

      int16x4_t v_s3 =
          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s3, x_filter, max));
      int16x4_t v_s4 =
          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s4, x_filter, max));
      int16x4_t v_s5 =
          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s5, x_filter, max));
      int16x4_t v_s6 =
          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s6, x_filter, max));

      uint16x4_t d0 =
          highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filters, max);
      uint16x4_t d1 =
          highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filters, max);
      uint16x4_t d2 =
          highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filters, max);
      uint16x4_t d3 =
          highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filters, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    const uint16x8_t idx = vld1q_u16(kTblConv4_8);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t h_s0[4], h_s1[4], h_s2[4];
      load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2],
                   &h_s0[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2],
                   &h_s1[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2],
                   &h_s2[3]);

      int16x8_t v_s0 = vreinterpretq_s16_u16(
          highbd_convolve4_8_sve(h_s0, x_filter, max, idx));
      int16x8_t v_s1 = vreinterpretq_s16_u16(
          highbd_convolve4_8_sve(h_s1, x_filter, max, idx));
      int16x8_t v_s2 = vreinterpretq_s16_u16(
          highbd_convolve4_8_sve(h_s2, x_filter, max, idx));

      s += 3 * src_stride;

      do {
        int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
        load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
                     &h_s3[3]);
        load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
                     &h_s4[3]);
        load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
                     &h_s5[3]);
        load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
                     &h_s6[3]);

        int16x8_t v_s3 = vreinterpretq_s16_u16(
            highbd_convolve4_8_sve(h_s3, x_filter, max, idx));
        int16x8_t v_s4 = vreinterpretq_s16_u16(
            highbd_convolve4_8_sve(h_s4, x_filter, max, idx));
        int16x8_t v_s5 = vreinterpretq_s16_u16(
            highbd_convolve4_8_sve(h_s5, x_filter, max, idx));
        int16x8_t v_s6 = vreinterpretq_s16_u16(
            highbd_convolve4_8_sve(h_s6, x_filter, max, idx));

        uint16x8_t d0 =
            highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filters, max);
        uint16x8_t d1 =
            highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filters, max);
        uint16x8_t d2 =
            highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filters, max);
        uint16x8_t d3 =
            highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filters, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        v_s0 = v_s4;
        v_s1 = v_s5;
        v_s2 = v_s6;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE void highbd_convolve8_2d_horiz_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4,
    int y0_q4, int y_step_q4, int w, int h, int bd) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);
  assert(h % 4 == 3 && h >= 7);
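  // h here is the intermediate height (output height + SUBPEL_TAPS - 1).
  // The output height is a multiple of four, so this pass filters rows in
  // groups of four plus a final group of three.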

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  const int16x8_t filters = vld1q_s16(filter[x0_q4]);

  src -= 3;

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x8_t s0[4], s1[4], s2[4], s3[4];
      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
      uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
      uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
      uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 3);

    // Process final three rows (h % 4 == 3).
    int16x8_t s0[4], s1[4], s2[4];
    load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
    load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
    load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);

    uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
    uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
    uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);

    store_u16_4x3(d, dst_stride, d0, d1, d2);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
        uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
        uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
        uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 3);

    // Process final three rows (h % 4 == 3).
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int width = w;

    do {
      int16x8_t s0[8], s1[8], s2[8];
      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);

      uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
      uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
      uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);

      store_u16_8x3(d, dst_stride, d0, d1, d2);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
  }
}

void vpx_highbd_convolve8_sve2(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h, int bd) {
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                           x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);

  const int horiz_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;

  if (horiz_filter_taps == 4 || vert_filter_taps == 4) {
    const ptrdiff_t horiz_offset = horiz_filter_taps / 2 - 1;
    const ptrdiff_t vert_offset = (vert_filter_taps / 2 - 1) * src_stride;
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);

    highbd_convolve_2d_4tap_sve2(src - horiz_offset - vert_offset, src_stride,
                                 dst, dst_stride, w, h, x_filter, y_filter, bd);
    return;
  }

  // Given the constraints (w <= 64, h <= 64, taps <= 8), the maximum
  // intermediate buffer size needed is 64 * (64 + 7).
  DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
  const int im_stride = 64;

  // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior
  // and SUBPEL_TAPS / 2 lines post.
  const int im_height = h + SUBPEL_TAPS - 1;
  const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;

  highbd_convolve8_2d_horiz_sve2(src - src_stride * border_offset, src_stride,
                                 im_block, im_stride, filter, x0_q4, x_step_q4,
                                 y0_q4, y_step_q4, w, im_height, bd);

  // Step border_offset rows into the temporary buffer to reach the actual
  // frame data.
  vpx_highbd_convolve8_vert_sve2(im_block + im_stride * border_offset,
                                 im_stride, dst, dst_stride, filter, x0_q4,
                                 x_step_q4, y0_q4, y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_sve2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);

  // Given the constraints (w <= 64, h <= 64, taps <= 8), the maximum
  // intermediate buffer size needed is 64 * (64 + 7).
  DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
  const int im_stride = 64;

  // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior
  // and SUBPEL_TAPS / 2 lines post.
  const int im_height = h + SUBPEL_TAPS - 1;
  const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;

  highbd_convolve8_2d_horiz_sve2(src - src_stride * border_offset, src_stride,
                                 im_block, im_stride, filter, x0_q4, x_step_q4,
                                 y0_q4, y_step_q4, w, im_height, bd);

  // Step border_offset rows into the temporary buffer to reach the actual
  // frame data.
  vpx_highbd_convolve8_avg_vert_sve2(im_block + im_stride * border_offset,
                                     im_stride, dst, dst_stride, filter, x0_q4,
                                     x_step_q4, y0_q4, y_step_q4, w, h, bd);
}
789