/*
 * Copyright © 2024, Arm Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 32, lsl #8        // 8192
#define PREP_BIAS_NEG 224, lsl #8   // -8192
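// Both constants are written as an 8-bit immediate with `lsl #8` because the
// SVE `sub` (immediate) and `movi` encodings that consume them only accept an
// 8-bit value with an optional left shift by 8.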

#if HAVE_SVE2
ENABLE_SVE
ENABLE_SVE2

// No spaces in these expressions, due to gas-preprocessor. Each index is
// biased by -1 so that the subtraction is already folded in when computing
// the address of `mc_subpel_filters`.
#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
#define SHARP1          (((2*15-1)<<7)|(3*15-1))
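
// Illustrative C model (an assumption, not part of the sources) of the
// madd/ubfx/and/csel sequence below, which decodes one of these packed
// constants plus the subpel position mx (1..15) into rows of the flattened
// filter table; the horizontal case with REGULAR1 is shown:
//
//     uint32_t sum  = mx * 0x4081 + REGULAR1;  // 0x4081 = (1<<14)|(1<<7)|1
//     int      row8 = (sum >> 7) & 0x7F;       // 8-tap row: 0*15 + (mx - 1)
//     int      row4 = sum & 0x7F;              // 4-tap row: 3*15 + (mx - 1)
//     const int8_t *filter =
//             (const int8_t *)mc_subpel_filters + 8 * (w <= 4 ? row4 : row8);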

#define FUNC_ALIGN      2
#define JUMP_ALIGN      2
#define LOOP_ALIGN      2


// Shuffle indices to permute horizontal samples in preparation for input to
// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
// indices in the interval [-3, 4] relative to the current sample position.
const h_tbl_sve, align=4
        .byte  0,  1,  2,  3,  4,  5,  6,  7,   2,  3,  4,  5,  6,  7,  8,  9
        .byte  4,  5,  6,  7,  8,  9, 10, 11,   6,  7,  8,  9, 10, 11, 12, 13
endconst
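
// Each indexed 16-bit SDOT accumulates, per 64-bit lane, a dot product of
// four consecutive samples with four taps: index [0] covers taps 0-3 and
// index [1] taps 4-7, so for the 8-tap case two SDOTs over the shuffled
// windows above form the full sum for output pixel x:
//     acc  = s[x+0]*f[0] + s[x+1]*f[1] + s[x+2]*f[2] + s[x+3]*f[3]
//     acc += s[x+4]*f[4] + s[x+5]*f[5] + s[x+6]*f[6] + s[x+7]*f[7]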

// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
// registers contain a transposed 4x4 matrix of values. Subsequent iterations
// of the vertical convolution can reuse the 3x4 sub-matrix from the previous
// loop iteration. These shuffle indices shift and merge this 4x4 matrix with
// the values of a new line.
const v_tbl_sve, align=4
        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 24, 25
        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 18, 19
        .byte  2,  3,  4,  5,  6,  7, 20, 21,  10, 11, 12, 13, 14, 15, 22, 23
        .byte  2,  3,  4,  5,  6,  7, 24, 25,  10, 11, 12, 13, 14, 15, 26, 27
        .byte  2,  3,  4,  5,  6,  7, 28, 29,  10, 11, 12, 13, 14, 15, 30, 31
endconst
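
// For example, when a 64-bit lane of the matrix holds the column
// {s0, s1, s2, s3}, applying one of the rows above via `tbl` yields
// {s1, s2, s3, n}, where n is the matching sample of the newly loaded line,
// taken from the second source register of the table lookup.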


.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN
        mov             x9,  \type_h
        mov             x10, \type_v
    .if \jump
        b               \op\()_8tap_\isa
    .endif
endfunc
.endm

.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0

function \type\()_8tap_\isa, align=FUNC_ALIGN
        clz             w8, \w
        mov             w11, #0x4081                    // (1<<14) | (1<<7) | 1
        ptrue           p0.b, vl16
        sub             w8, w8, #24                     // for jump tables
        movrel          x12, X(mc_subpel_filters)
        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
.ifc \type, prep
        cbz             \my, prep_sve
.else   // put
        cbnz            \my, L(\type\()_8tap_v_\isa)
        mov             w9, w8
        b               X(put_16bpc_neon)

        .align JUMP_ALIGN
.endif

L(\type\()_8tap_v_\isa):
        madd            \my, \my, w11, w10
        movrel          x13, v_tbl_sve
.ifc \bdmax, w8                                         // put case: load bdmax
        ld1r            {v5.8h}, [sp]                   // directly, not via w8
.endif
        sub             \src, \src, \s_strd             // src - s_strd
        ubfx            w11, \my, #7, #7
        and             \my, \my, #0x7F
        ldr             q6, [x13]
        cmp             \h, #4
        csel            \my, \my, w11, le
        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd
        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
        ldp             q28, q29, [x13, #16]
        ld1sb           {z7.h}, p0/z, [\xmy]
.ifc \type, prep
        clz             \bdmax, \bdmax
        sub             \bdmax, \bdmax, #24
        dup             v5.4s, \bdmax
.endif
        cmp             \w, #8
        b.lt            40f

        // .align JUMP_ALIGN   // fallthrough
80:     // V - 8xN+
        ldp             q30, q31, [x13, #48]
.ifc \type, prep
        add             \wd_strd, \w, \w                // d_strd = 2 * w
.endif
        .align LOOP_ALIGN
81:
        add             \lsrc, \src, \s_strd, lsl #1

        ldr             q16, [\src]
        ldr             q17, [\src, \s_strd]
        ldr             q18, [\lsrc]
        ldr             q19, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        mov             \ldst, \dst

        ldr             q20, [\lsrc]
        ldr             q21, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        ldr             q22, [\lsrc]
        ldr             q23, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        sub             w8, \h, #1

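        // Interleave the eight loaded rows into transposed 4x4 tiles so that
        // each 64-bit sdot lane sees four vertically adjacent samples.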
        zip1            v0.8h, v16.8h, v17.8h
        zip2            v1.8h, v16.8h, v17.8h
        zip1            v2.8h, v18.8h, v19.8h
        zip2            v3.8h, v18.8h, v19.8h

        zip1            v18.8h, v20.8h, v21.8h
        zip2            v21.8h, v20.8h, v21.8h
        zip1            v24.8h, v22.8h, v23.8h
        zip2            v27.8h, v22.8h, v23.8h

        zip1            v16.4s, v0.4s, v2.4s
        zip2            v19.4s, v0.4s, v2.4s
        zip1            v22.4s, v1.4s, v3.4s
        zip2            v25.4s, v1.4s, v3.4s

        zip1            v17.4s, v18.4s, v24.4s
        zip2            v20.4s, v18.4s, v24.4s
        zip1            v23.4s, v21.4s, v27.4s
        zip2            v26.4s, v21.4s, v27.4s

        .align LOOP_ALIGN
8:
        ld1             {v18.16b}, [\lsrc], \s_strd

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        mov             v21.16b, v18.16b
        mov             v24.16b, v18.16b
        mov             v27.16b, v18.16b

        sdot            z0.d, z16.h, z7.h[0]
        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
        sdot            z2.d, z22.h, z7.h[0]
        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
        subs            w8, w8, #1
        sdot            z3.d, z25.h, z7.h[0]
        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b

        sdot            z0.d, z17.h, z7.h[1]
        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
        sdot            z1.d, z20.h, z7.h[1]
        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
        sdot            z2.d, z23.h, z7.h[1]
        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
        sdot            z3.d, z26.h, z7.h[1]
        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun2       v0.8h, v1.4s, #6
        umin            v0.8h, v0.8h, v5.8h
.endif
        st1             {v0.16b}, [\ldst], \d_strd
        b.gt            8b

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0

        sdot            z0.d, z16.h, z7.h[0]
        sdot            z1.d, z19.h, z7.h[0]
        sdot            z2.d, z22.h, z7.h[0]
        sdot            z3.d, z25.h, z7.h[0]

        sdot            z0.d, z17.h, z7.h[1]
        sdot            z1.d, z20.h, z7.h[1]
        sdot            z2.d, z23.h, z7.h[1]
        sdot            z3.d, z26.h, z7.h[1]
        subs            \w, \w, #8

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun2       v0.8h, v1.4s, #6
        umin            v0.8h, v0.8h, v5.8h
.endif
        str             q0, [\ldst]

        add             \dst, \dst, #16
        add             \src, \src, #16
        b.gt            81b
        ret

        .align JUMP_ALIGN
40:     // V - 4xN, put only: 2xN
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        cmp             \h, #4
        b.le            44f

        ldr             d16, [\src]
        ldr             d17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d18, [\src]
        ldr             d19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             d20, [\src]
        ldr             d21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d22, [\src]
        ldr             d23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        sub             \h, \h, #2

        zip1            v0.8h, v16.8h, v17.8h
        zip1            v2.8h, v18.8h, v19.8h
        zip1            v18.8h, v20.8h, v21.8h
        zip1            v24.8h, v22.8h, v23.8h

        zip1            v16.4s, v0.4s, v2.4s
        zip2            v19.4s, v0.4s, v2.4s
        zip1            v17.4s, v18.4s, v24.4s
        zip2            v20.4s, v18.4s, v24.4s

        .align LOOP_ALIGN
4:
        ldr             d18, [\src]
        ldr             d24, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        mov             v21.16b, v18.16b
        mov             v27.16b, v24.16b

        sdot            z0.d, z16.h, z7.h[0]
        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
        sdot            z0.d, z17.h, z7.h[1]
        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
        sdot            z1.d, z20.h, z7.h[1]
        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
        subs            \h, \h, #2

        sdot            z2.d, z22.h, z7.h[0]
        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
        sdot            z3.d, z25.h, z7.h[0]
        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
        sdot            z2.d, z23.h, z7.h[1]
        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
        sdot            z3.d, z26.h, z7.h[1]
        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst], #16
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun        v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add             \dst, \dst, \d_strd, lsl #2
.endif
        b.gt            4b

        ldr             d18, [\src]

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        mov             v21.16b, v18.16b

        sdot            z0.d, z16.h, z7.h[0]
        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
        sdot            z0.d, z17.h, z7.h[1]
        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
        sdot            z1.d, z20.h, z7.h[1]
        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b

        sdot            z2.d, z22.h, z7.h[0]
        sdot            z3.d, z25.h, z7.h[0]
        sdot            z2.d, z23.h, z7.h[1]
        sdot            z3.d, z26.h, z7.h[1]

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst]
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun        v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
        ret

        .align JUMP_ALIGN
44:     // V - 4x4, put only: 4x2, 2x4, 2x2
        add             \src, \src, \s_strd, lsl #1     // src - s_strd
        subs            \h, \h, #2

        ldr             d16, [\src]
        ldr             d17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d18, [\src]
        ldr             d19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ext             v7.16b, v7.16b, v7.16b, #4      // [\xmy + 2 * 2]

        zip1            v0.8h, v16.8h, v17.8h
        zip1            v2.8h, v18.8h, v19.8h
        zip1            v16.4s, v0.4s, v2.4s
        zip2            v19.4s, v0.4s, v2.4s

.ifc \type, put
        b.eq            42f
.endif
        ldr             d17, [\src]
        ldr             d23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        mov             v20.16b, v17.16b
        mov             v26.16b, v23.16b

        sdot            z0.d, z16.h, z7.h[0]
        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b
        sdot            z2.d, z22.h, z7.h[0]
        tbl             v16.16b, {v22.16b, v23.16b}, v28.16b
        sdot            z3.d, z25.h, z7.h[0]
        tbl             v19.16b, {v25.16b, v26.16b}, v29.16b

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst], #16
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun        v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add             \dst, \dst, \d_strd, lsl #2
.endif

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        ldr             d17, [\src]

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        mov             v20.16b, v17.16b

        sdot            z0.d, z16.h, z7.h[0]
        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b

        sdot            z2.d, z22.h, z7.h[0]
        sdot            z3.d, z25.h, z7.h[0]

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst]
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun        v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
        ret

        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd            \mx, \mx, w11, w9
        movrel          x13, h_tbl_sve
        sub             \src, \src, #6              // src - 3 * 2
        ubfx            w9, \mx, #7, #7
        and             \mx, \mx, #0x7F
        cmp             \w, #4
        csel            \mx, \mx, w9, le
        ldp             q30, q31, [x13]
        add             \xmx, x12, \xmx, lsl #3     // subpel H filter address
        cbz             \my, L(\type\()_8tap_h_\isa)

        // HV cases
        madd            w14, \my, w11, w10
.ifc \bdmax, w8
        ldr             \bdmax, [sp]
.endif
        ubfx            w11, w14, #7, #7
        and             w14, w14, #0x7F
        ld1sb           {z4.h}, p0/z, [\xmx]
        cmp             \h, #4
        csel            w14, w14, w11, le
.ifc \type, put
        dup             v29.8h, \bdmax
.endif
        clz             \bdmax, \bdmax
        add             \xmy, x12, x14, lsl #3      // subpel V filter address
        ld1sb           {z7.h}, p0/z, [\xmy]
.ifc \type, put
        mov             w9, #12
        sub             w9, w9, \bdmax
        dup             v6.4s, w9
.endif
        sub             \bdmax, \bdmax, #24
        mov             x15, x30
        sub             \src, \src, \s_strd         // src - s_strd - 3 * 2
        dup             v5.4s, \bdmax
        cmp             w10, SHARP1
        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1

        // HV 8-tap cases
        cmp             \w, #4
        b.le            40f

        // .align JUMP_ALIGN    // fallthrough
80:     // HV8 - 8xN+
.ifc \type, prep
        add             \wd_strd, \w, \w                // d_strd = 2 * w
.endif
        cmp             \h, #4
        b.le            84f
        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd - 3 * 2

        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v16.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v20.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v21.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v22.8h, v23.8h, v24.8h

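        // v16-v22 now hold the first seven horizontally filtered rows; each
        // loop iteration filters one more row and convolves vertically.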
        .align LOOP_ALIGN
8:
        ldp             q24, q28, [\lsrc]
        smull           v0.4s, v16.4h, v7.h[0]
        smull2          v1.4s, v16.8h, v7.h[0]
        mov             v16.16b, v17.16b

        movi            v2.2d, #0
        movi            v3.2d, #0
        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b

        ldur            q26, [\lsrc, #8]
        smlal           v0.4s, v17.4h, v7.h[1]
        smlal2          v1.4s, v17.8h, v7.h[1]
        mov             v17.16b, v18.16b
        add             \lsrc, \lsrc, \s_strd

        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]
        movi            v23.2d, #0
        movi            v24.2d, #0
        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal2          v1.4s, v18.8h, v7.h[2]
        mov             v18.16b, v19.16b

        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]
        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        smlal           v0.4s, v19.4h, v7.h[3]
        smlal2          v1.4s, v19.8h, v7.h[3]
        mov             v19.16b, v20.16b

        subs            w8, w8, #1
        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal2          v1.4s, v20.8h, v7.h[4]
        mov             v20.16b, v21.16b

        uzp1            v3.4s, v2.4s, v3.4s
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v21.4h, v7.h[5]
        smlal2          v1.4s, v21.8h, v7.h[5]
        mov             v21.16b, v22.16b

        srshl           v23.4s, v3.4s, v5.4s
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v22.4h, v7.h[6]
        smlal2          v1.4s, v22.8h, v7.h[6]

        uzp1            v22.8h, v23.8h, v24.8h
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s
        srshl           v1.4s, v1.4s, v6.4s
        sqxtun          v0.4h, v0.4s
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b

        subs            \w, \w, #8
        add             \src, \src, #16
        add             \dst, \dst, #16
        b.gt            81b
        ret             x15

        .align JUMP_ALIGN
40:     // HV8 - 4xN, put only: 2xN
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]
        add             \src, \src, #4

        cmp             \h, #4
        b.le            44f

        sub             \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v16.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v17.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v18.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v19.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v20.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v21.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v22.4h, v0.4s

        .align LOOP_ALIGN
4:
        ld1             {v3.16b}, [\src], \s_strd

        smull           v24.4s, v16.4h, v7.h[0]
        smlal           v24.4s, v17.4h, v7.h[1]
        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v24.4s, v18.4h, v7.h[2]
        smlal           v24.4s, v19.4h, v7.h[3]
        sdot            z0.d, z2.h, z4.h[0]
        sdot            z1.d, z3.h, z4.h[0]
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        uzp1            v0.4s, v0.4s, v1.4s

        smlal           v24.4s, v20.4h, v7.h[4]
        smlal           v24.4s, v21.4h, v7.h[5]
        srshl           v0.4s, v0.4s, v5.4s
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        subs            \h, \h, #1
        smlal           v24.4s, v22.4h, v7.h[6]
        xtn             v22.4h, v0.4s
        smlal           v24.4s, v22.4h, v7.h[7]

.ifc \type, prep
        rshrn           v0.4h, v24.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
        str             d0, [\dst], #8
.else   // put
        srshl           v0.4s, v24.4s, v6.4s
        sqxtun          v0.4h, v0.4s
        umin            v0.4h, v0.4h, v29.4h
        st1h            {z0.h}, p1, [\dst]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret             x15

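        // The regular and smooth vertical filters have zero-valued outer
        // taps, so they can be applied as 6-tap filters using taps 1-6 only.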
        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp             \w, #4
        b.le            46f

        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
.ifc \type, prep
        add             \wd_strd, \w, \w        // d_strd = 2 * w
.endif
        cmp             \h, #4
        b.le            84f
        sub             \src, \src, \s_strd     // src - 2 * s_strd - 3 * 2

        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v16.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v20.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
8:
        ldp             q24, q28, [\lsrc]

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
        mov             v16.16b, v17.16b

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0

        ldur            q26, [\lsrc, #8]
        add             \lsrc, \lsrc, \s_strd

        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]
        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0

        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]
        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        uzp1            v3.4s, v2.4s, v3.4s
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b

        srshl           v23.4s, v3.4s, v5.4s
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]

        subs            w8, w8, #1
        uzp1            v20.8h, v23.8h, v24.8h
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s
        srshl           v1.4s, v1.4s, v6.4s
        sqxtun          v0.4h, v0.4s
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b

        add             \dst, \dst, #16
        subs            \w, \w, #8
        add             \src, \src, #16
        b.gt            81b
        ret             x15

        .align LOOP_ALIGN
84:     // HV4 - 8x4, 8x2
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
81:
        ldp             q24, q28, [\lsrc]
        ldur            q26, [\lsrc, #8]
        add             \lsrc, \lsrc, \s_strd

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0
        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]

        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0
        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]

        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smull           v0.4s, v17.4h, v7.h[2]
        smull2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        subs            w8, w8, #1
        uzp1            v3.4s, v2.4s, v3.4s
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        srshl           v23.4s, v3.4s, v5.4s
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]

        uzp1            v19.8h, v23.8h, v24.8h
        smlal           v0.4s, v19.4h, v7.h[5]
        smlal2          v1.4s, v19.8h, v7.h[5]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s
        srshl           v1.4s, v1.4s, v6.4s
        sqxtun          v0.4h, v0.4s
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            81b

        subs            \w, \w, #8
        add             \dst, \dst, #16
        add             \src, \src, #16
        b.gt            84b
        ret             x15

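        // Horizontally filter one row of eight samples; returns two 4-wide
        // 32-bit results in v23/v24, which callers pack with uzp1.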
        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ldp             q24, q28, [\lsrc]
        ldur            q26, [\lsrc, #8]
        add             \lsrc, \lsrc, \s_strd

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0
        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]

        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0
        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]

        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        uzp1            v3.4s, v2.4s, v3.4s
        uzp1            v24.4s, v23.4s, v24.4s
        srshl           v23.4s, v3.4s, v5.4s
        srshl           v24.4s, v24.4s, v5.4s
        ret

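        // Horizontally filter one row of four samples; returns the 32-bit
        // results in v0.4s.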
        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1             {v3.16b}, [\src], \s_strd

        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        sdot            z0.d, z2.h, z4.h[0]
        sdot            z1.d, z3.h, z4.h[0]

        uzp1            v0.4s, v0.4s, v1.4s
        srshl           v0.4s, v0.4s, v5.4s
        ret

        .align JUMP_ALIGN
46:     // H4V6 - 4xN, put only: 2xN
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]
        add             \src, \src, #4

        cmp             \h, #4
        b.le            44f

        sub             \src, \src, \s_strd         // src - 2 * s_strd - 3 * 2
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v16.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v17.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v18.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v19.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v20.4h, v0.4s

        .align LOOP_ALIGN
4:
        ld1             {v3.16b}, [\src], \s_strd
        smull           v24.4s, v16.4h, v7.h[1]
        smlal           v24.4s, v17.4h, v7.h[2]

        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        sdot            z0.d, z2.h, z4.h[0]
        sdot            z1.d, z3.h, z4.h[0]

        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        smlal           v24.4s, v18.4h, v7.h[3]
        smlal           v24.4s, v19.4h, v7.h[4]
        uzp1            v0.4s, v0.4s, v1.4s

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        subs            \h, \h, #1
        srshl           v0.4s, v0.4s, v5.4s
        smlal           v24.4s, v20.4h, v7.h[5]
        xtn             v20.4h, v0.4s
        smlal           v24.4s, v20.4h, v7.h[6]

.ifc \type, prep
        rshrn           v0.4h, v24.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
        str             d0, [\dst], #8
.else   // put
        srshl           v0.4s, v24.4s, v6.4s
        sqxtun          v0.4h, v0.4s
        umin            v0.4h, v0.4h, v29.4h
        st1h            {z0.h}, p1, [\dst]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret             x15

        .align JUMP_ALIGN
44:     // H4V4 - 4x4, put only: 4x2, 2x4, 2x2
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v17.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v18.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v19.4h, v0.4s

        .align LOOP_ALIGN
4:
        ld1             {v3.16b}, [\src], \s_strd
        smull           v24.4s, v17.4h, v7.h[2]
        smlal           v24.4s, v18.4h, v7.h[3]

        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        sdot            z0.d, z2.h, z4.h[0]
        sdot            z1.d, z3.h, z4.h[0]
        uzp1            v0.4s, v0.4s, v1.4s

        mov             v17.16b, v18.16b
        mov             v18.16b, v19.16b
        subs            \h, \h, #1
        srshl           v0.4s, v0.4s, v5.4s
        smlal           v24.4s, v19.4h, v7.h[4]
        xtn             v19.4h, v0.4s
        smlal           v24.4s, v19.4h, v7.h[5]

.ifc \type, prep
        rshrn           v0.4h, v24.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
        str             d0, [\dst], #8
.else   // put
        srshl           v0.4s, v24.4s, v6.4s
        sqxtun          v0.4h, v0.4s
        umin            v0.4h, v0.4h, v29.4h
        st1h            {z0.h}, p1, [\dst]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret             x15

        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        movrel          x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw           x12, [x11, x8, lsl #2]
.ifc \bdmax, w8
        ldr             \bdmax, [sp]
.endif
.ifc \type, prep
        clz             \bdmax, \bdmax
        sub             \bdmax, \bdmax, #24
        dup             v5.4s, \bdmax
.else   // put
        mov             w9, #34             // rounding for 10-bit case
        mov             w10, #40            // rounding for 12-bit case
        cmp             \bdmax, #0xFFF
        csel            w9, w9, w10, ne     // select rounding based on \bdmax
        dup             v5.8h, \bdmax
        dup             v6.2d, x9
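        // 34/40 appear to match the C code's intermediate rounding constant,
        // 32 + ((1 << (6 - intermediate_bits)) >> 1); preloading it into the
        // sdot accumulators lets the final `sqshrun #6` skip a rounding add.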
.endif
        add             x11, x11, x12
        ld1sb           {z4.h}, p0/z, [\xmx]
        br              x11

        .align JUMP_ALIGN
20:     // H - 4xN, put only: 2xN
40:
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #4              // src - 1 * 2
        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        .align LOOP_ALIGN
4:
        ldr             q17, [\src]
        ldr             q19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

.ifc \type, prep
        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
.else
        mov             v0.16b, v6.16b
        mov             v1.16b, v6.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z1.d, z17.h, z4.h[0]
        subs            \h, \h, #2
        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
        sdot            z2.d, z18.h, z4.h[0]
        sdot            z3.d, z19.h, z4.h[0]

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst], #16
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add             \dst, \dst, \d_strd, lsl #2
.endif
        b.gt            4b
        ret

        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
8:
        ldp             q17, q21, [\src]
        ldur            q19, [\src, #8]

.ifc \type, prep
        movi            v0.2d, #0
        movi            v2.2d, #0
.else
        mov             v0.16b, v6.16b
        mov             v2.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        add             \src, \src, \s_strd
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z2.d, z17.h, z4.h[0]

        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi            v16.2d, #0
        movi            v17.2d, #0
.else
        mov             v16.16b, v6.16b
        mov             v17.16b, v6.16b
.endif
        ldp             q23, q27, [\src]
        ldur            q25, [\src, #8]

        sdot            z16.d, z18.h, z4.h[0]
        sdot            z17.d, z19.h, z4.h[0]

        tbl             v22.16b, {v23.16b}, v30.16b
        tbl             v23.16b, {v23.16b}, v31.16b
.ifc \type, prep
        movi            v1.2d, #0
        movi            v3.2d, #0
.else
        mov             v1.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        add             \src, \src, \s_strd
        sdot            z1.d, z22.h, z4.h[0]
        sdot            z3.d, z23.h, z4.h[0]

        tbl             v24.16b, {v25.16b}, v30.16b
        tbl             v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi            v22.2d, #0
        movi            v23.2d, #0
.else
        mov             v22.16b, v6.16b
        mov             v23.16b, v6.16b
.endif
        sdot            z22.d, z24.h, z4.h[0]
        sdot            z23.d, z25.h, z4.h[0]

        tbl             v20.16b, {v21.16b}, v30.16b
        tbl             v21.16b, {v21.16b}, v31.16b
        sdot            z0.d, z18.h, z4.h[1]
        sdot            z2.d, z19.h, z4.h[1]
        tbl             v26.16b, {v27.16b}, v30.16b
        tbl             v27.16b, {v27.16b}, v31.16b
        sdot            z16.d, z20.h, z4.h[1]
        sdot            z17.d, z21.h, z4.h[1]

        sdot            z1.d, z24.h, z4.h[1]
        sdot            z3.d, z25.h, z4.h[1]

        sdot            z22.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]

        subs            \h, \h, #2
        uzp1            v0.4s, v0.4s, v2.4s
        uzp1            v2.4s, v16.4s, v17.4s
        uzp1            v1.4s, v1.4s, v3.4s
        uzp1            v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v2.4s, v2.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        srshl           v3.4s, v3.4s, v5.4s
        uzp1            v0.8h, v0.8h, v2.8h
        uzp1            v1.8h, v1.8h, v3.8h
        sub             z0.h, z0.h, #PREP_BIAS
        sub             z1.h, z1.h, #PREP_BIAS
        stp             q0, q1, [\dst], #32
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun2        v0.8h, v2.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        sqshrun2        v1.8h, v3.4s, #6
        umin            v0.8h, v0.8h, v5.8h
        umin            v1.8h, v1.8h, v5.8h
        st1             {v0.16b}, [\dst], \d_strd
        st1             {v1.16b}, [\dst], \d_strd
.endif
        b.gt            8b
        ret

        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
16:
        ldp             q17, q21, [\src]
        ldur            q19, [\src, #8]

.ifc \type, prep
        movi            v0.2d, #0
        movi            v2.2d, #0
.else
        mov             v0.16b, v6.16b
        mov             v2.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z2.d, z17.h, z4.h[0]

        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi            v16.2d, #0
        movi            v17.2d, #0
.else
        mov             v16.16b, v6.16b
        mov             v17.16b, v6.16b
.endif
        ldur            q25, [\src, #24]
        ldr             q27, [\src, #32]

        sdot            z16.d, z18.h, z4.h[0]
        sdot            z17.d, z19.h, z4.h[0]

        tbl             v22.16b, {v21.16b}, v30.16b
        tbl             v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
        movi            v1.2d, #0
        movi            v3.2d, #0
.else
        mov             v1.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        add             \src, \src, \s_strd
        sdot            z1.d, z22.h, z4.h[0]
        sdot            z3.d, z23.h, z4.h[0]

        tbl             v24.16b, {v25.16b}, v30.16b
        tbl             v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi            v22.2d, #0
        movi            v23.2d, #0
.else
        mov             v22.16b, v6.16b
        mov             v23.16b, v6.16b
.endif
        sdot            z22.d, z24.h, z4.h[0]
        sdot            z23.d, z25.h, z4.h[0]

        tbl             v20.16b, {v21.16b}, v30.16b
        tbl             v21.16b, {v21.16b}, v31.16b
        sdot            z0.d, z18.h, z4.h[1]
        sdot            z2.d, z19.h, z4.h[1]
        tbl             v26.16b, {v27.16b}, v30.16b
        tbl             v27.16b, {v27.16b}, v31.16b
        sdot            z16.d, z20.h, z4.h[1]
        sdot            z17.d, z21.h, z4.h[1]

        sdot            z1.d, z24.h, z4.h[1]
        sdot            z3.d, z25.h, z4.h[1]

        sdot            z22.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]

        subs            \h, \h, #1
        uzp1            v0.4s, v0.4s, v2.4s
        uzp1            v2.4s, v16.4s, v17.4s
        uzp1            v1.4s, v1.4s, v3.4s
        uzp1            v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v2.4s, v2.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        srshl           v3.4s, v3.4s, v5.4s
        uzp1            v0.8h, v0.8h, v2.8h
        uzp1            v1.8h, v1.8h, v3.8h
        sub             z0.h, z0.h, #PREP_BIAS
        sub             z1.h, z1.h, #PREP_BIAS
        stp             q0, q1, [\dst], #32
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun2        v0.8h, v2.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        sqshrun2        v1.8h, v3.4s, #6
        umin            v0.8h, v0.8h, v5.8h
        umin            v1.8h, v1.8h, v5.8h
        st1             {v0.16b, v1.16b}, [\dst], \d_strd
.endif
        b.gt            16b
        ret

        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        sub             \d_strd, \d_strd, \w, uxtw #1
.endif
        sub             \s_strd, \s_strd, \w, uxtw #1
        mov             w8, \w

        .align LOOP_ALIGN
32:
        ldp             q17, q21, [\src]
        ldur            q19, [\src, #8]

.ifc \type, prep
        movi            v0.2d, #0
        movi            v2.2d, #0
.else
        mov             v0.16b, v6.16b
        mov             v2.16b, v6.16b
.endif
        tbl             v16.16b, {v17.16b}, v30.16b
        tbl             v17.16b, {v17.16b}, v31.16b
        sdot            z0.d, z16.h, z4.h[0]
        sdot            z2.d, z17.h, z4.h[0]

        tbl             v18.16b, {v19.16b}, v30.16b
        tbl             v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi            v16.2d, #0
        movi            v17.2d, #0
.else
        mov             v16.16b, v6.16b
        mov             v17.16b, v6.16b
.endif
        ldur            q25, [\src, #24]

        sdot            z16.d, z18.h, z4.h[0]
        sdot            z17.d, z19.h, z4.h[0]

        ldr             q27, [\src, #32]!

        tbl             v22.16b, {v21.16b}, v30.16b
        tbl             v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
        movi            v1.2d, #0
        movi            v3.2d, #0
.else
        mov             v1.16b, v6.16b
        mov             v3.16b, v6.16b
.endif
        sdot            z1.d, z22.h, z4.h[0]
        sdot            z3.d, z23.h, z4.h[0]

        tbl             v24.16b, {v25.16b}, v30.16b
        tbl             v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi            v22.2d, #0
        movi            v23.2d, #0
.else
        mov             v22.16b, v6.16b
        mov             v23.16b, v6.16b
.endif
        sdot            z22.d, z24.h, z4.h[0]
        sdot            z23.d, z25.h, z4.h[0]

        tbl             v20.16b, {v21.16b}, v30.16b
        tbl             v21.16b, {v21.16b}, v31.16b
        sdot            z0.d, z18.h, z4.h[1]
        sdot            z2.d, z19.h, z4.h[1]
        tbl             v26.16b, {v27.16b}, v30.16b
        tbl             v27.16b, {v27.16b}, v31.16b
        sdot            z16.d, z20.h, z4.h[1]
        sdot            z17.d, z21.h, z4.h[1]

        sdot            z1.d, z24.h, z4.h[1]
        sdot            z3.d, z25.h, z4.h[1]

        sdot            z22.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]

        subs            w8, w8, #16
        uzp1            v0.4s, v0.4s, v2.4s
        uzp1            v2.4s, v16.4s, v17.4s
        uzp1            v1.4s, v1.4s, v3.4s
        uzp1            v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v2.4s, v2.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        srshl           v3.4s, v3.4s, v5.4s
        uzp1            v0.8h, v0.8h, v2.8h
        uzp1            v1.8h, v1.8h, v3.8h
        sub             z0.h, z0.h, #PREP_BIAS
        sub             z1.h, z1.h, #PREP_BIAS
.else   // put
        sqshrun         v0.4h, v0.4s, #6
        sqshrun2        v0.8h, v2.4s, #6
        sqshrun         v1.4h, v1.4s, #6
        sqshrun2        v1.8h, v3.4s, #6
        umin            v0.8h, v0.8h, v5.8h
        umin            v1.8h, v1.8h, v5.8h
.endif
        stp             q0, q1, [\dst], #32
        b.gt            32b

        add             \src, \src, \s_strd
.ifc \type, put
        add             \dst, \dst, \d_strd
.endif
        subs            \h, \h, #1
        mov             w8, \w
        b.gt            32b
        ret
endfunc

jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word 640b  - \type\()_8tap_h_\isa\()_tbl
        .word 320b  - \type\()_8tap_h_\isa\()_tbl
        .word 160b  - \type\()_8tap_h_\isa\()_tbl
        .word 80b   - \type\()_8tap_h_\isa\()_tbl
        .word 40b   - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
        .word 20b   - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm


function prep_sve
        movrel          x9, prep_tbl
        mov             w6, #19
        ldrsw           x8, [x9, x8, lsl #2]
        sub             w6, w6, w7, lsr #8          // 19 - bdmax / 256
        add             x9, x9, x8
        movi            v30.8h, #PREP_BIAS_NEG
        dup             v29.8h, w6                  // 10b: 1 << 4, 12b: 1 << 2
        br              x9

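        // C model of this plain-copy prep path (a sketch): with
        // intermediate_bits being 4 for 10-bit and 2 for 12-bit input,
        // each `mad` below computes, per halfword,
        //     tmp[x] = (src[x] << intermediate_bits) - 8192;
        // evaluated as src[x] * (1 << intermediate_bits) + PREP_BIAS_NEG.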
        .align JUMP_ALIGN
40:     // prep - 4xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
4:
        ldr             d0, [x1]
        ldr             d1, [x1, x2]
        add             x1, x1, x2, lsl #1
        subs            w4, w4, #2
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        stp             d0, d1, [x0], #16
        b.gt            4b
        ret

        .align JUMP_ALIGN
80:     // prep - 8xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
8:
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x1], x2
        subs            w4, w4, #2
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        stp             q0, q1, [x0], #32
        b.gt            8b
        ret

        .align JUMP_ALIGN
160:    // prep - 16xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
16:
        ld1             {v0.8h, v1.8h}, [x1], x2
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        subs            w4, w4, #2
        ld1             {v2.8h, v3.8h}, [x1], x2
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        add             x0, x0, #64
        b.gt            16b
        ret

        .align JUMP_ALIGN
320:    // prep - 32xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
32:
        ldp             q0, q1, [x1]
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        ldp             q2, q3, [x1, #32]
        subs            w4, w4, #1
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        add             x1, x1, x2
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        add             x0, x0, #64
        b.gt            32b
        ret

        .align JUMP_ALIGN
640:    // prep - 64xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
64:
        ldp             q0, q1, [x1]
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        ldp             q2, q3, [x1, #32]
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        ldp             q4, q5, [x1, #64]
        mad             z4.h, p0/m, z29.h, z30.h
        mad             z5.h, p0/m, z29.h, z30.h
        ldp             q6, q7, [x1, #96]
        add             x1, x1, x2
        subs            w4, w4, #1
        mad             z6.h, p0/m, z29.h, z30.h
        mad             z7.h, p0/m, z29.h, z30.h
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, #128
        b.gt            64b
        ret

        .align JUMP_ALIGN
1280:   // prep - 128xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
128:
        ldp             q0, q1, [x1]
        mad             z0.h, p0/m, z29.h, z30.h
        mad             z1.h, p0/m, z29.h, z30.h
        ldp             q2, q3, [x1, #32]
        mad             z2.h, p0/m, z29.h, z30.h
        mad             z3.h, p0/m, z29.h, z30.h
        ldp             q4, q5, [x1, #64]
        mad             z4.h, p0/m, z29.h, z30.h
        mad             z5.h, p0/m, z29.h, z30.h
        ldp             q6, q7, [x1, #96]
        mad             z6.h, p0/m, z29.h, z30.h
        mad             z7.h, p0/m, z29.h, z30.h
        ldp             q16, q17, [x1, #128]
        mad             z16.h, p0/m, z29.h, z30.h
        mad             z17.h, p0/m, z29.h, z30.h
        ldp             q18, q19, [x1, #160]
        mad             z18.h, p0/m, z29.h, z30.h
        mad             z19.h, p0/m, z29.h, z30.h
        ldp             q20, q21, [x1, #192]
        mad             z20.h, p0/m, z29.h, z30.h
        mad             z21.h, p0/m, z29.h, z30.h
        ldp             q22, q23, [x1, #224]
        add             x1, x1, x2
        mad             z22.h, p0/m, z29.h, z30.h
        mad             z23.h, p0/m, z29.h, z30.h
        subs            w4, w4, #1
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        stp             q16, q17, [x0, #128]
        stp             q18, q19, [x0, #160]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0, x0, #256
        b.gt            128b
        ret
endfunc

jumptable prep_tbl
        .word 1280b - prep_tbl
        .word 640b  - prep_tbl
        .word 320b  - prep_tbl
        .word 160b  - prep_tbl
        .word 80b   - prep_tbl
        .word 40b   - prep_tbl
endjumptable


// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w9), ws_strd(w2)
filter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7), bdmax(w8)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3)
filter_8tap_fn  put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3

DISABLE_SVE2
DISABLE_SVE
#endif  // HAVE_SVE2