; Copyright © 2022-2024, VideoLAN and dav1d authors
; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

ipred_shuf:    db 14, 15, 14, 15,  0,  1,  2,  3,  6,  7,  6,  7,  0,  1,  2,  3
               db 10, 11, 10, 11,  8,  9, 10, 11,  2,  3,  2,  3,  8,  9, 10, 11
               db 12, 13, 12, 13,  4,  5,  6,  7,  4,  5,  4,  5,  4,  5,  6,  7
               db  8,  9,  8,  9, 12, 13, 14, 15,  0,  1,  0,  1, 12, 13, 14, 15
smooth_perm:   db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51
               db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
               db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
               db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
pw_31to0:      dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
               dw 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
pw_1to32:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
               dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
z_upsample:    dw  0, -1,  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6
               dw  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
z_xpos_mul:    dw  1,  1,  1,  1,  2,  2,  1,  1,  3,  3,  2,  2,  4,  4,  2,  2
               dw  5,  5,  3,  3,  6,  6,  3,  3,  7,  7,  4,  4,  8,  8,  4,  4
z_ypos_mul:    dw  0,  0,  0,  0,  1,  1,  0,  0,  2,  2,  1,  1,  3,  3,  1,  1
               dw  4,  4,  2,  2,  5,  5,  2,  2,  6,  6,  3,  3,  7,  7,  3,  3
z_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_xpos_off1a:  dw  30720,  30784,  30848,  30912,  30976,  31040,  31104,  31168
z_xpos_off1b:  dw  30720,  30848,  30976,  31104,  31232,  31360,  31488,  31616
filter_permA:  times 4 db  6,  7,  8,  9, 14, 15,  4,  5
               times 4 db 10, 11, 12, 13,  2,  3, -1, -1
filter_permB:  times 4 db 22, 23, 24, 25, 30, 31,  6,  7
               times 4 db 26, 27, 28, 29, 14, 15, -1, -1
filter_permC:          dd  8 ; dq  8, 10,  1, 11,  0,  9
pw_1:          times 2 dw  1
                       dd 10
filter_rnd:            dd 32
                       dd  1
                       dd  8
                       dd 11
filter_shift:  times 2 dw  6
                       dd  0
               times 2 dw  4
                       dd  9
pd_65536:              dd 65536
pal_unpack:    db  0,  8,  4, 12, 32, 40, 36, 44
               db 16, 24, 20, 28, 48, 56, 52, 60
z_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 79, 79, 79
z_filter_k:    dw  8,  8,  6,  6,  4,  4
               dw  4,  4,  5,  5,  4,  4
               dw  0,  0,  0,  0,  2,  2
pb_90:         times 4 db 90
pw_15:         times 2 dw 15
pw_16:         times 2 dw 16
pw_17:         times 2 dw 17
pw_24:         times 2 dw 24
pw_31:         times 2 dw 31
pw_32:         times 2 dw 32
pw_63:         times 2 dw 63
pw_64:         times 2 dw 64
pw_512:        times 2 dw 512
pw_2048:       times 2 dw 2048
pw_31806:      times 2 dw 31806
pw_32640:      times 2 dw 32640
pw_32672:      times 2 dw 32672
pw_32704:      times 2 dw 32704
pw_32735:      times 2 dw 32735
pw_32736:      times 2 dw 32736

%define pw_2 (z_xpos_mul+4* 2)
%define pw_3 (z_xpos_mul+4* 4)
%define pw_7 (z_xpos_mul+4*12)
%define pw_0to31 (pw_1to32-2)

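; Per-width jump tables. Each entry is a 32-bit offset of a .w4-.w64
; branch target relative to the table base biased by -2*4, so that
; tzcnt(w) (which is 2 for the minimum width of 4) indexes the table
; directly. The dispatch sequences below then do, roughly:
;     movsxd  wq, [tableq+wq*4] ; wq = tzcnt(w)
;     add     wq, tableq
;     jmp     wq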
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

JMP_TABLE ipred_paeth_16bpc,      avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,     avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc,         avx512icl, w4, w8, w16, w32, w64

cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

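; Paeth prediction: each pixel takes whichever of top, left and topleft
; is closest to the gradient estimate grad = left + top - topleft.
; Since |grad - top| = |left - topleft| and |grad - left| = |top - topleft|,
; three absolute differences suffice; a rough C sketch:
;   dt = abs(left + top - 2*topleft); // |grad - topleft|
;   dl = abs(top  - topleft);         // |grad - left|
;   da = abs(left - topleft);         // |grad - top|
;   pred = dl <= MIN(da, dt) ? left : da <= dt ? top : topleft;
; The macro computes this branchlessly: k1 and k2 are AVX-512 mask
; registers selecting between the three candidates with vpblendmw.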
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw               m0, m%2, m2
    psubw               m1, m0, m3  ; tldiff
    psubw               m0, m%1     ; tdiff
    pabsw               m1, m1
    pabsw               m0, m0
    pcmpgtw             k1, m0, m1
    pminsw              m0, m1
    pcmpgtw             k2, m%3, m0
    vpblendmw       m0{k1}, m%1, m3
    vpblendmw       m0{k2}, m2, m0
%endmacro

INIT_ZMM avx512icl
cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
%define base r6-ipred_paeth_16bpc_avx512icl_table
    lea                 r6, [ipred_paeth_16bpc_avx512icl_table]
    tzcnt               wd, wm
    movifnidn           hd, hm
    movsxd              wq, [r6+wq*4]
    vpbroadcastw        m3, [tlq]   ; topleft
    add                 wq, r6
    jmp                 wq
.w4:
    vpbroadcastq        m4, [tlq+2] ; top
    movsldup            m7, [base+ipred_shuf]
    lea                 r6, [strideq*3]
    psubw               m5, m4, m3
    pabsw               m6, m5
.w4_loop:
    sub                tlq, 16
    vbroadcasti32x4     m2, [tlq]
    pshufb              m2, m7      ; left
    PAETH                4, 5, 6
    vextracti32x4      xm1, m0, 2
    vextracti32x4      xm8, ym0, 1
    vextracti32x4      xm9, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm8
    movq   [dstq+r6       ], xm9
    sub                 hd, 8
    jl .w4_end
    lea               dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm8
    movhps [dstq+r6       ], xm9
    lea               dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.w8:
    vbroadcasti32x4     m4, [tlq+2]
    movsldup            m7, [base+ipred_shuf]
    lea                 r6, [strideq*3]
    psubw               m5, m4, m3
    pabsw               m6, m5
.w8_loop:
    sub                tlq, 8
    vpbroadcastq        m2, [tlq]
    pshufb              m2, m7
    PAETH                4, 5, 6
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+r6       ], m0, 3
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x8     m4, [tlq+2]
    movsldup            m7, [base+ipred_shuf]
    psubw               m5, m4, m3
    pabsw               m6, m5
.w16_loop:
    sub                tlq, 4
    vpbroadcastd        m2, [tlq]
    pshufb              m2, m7
    PAETH                4, 5, 6
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w16_loop
    RET
.w32:
    movu                m4, [tlq+2]
    psubw               m5, m4, m3
    pabsw               m6, m5
.w32_loop:
    sub                tlq, 2
    vpbroadcastw        m2, [tlq]
    PAETH                4, 5, 6
    mova            [dstq], m0
    add               dstq, strideq
    dec                 hd
    jg .w32_loop
    RET
.w64:
    movu                m4, [tlq+ 2]
    movu                m7, [tlq+66]
    psubw               m5, m4, m3
    psubw               m8, m7, m3
    pabsw               m6, m5
    pabsw               m9, m8
.w64_loop:
    sub                tlq, 2
    vpbroadcastw        m2, [tlq]
    PAETH                4, 5, 6
    mova       [dstq+64*0], m0
    PAETH                7, 8, 9
    mova       [dstq+64*1], m0
    add               dstq, strideq
    dec                 hd
    jg .w64_loop
    RET

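; SMOOTH_V prediction: blend the top row toward the bottom-left sample
; with a weight that depends only on the row; roughly, per the spec:
;   pred[y][x] = (w[y]*top[x] + (256 - w[y])*bottom + 128) >> 8
; computed here as bottom + (top - bottom)*w via pmulhrsw, the shared
; smooth_weights_1d table being prescaled so that pmulhrsw's
; (x*y + (1<<14)) >> 15 supplies the rounding shift for free.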
cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
%define base r6-$$
    lea                  r6, [$$]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg                  hq
    vpbroadcastw         m6, [tlq+hq*2] ; bottom
    lea                  wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vpbroadcastq         m5, [tlq+2]    ; top
    movsldup             m4, [ipred_shuf]
    psubw                m5, m6         ; top - bottom
.w4_loop:
    vbroadcasti32x4      m3, [weightsq+hq*2]
    pshufb               m3, m4
    pmulhrsw             m3, m5
    paddw                m3, m6
    vextracti32x4       xm0, m3, 3
    vextracti32x4       xm1, ym3, 1
    vextracti32x4       xm2, m3, 2
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    add                  hq, 8
    jg .end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    jl .w4_loop
.end:
    RET
.w8:
    vbroadcasti32x4      m5, [tlq+2]    ; top
    movsldup             m4, [ipred_shuf]
    psubw                m5, m6         ; top - bottom
.w8_loop:
    vpbroadcastq         m0, [weightsq+hq*2]
    pshufb               m0, m4
    pmulhrsw             m0, m5
    paddw                m0, m6
    vextracti32x4 [dstq+strideq*0], m0, 3
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    mova          [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x8      m5, [tlq+2]    ; top
    movsldup             m4, [ipred_shuf]
    psubw                m5, m6         ; top - bottom
.w16_loop:
    vpbroadcastd         m0, [weightsq+hq*2+0]
    vpbroadcastd         m1, [weightsq+hq*2+4]
    pshufb               m0, m4
    pshufb               m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    paddw                m0, m6
    paddw                m1, m6
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova          [dstq+strideq*1], ym0
    vextracti32x8 [dstq+strideq*2], m1, 1
    mova          [dstq+stride3q ], ym1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w16_loop
    RET
.w32:
    movu                 m5, [tlq+2]
    psubw                m5, m6
.w32_loop:
    vpbroadcastw         m0, [weightsq+hq*2+0]
    vpbroadcastw         m1, [weightsq+hq*2+2]
    vpbroadcastw         m2, [weightsq+hq*2+4]
    vpbroadcastw         m3, [weightsq+hq*2+6]
    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX   {paddw    x, m6}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w32_loop
    RET
.w64:
    movu                 m4, [tlq+ 2]
    movu                 m5, [tlq+66]
    psubw                m4, m6
    psubw                m5, m6
.w64_loop:
    vpbroadcastw         m1, [weightsq+hq*2+0]
    vpbroadcastw         m3, [weightsq+hq*2+2]
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m5
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m5
    REPX      {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    mova [dstq+strideq*1+64*0], m2
    mova [dstq+strideq*1+64*1], m3
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w64_loop
    RET

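; SMOOTH_H prediction: the horizontal counterpart, blending the left
; column toward the top-right sample with per-column weights:
;   pred[y][x] = (w[x]*left[y] + (256 - w[x])*right + 128) >> 8
; h is doubled up front so that hq counts bytes and can index the left
; edge directly as it runs down to zero.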
cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
    lea                  r6, [$$]
    mov                  wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m6, [tlq+wq*2] ; right
    tzcnt                wd, wd
    add                  hd, hd
    movsxd               wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
    sub                 tlq, hq
    lea            stride3q, [strideq*3]
    lea                  wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
    jmp                  wq
.w4:
    movsldup             m4, [base+ipred_shuf]
    vpbroadcastq         m5, [base+smooth_weights_1d_16bpc+4*2]
.w4_loop:
    vbroadcasti32x4      m0, [tlq+hq-16] ; left
    pshufb               m0, m4
    psubw                m0, m6          ; left - right
    pmulhrsw             m0, m5
    paddw                m0, m6
    vextracti32x4       xm1, m0, 2
    vextracti32x4       xm2, ym0, 1
    vextracti32x4       xm3, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    sub                  hd, 8*2
    jl .end
    lea                dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop
.end:
    RET
.w8:
    movsldup             m4, [base+ipred_shuf]
    vbroadcasti32x4      m5, [base+smooth_weights_1d_16bpc+8*2]
.w8_loop:
    vpbroadcastq         m0, [tlq+hq-8] ; left
    pshufb               m0, m4
    psubw                m0, m6         ; left - right
    pmulhrsw             m0, m5
    paddw                m0, m6
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4*2
    jg .w8_loop
    RET
.w16:
    movsldup             m4, [base+ipred_shuf]
    vbroadcasti32x8      m5, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    vpbroadcastd         m1, [tlq+hq-8]
    pshufb               m0, m4
    pshufb               m1, m4
    psubw                m0, m6
    psubw                m1, m6
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    paddw                m0, m6
    paddw                m1, m6
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w16_loop
    RET
.w32:
    movu                 m5, [base+smooth_weights_1d_16bpc+32*2]
.w32_loop:
    vpbroadcastq         m3, [tlq+hq-8]
    punpcklwd            m3, m3
    psubw                m3, m6
    pshufd               m0, m3, q3333
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX   {paddw    x, m6}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w32_loop
    RET
.w64:
    movu                 m4, [base+smooth_weights_1d_16bpc+64*2]
    movu                 m5, [base+smooth_weights_1d_16bpc+64*3]
.w64_loop:
    vpbroadcastw         m1, [tlq+hq-2]
    vpbroadcastw         m3, [tlq+hq-4]
    psubw                m1, m6
    psubw                m3, m6
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m5
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m5
    REPX      {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    mova [dstq+strideq*1+64*0], m2
    mova [dstq+strideq*1+64*1], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hq, 2*2
    jg .w64_loop
    RET

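; SMOOTH prediction: the sum of the vertical and horizontal blends,
;   pred = (w_v*top  + (256 - w_v)*bottom
;        +  w_h*left + (256 - w_h)*right + 256) >> 9
; (top,bottom) and (left,right) word pairs are formed with the k1/k2
; lane masks and dot-multiplied against (w, 256-w) weight pairs using
; pmaddwd/vpdpwssd; smooth_perm then picks bytes 1-2 of every dword
; (a free >> 8) and pavgw against zero adds the final (x + 1) >> 1.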
cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
    lea                 r6, [$$]
    mov                 wd, wm
    movifnidn           hd, hm
    vpbroadcastw       m13, [tlq+wq*2]   ; right
    tzcnt               wd, wd
    add                 hd, hd
    movsxd              wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
    mov                r5d, 0x55555555
    sub                tlq, hq
    mova               m14, [base+smooth_perm]
    kmovd               k1, r5d
    vpbroadcastw        m0, [tlq]        ; bottom
    mov                 r5, 0x3333333333333333
    pxor               m15, m15
    lea                 wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
    kmovq               k2, r5
    lea         v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
    jmp                 wq
.w4:
    vpbroadcastq        m5, [tlq+hq+2]
    movshdup            m3, [base+ipred_shuf]
    movsldup            m4, [base+ipred_shuf]
    vbroadcasti32x4     m6, [base+smooth_weights_2d_16bpc+4*4]
    lea           stride3q, [strideq*3]
    punpcklwd           m5, m0           ; top, bottom
.w4_loop:
    vbroadcasti32x4     m0, [v_weightsq]
    vpbroadcastq        m2, [tlq+hq-8]
    mova                m1, m13
    pshufb              m0, m3
    pmaddwd             m0, m5
    pshufb          m1{k2}, m2, m4       ; left, right
    vpdpwssd            m0, m1, m6
    vpermb              m0, m14, m0
    pavgw              ym0, ym15
    vextracti32x4      xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea               dstq, [dstq+strideq*4]
    add         v_weightsq, 4*4
    sub                 hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4    ym5, [tlq+hq+2]
    movshdup            m6, [base+ipred_shuf]
    movsldup            m7, [base+ipred_shuf]
    pmovzxwd            m5, ym5
    vbroadcasti32x8     m8, [base+smooth_weights_2d_16bpc+8*4]
    lea           stride3q, [strideq*3]
    vpblendmw       m5{k1}, m0, m5       ; top, bottom
.w8_loop:
    vpbroadcastq        m0, [v_weightsq+0]
    vpbroadcastq        m1, [v_weightsq+8]
    vpbroadcastd        m3, [tlq+hq-4]
    vpbroadcastd        m4, [tlq+hq-8]
    pshufb              m0, m6
    pmaddwd             m0, m5
    pshufb              m1, m6
    pmaddwd             m1, m5
    mova                m2, m13
    pshufb          m2{k2}, m3, m7       ; left, right
    mova                m3, m13
    pshufb          m3{k2}, m4, m7
    vpdpwssd            m0, m2, m8
    vpdpwssd            m1, m3, m8
    add         v_weightsq, 4*4
    vpermt2b            m0, m14, m1
    pavgw               m0, m15
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4*2
    jg .w8_loop
    RET
.w16:
    pmovzxwd            m5, [tlq+hq+2]
    mova                m6, [base+smooth_weights_2d_16bpc+16*4]
    vpblendmw       m5{k1}, m0, m5       ; top, bottom
.w16_loop:
    vpbroadcastd        m0, [v_weightsq+0]
    vpbroadcastd        m1, [v_weightsq+4]
    pmaddwd             m0, m5
    pmaddwd             m1, m5
    mova                m2, m13
    vpbroadcastw    m2{k1}, [tlq+hq-2] ; left, right
    mova                m3, m13
    vpbroadcastw    m3{k1}, [tlq+hq-4]
    vpdpwssd            m0, m2, m6
    vpdpwssd            m1, m3, m6
    add         v_weightsq, 2*4
    vpermt2b            m0, m14, m1
    pavgw               m0, m15
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hq, 2*2
    jg .w16_loop
    RET
.w32:
    pmovzxwd            m5, [tlq+hq+ 2]
    pmovzxwd            m6, [tlq+hq+34]
    mova                m7, [base+smooth_weights_2d_16bpc+32*4]
    mova                m8, [base+smooth_weights_2d_16bpc+32*6]
    vpblendmw       m5{k1}, m0, m5       ; top, bottom
    vpblendmw       m6{k1}, m0, m6
.w32_loop:
    vpbroadcastd        m2, [v_weightsq+0]
    vpbroadcastd        m3, [v_weightsq+4]
    pmaddwd             m0, m5, m2
    pmaddwd             m2, m6
    pmaddwd             m1, m5, m3
    pmaddwd             m3, m6
    mova                m4, m13
    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
    vpdpwssd            m0, m4, m7
    vpdpwssd            m2, m4, m8
    mova                m4, m13
    vpbroadcastw    m4{k1}, [tlq+hq-4]
    vpdpwssd            m1, m4, m7
    vpdpwssd            m3, m4, m8
    add         v_weightsq, 2*4
    vpermt2b            m0, m14, m2
    vpermt2b            m1, m14, m3
    pavgw               m0, m15
    pavgw               m1, m15
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m1
    lea               dstq, [dstq+strideq*2]
    sub                 hq, 2*2
    jg .w32_loop
    RET
.w64:
    pmovzxwd            m5, [tlq+hq+ 2]
    pmovzxwd            m6, [tlq+hq+34]
    pmovzxwd            m7, [tlq+hq+66]
    pmovzxwd            m8, [tlq+hq+98]
    mova                m9, [base+smooth_weights_2d_16bpc+64*4]
    vpblendmw       m5{k1}, m0, m5       ; top, bottom
    mova               m10, [base+smooth_weights_2d_16bpc+64*5]
    vpblendmw       m6{k1}, m0, m6
    mova               m11, [base+smooth_weights_2d_16bpc+64*6]
    vpblendmw       m7{k1}, m0, m7
    mova               m12, [base+smooth_weights_2d_16bpc+64*7]
    vpblendmw       m8{k1}, m0, m8
.w64_loop:
    vpbroadcastd        m3, [v_weightsq]
    mova                m4, m13
    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
    pmaddwd             m0, m5, m3
    pmaddwd             m2, m6, m3
    pmaddwd             m1, m7, m3
    pmaddwd             m3, m8
    vpdpwssd            m0, m4, m9
    vpdpwssd            m2, m4, m10
    vpdpwssd            m1, m4, m11
    vpdpwssd            m3, m4, m12
    add         v_weightsq, 1*4
    vpermt2b            m0, m14, m2
    vpermt2b            m1, m14, m3
    pavgw               m0, m15
    pavgw               m1, m15
    mova       [dstq+64*0], m0
    mova       [dstq+64*1], m1
    add               dstq, strideq
    sub                 hd, 1*2
    jg .w64_loop
    RET

%if WIN64
    DECLARE_REG_TMP 4
%else
    DECLARE_REG_TMP 8
%endif

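; Z1 directional prediction (references only the row above): positions
; along the top edge are tracked in 6.6 fixed point, advancing by dx
; per row, and each pixel linearly interpolates the two bracketing
; samples; roughly:
;   base = xpos >> 6; frac = xpos & 63;
;   pred = top[base] + (((top[base+1] - top[base]) * frac + 32) >> 6)
; The fraction is used preshifted by 9 so a single pmulhrsw implements
; the rounded multiply-shift. Narrow widths keep the whole edge in
; registers and gather with vpermw; wide blocks spill it to the stack.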
cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
    lea                  r7, [z_filter_t0]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    lea                  t0, [dr_intra_derivative]
    movsxd               wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4]
    add                 tlq, 2
    mov                 dxd, angled
    and                 dxd, 0x7e
    add              angled, 165 ; ~90
    movzx               dxd, word [t0+dxq]
    lea                  wq, [base+ipred_z1_16bpc_avx512icl_table+wq]
    movifnidn            hd, hm
    xor              angled, 0x4ff ; d = 90 - angle
    vpbroadcastd        m15, [base+pw_31806]
    jmp                  wq
.w4:
    vpbroadcastw         m5, [tlq+14]
    vinserti32x4         m5, [tlq], 0
    cmp              angleb, 40
    jae .w4_no_upsample
    lea                 r3d, [angleq-1024]
    sar                 r3d, 7
    add                 r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    call .upsample_top
    vpbroadcastq         m0, [base+z_xpos_off1b]
    jmp .w4_main2
.w4_no_upsample:
    test             angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea                 r3d, [hq+3]
    vpbroadcastb        xm0, r3d
    vpbroadcastb        xm1, angled
    shr              angled, 8 ; is_sm << 1
    vpcmpeqb             k1, xm0, [base+z_filter_wh]
    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw               r5d, k1
    test                r5d, r5d
    jz .w4_main
    call .w16_filter
    mov                 r2d, 9
    cmp                  hd, 4
    cmovne              r3d, r2d
    vpbroadcastw         m6, r3d
    pminuw               m6, [base+pw_0to31]
    vpermw               m5, m6, m5
.w4_main:
    vpbroadcastq         m0, [base+z_xpos_off1a]
.w4_main2:
    movsldup             m3, [base+z_xpos_mul]
    vpbroadcastw         m4, dxd
    lea                  r2, [strideq*3]
    pmullw               m3, m4
    vshufi32x4           m6, m5, m5, q3321
    psllw                m4, 3       ; dx*8
    paddsw               m3, m0      ; xpos
    palignr              m6, m5, 2   ; top+1
.w4_loop:
    psrlw                m1, m3, 6   ; base_x
    pand                 m2, m15, m3 ; frac
    vpermw               m0, m1, m5  ; top[base_x]
    vpermw               m1, m1, m6  ; top[base_x+1]
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    sub                  hd, 8
    jl .w4_end
    vextracti32x4       xm1, m0, 2
    paddsw               m3, m4      ; xpos += dx
    lea                dstq, [dstq+strideq*4]
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+r2       ], xm0
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
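; Intra edge upsampling (sharp angles on small blocks): double the top
; edge's resolution with the spec's 4-tap filter, effectively
;   out = clamp((9*(b + c) - (a + d) + 8) >> 4, 0, pixel_max)
; for neighbours a..d, and double dx to match the finer sample grid.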
.upsample_top:
    vinserti32x4         m5, [tlq-16], 3
    mova                 m3, [base+z_upsample]
    vpbroadcastd         m4, [base+pd_65536]
    add                 dxd, dxd
    vpermw               m0, m3, m5
    paddw                m3, m4
    vpermw               m1, m3, m5
    paddw                m3, m4
    vpermw               m2, m3, m5
    paddw                m3, m4
    vpermw               m3, m3, m5
    vpbroadcastw         m5, r9m     ; pixel_max
    paddw                m1, m2      ; b+c
    paddw                m0, m3      ; a+d
    psubw                m0, m1, m0
    psraw                m0, 3
    pxor                 m2, m2
    paddw                m0, m1
    pmaxsw               m0, m2
    pavgw                m0, m2
    pminsw               m5, m0
    ret
.w8:
    lea                 r3d, [angleq+216]
    movu                ym5, [tlq]
    mov                 r3b, hb
    movu                m10, [base+pw_0to31]
    cmp                 r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea                 r3d, [hq+7]
    vpbroadcastw         m6, r3d
    add                 r3d, r3d
    pminuw               m6, m10
    vpermw               m5, m6, m5
    call .upsample_top
    vbroadcasti32x4      m0, [base+z_xpos_off1b]
    jmp .w8_main2
.w8_no_upsample:
    lea                 r3d, [hq+7]
    vpbroadcastb        ym0, r3d
    and                 r3d, 7
    or                  r3d, 8 ; imin(h+7, 15)
    vpbroadcastw         m6, r3d
    pminuw               m6, m10
    vpermw               m5, m6, m5
    test             angled, 0x400
    jnz .w8_main
    vpbroadcastb        ym1, angled
    shr              angled, 8
    vpcmpeqb             k1, ym0, [base+z_filter_wh]
    mova                xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb         k1{k1}, ym1, ym0
    kmovd               r5d, k1
    test                r5d, r5d
    jz .w8_main
    call .w16_filter
    cmp                  hd, r3d
    jl .w8_filter_end
    pminud               m6, m10, [base+pw_17] {1to16}
    add                 r3d, 2
.w8_filter_end:
    vpermw               m5, m6, m5
.w8_main:
    vbroadcasti32x4      m0, [base+z_xpos_off1a]
.w8_main2:
    movshdup             m3, [base+z_xpos_mul]
    vpbroadcastw         m4, dxd
    shl                 r3d, 6
    lea                  r2, [strideq*3]
    pmullw               m3, m4
    vshufi32x4           m6, m5, m5, q3321
    sub                 r3d, dxd
    psllw                m4, 2       ; dx*4
    shl                 dxd, 2
    paddsw               m3, m0      ; xpos
    palignr              m6, m5, 2   ; top+1
.w8_loop:
    psrlw                m1, m3, 6   ; base_x
    pand                 m2, m15, m3 ; frac
    vpermw               m0, m1, m5  ; top[base_x]
    vpermw               m1, m1, m6  ; top[base_x+1]
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    sub                  hd, 4
    jz .w8_end
    paddsw               m3, m4      ; xpos += dx
    lea                dstq, [dstq+strideq*4]
    sub                 r3d, dxd
    jg .w8_loop
    vextracti32x4       xm5, m5, 3
.w8_end_loop:
    mova   [dstq+strideq*0], xm5
    mova   [dstq+strideq*1], xm5
    mova   [dstq+strideq*2], xm5
    mova   [dstq+r2       ], xm5
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_end_loop
.w8_end:
    RET
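; Intra edge filter: 5-tap low-pass over the edge samples. The kernel
; is selected from z_filter_k by filter strength (the popcnt of the
; angle/size test mask); each kernel's taps sum to 16, and psrlw 3
; followed by pavgw against zero yields the rounded (sum + 8) >> 4.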
.w16_filter:
    vpbroadcastw         m1, [tlq-2]
    popcnt              r5d, r5d
    valignq              m3, m6, m5, 2
    vpbroadcastd         m7, [base+z_filter_k+(r5-1)*4+12*0]
    valignq              m1, m5, m1, 6
    vpbroadcastd         m8, [base+z_filter_k+(r5-1)*4+12*1]
    palignr              m2, m3, m5, 2
    vpbroadcastd         m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr              m0, m5, m1, 14
    pmullw               m7, m5
    palignr              m3, m5, 4
    paddw                m0, m2
    palignr              m5, m1, 12
    pmullw               m0, m8
    paddw                m5, m3
    pmullw               m5, m9
    pxor                 m1, m1
    paddw                m0, m7
    paddw                m5, m0
    psrlw                m5, 3
    pavgw                m5, m1
    ret
.w16:
    lea                 r3d, [hq+15]
    vpbroadcastb        ym0, r3d
    and                 r3d, 15
    or                  r3d, 16 ; imin(h+15, 31)
    vpbroadcastw        m11, r3d
    pminuw              m10, m11, [base+pw_0to31]
    vpbroadcastw         m6, [tlq+r3*2]
    vpermw               m5, m10, [tlq]
    test             angled, 0x400
    jnz .w16_main
    vpbroadcastb        ym1, angled
    shr              angled, 8
    vpcmpeqb             k1, ym0, [base+z_filter_wh]
    mova                xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb         k1{k1}, ym1, ym0
    kmovd               r5d, k1
    test                r5d, r5d
    jz .w16_main
    call .w16_filter
    cmp                  hd, 16
    jg .w16_filter_h32
    vpermw               m6, m11, m5
    vpermw               m5, m10, m5
    jmp .w16_main
.w16_filter_h32:
    movzx               r3d, word [tlq+62]
    movzx               r2d, word [tlq+60]
    lea                 r2d, [r2+r3*8+4]
    sub                 r2d, r3d
    mov                 r3d, 1
    shr                 r2d, 3
    kmovb                k1, r3d
    movd                xm0, r2d
    or                  r3d, 32
    vmovdqu16        m6{k1}, m0
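; From w16 upward the edge spans more than the 32 words a single vpermw
; can index, so the (possibly filtered) samples are spilled to an
; aligned stack buffer and each row's sample pairs are fetched with
; unaligned loads at [rsp+base_x*2].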
.w16_main:
    rorx                r2d, dxd, 23
    mov                  r7, rsp
    and                 rsp, ~63
    vpbroadcastw         m3, r2d
    sub                 rsp, 64*2
    mov                 r2d, dxd
    paddw                m4, m3, m3
    mova         [rsp+64*0], m5
    vinserti32x8         m3, ym4, 1
    mova         [rsp+64*1], m6
    shl                 r3d, 6
.w16_loop:
    lea                 r5d, [r2+dxq]
    shr                 r2d, 6
    movu                ym0, [rsp+r2*2]
    movu                ym1, [rsp+r2*2+2]
    lea                 r2d, [r5+dxq]
    shr                 r5d, 6
    vinserti32x8         m0, [rsp+r5*2], 1
    vinserti32x8         m1, [rsp+r5*2+2], 1
    pand                 m2, m15, m3 ; frac << 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w16_end
    paddw                m3, m4
    lea                dstq, [dstq+strideq*2]
    cmp                 r2d, r3d
    jl .w16_loop
    punpckhqdq          ym6, ym6
.w16_end_loop:
    mova   [dstq+strideq*0], ym6
    mova   [dstq+strideq*1], ym6
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_end_loop
.w16_end:
    mov                 rsp, r7
    RET
.w32:
    lea                 r3d, [hq+31]
    movu                 m7, [tlq+64*0]
    and                 r3d, 31
    vpbroadcastw        m11, r3d
    or                  r3d, 32 ; imin(h+31, 63)
    pminuw              m10, m11, [base+pw_0to31]
    vpbroadcastw         m9, [tlq+r3*2]
    vpermw               m8, m10, [tlq+64*1]
    test             angled, 0x400
    jnz .w32_main
    vpbroadcastd         m5, [base+pw_3]
    mov                 r5d, ~1
    movu                 m3, [tlq-2]
    kmovd                k1, r5d
    valignq              m2, m8, m7, 6
    paddw                m7, m3
    vmovdqu16        m3{k1}, [tlq-4]
    valignq              m4, m9, m8, 2
    paddw                m3, m5
    paddw                m7, [tlq+2]
    palignr              m1, m8, m2, 14
    pavgw                m3, [tlq+4]
    palignr              m2, m8, m2, 12
    paddw                m7, m3
    palignr              m3, m4, m8, 2
    psrlw                m7, 2
    palignr              m4, m8, 4
    paddw                m8, m1
    paddw                m2, m5
    paddw                m8, m3
    pavgw                m2, m4
    paddw                m8, m2
    psrlw                m8, 2
    cmp                  hd, 64
    je .w32_filter_h64
    vpermw               m9, m11, m8
    vpermw               m8, m10, m8
    jmp .w32_main
.w32_filter_h64:
    movzx               r3d, word [tlq+126]
    movzx               r2d, word [tlq+124]
    lea                 r2d, [r2+r3*8+4]
    sub                 r2d, r3d
    mov                 r3d, 65
    shr                 r2d, 3
    movd                xm0, r2d
    vpblendmw        m9{k1}, m0, m9
.w32_main:
    rorx                r2d, dxd, 23
    mov                  r7, rsp
    and                 rsp, ~63
    vpbroadcastw         m5, r2d
    sub                 rsp, 64*4
    mov                 r2d, dxd
    mova         [rsp+64*0], m7
    shl                 r3d, 6
    mova         [rsp+64*1], m8
    mova                 m6, m5
    mova         [rsp+64*2], m9
    punpckhqdq           m9, m9
    mova         [rsp+64*3], ym9
.w32_loop:
    lea                 r5d, [r2+dxq]
    shr                 r2d, 6
    movu                 m0, [rsp+r2*2]
    movu                 m2, [rsp+r2*2+2]
    lea                 r2d, [r5+dxq]
    shr                 r5d, 6
    movu                 m1, [rsp+r5*2]
    movu                 m3, [rsp+r5*2+2]
    pand                 m4, m15, m5
    paddw                m5, m6
    psubw                m2, m0
    pmulhrsw             m2, m4
    pand                 m4, m15, m5
    psubw                m3, m1
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jz .w32_end
    paddw                m5, m6
    lea                dstq, [dstq+strideq*2]
    cmp                 r2d, r3d
    jl .w32_loop
.w32_end_loop:
    mova   [dstq+strideq*0], m9
    mova   [dstq+strideq*1], m9
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_end_loop
.w32_end:
    mov                 rsp, r7
    RET
.w64_filter96:
    vpbroadcastd         m4, [base+pw_3]
    mov                 r5d, ~1
    movu                 m0, [tlq-2]
    kmovd                k1, r5d
    paddw                m7, m0
    vmovdqu16        m0{k1}, [tlq-4]
    paddw                m0, m4
    paddw                m7, [tlq+2]
    pavgw                m0, [tlq+4]
    valignq              m1, m9, m8, 6
    paddw                m8, [tlq+62]
    paddw                m2, m4, [tlq+60]
    valignq              m3, m10, m9, 2
    paddw                m8, [tlq+66]
    pavgw                m2, [tlq+68]
    paddw                m7, m0
    palignr              m0, m9, m1, 14
    paddw                m8, m2
    palignr              m1, m9, m1, 12
    psrlw                m7, 2
    palignr              m2, m3, m9, 2
    psrlw                m8, 2
    palignr              m3, m9, 4
    paddw                m0, m9
    paddw                m1, m4
    paddw                m0, m2
    pavgw                m1, m3
    paddw                m0, m1
    ret
.w64:
    movu                 m7, [tlq+64*0]
    lea                 r3d, [hq-1]
    movu                 m8, [tlq+64*1]
    vpbroadcastw        m11, [tlq+r3*2+128]
    movu                 m9, [tlq+64*2]
    cmp                  hd, 64
    je .w64_h64
    vpbroadcastw        m13, r3d
    or                  r3d, 64
    pminuw              m12, m13, [base+pw_0to31]
    mova                m10, m11
    vpermw               m9, m12, m9
    test             angled, 0x400
    jnz .w64_main
    call .w64_filter96
    psrlw                m0, 2
    vpermw               m9, m12, m0
    vpermw              m10, m13, m0
    mova                m11, m10
    jmp .w64_main
.w64_h64:
    movu                m10, [tlq+64*3]
    or                  r3d, 64
    test             angled, 0x400
    jnz .w64_main
    call .w64_filter96
    valignq              m1, m10, m9, 6
    valignq              m3, m11, m10, 2
    vpbroadcastd        m11, [base+pw_63]
    psrlw                m9, m0, 2
    palignr              m0, m10, m1, 14
    palignr              m1, m10, m1, 12
    palignr              m2, m3, m10, 2
    palignr              m3, m10, 4
    paddw               m10, m0
    paddw                m1, m4
    paddw               m10, m2
    pavgw                m1, m3
    paddw               m10, m1
    psrlw               m10, 2
    vpermw              m11, m11, m10
.w64_main:
    rorx                r2d, dxd, 23
    mov                  r7, rsp
    and                 rsp, ~63
    vpbroadcastw         m5, r2d
    sub                 rsp, 64*6
    mova         [rsp+64*0], m7
    mov                 r2d, dxd
    mova         [rsp+64*1], m8
    lea                  r5, [rsp+r3*2]
    mova         [rsp+64*2], m9
    shl                 r3d, 6
    mova         [rsp+64*3], m10
    sub                  r2, r3
    mova         [rsp+64*4], m11
    mova                 m6, m5
    mova         [rsp+64*5], m11
.w64_loop:
    mov                  r3, r2
    sar                  r3, 6
    movu                 m0, [r5+r3*2+64*0]
    movu                 m2, [r5+r3*2+64*0+2]
    movu                 m1, [r5+r3*2+64*1]
    movu                 m3, [r5+r3*2+64*1+2]
    pand                 m4, m15, m5
    psubw                m2, m0
    pmulhrsw             m2, m4
    psubw                m3, m1
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    dec                  hd
    jz .w64_end
    paddw                m5, m6
    add                dstq, strideq
    add                  r2, dxq
    jl .w64_loop
.w64_end_loop:
    mova        [dstq+64*0], m11
    mova        [dstq+64*1], m11
    add                dstq, strideq
    dec                  hd
    jg .w64_end_loop
.w64_end:
    mov                 rsp, r7
    RET

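; Z2 directional prediction (references both edges): pixels whose
; projected x position falls left of the block origin must come from
; the left column instead of the top row. Both interpolations share
; registers: xpos is kept as signed words that go negative exactly when
; base_x < 0, so vpmovw2m converts the sign bits into a lane mask that
; substitutes left[base_y], left[base_y+1] and frac_y where needed.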
cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy
    tzcnt                wd, wm
    movifnidn        angled, anglem
    lea                 dxq, [dr_intra_derivative-90]
    movzx               dyd, angleb
    xor              angled, 0x400
    mov                  r7, dxq
    sub                 dxq, dyq
    movifnidn            hd, hm
    and                 dyd, ~1
    vpbroadcastw        m12, [tlq]
    and                 dxq, ~1
    movzx               dyd, word [r7+dyq]  ; angle - 90
    lea                  r7, [z_filter_t0]
    movzx               dxd, word [dxq+270] ; 180 - angle
    mova                 m0, [base+pw_31to0]
    movsxd               wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4]
    movu                 m4, [tlq+2]
    neg                 dyd
    vpermw               m7, m0, [tlq-64*1]
    lea                  wq, [base+ipred_z2_16bpc_avx512icl_table+wq]
    vpbroadcastd        m14, [base+pw_31806]
    vpbroadcastd        m15, [base+pw_1]
    jmp                  wq
.w4:
    movq                xm3, [tlq]
    vpbroadcastq         m8, [base+pw_1to32]
    test             angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea                 r3d, [hq+2]
    add              angled, 1022
    shl                 r3d, 6
    test                r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    pshuflw             xm0, xm4, q3321
    sub              angled, 1075 ; angle - 53
    lea                 r3d, [hq+3]
    call .upsample_above
    punpcklwd           xm4, xm3, xm4
    palignr             xm3, xm4, xm12, 14
    jmp .w4_main
.w4_upsample_left:
    call .upsample_left
    movsldup             m1, [base+z_xpos_mul]
    paddw                m1, m1
    jmp .w4_main2
.w4_no_upsample_above:
    lea                 r3d, [hq+3]
    vpbroadcastd        ym0, [base+pw_3]
    sub              angled, 1112 ; angle - 90
    call .filter_above2
    lea                 r3d, [hq+2]
    add              angled, 973 ; angle + 883
    palignr             xm3, xm4, xm12, 14
    shl                 r3d, 6
    test                r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    call .filter_left16
.w4_main:
    movsldup             m1, [base+z_xpos_mul]
    psllw               m15, 3
.w4_main2:
    vpbroadcastq         m0, [base+pw_1to32]
    vpbroadcastw        m11, dxd
    movsldup             m2, [base+z_xpos_mul]
    vpbroadcastw        m13, dyd
    vpbroadcastd         m5, [tlq-2]
    psllw               m10, m8, 6
    valignq              m5, m7, m5, 6
    pmullw               m2, m11
    psubw               m10, m2       ; xpos
    pmullw              m13, m0       ; ypos
    palignr              m5, m7, m5, 14
    psrlw               m12, m13, 6
    psllw               m13, 9
    paddw               m12, m1       ; base_y
    pand                m13, m14      ; frac_y << 9
    psllw               m11, 3
    lea                  r5, [strideq*3]
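; xpos words turn negative once base_x crosses the block origin, so
; their sign bits double as the top-vs-left lane selector below.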
.w4_loop:
    psrlw                m1, m10, 6   ; base_x
    pand                 m2, m14, m10 ; frac
    vpermw               m0, m1, m3   ; top[base_x]
    vpermw               m1, m1, m4   ; top[base_x+1]
    vpmovw2m             k1, m10      ; base_x < 0
    psllw                m2, 9
    vpermw           m0{k1}, m12, m5  ; left[base_y]
    vpermw           m1{k1}, m12, m7  ; left[base_y+1]
    vmovdqu16        m2{k1}, m13
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r5       ], xm1
    sub                  hd, 8
    jl .w4_end
    vextracti32x8       ym0, m0, 1
    psubw               m10, m11      ; base_x -= dx
    lea                dstq, [dstq+strideq*4]
    paddw               m12, m15      ; base_y++
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r5       ], xm1
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.upsample_above: ; w4/w8
    mova                ym9, [base+pw_1to32]
    palignr             xm1, xm4, xm12, 12
    paddw               xm3, xm4  ; b+c
    xor              angled, 0x7f ; 180 - angle
    paddw               xm0, xm1  ; a+d
    vpbroadcastw        xm1, r9m  ; pixel_max
    vpbroadcastb       xm11, r3d
    psubw               xm0, xm3, xm0
    vpbroadcastb        xm2, angled
    psraw               xm0, 3
    shr              angled, 8
    paddw               xm3, xm0
    pxor                xm0, xm0
    vpcmpeqb             k2, xm11, [base+z_filter_wh]
    pmaxsw              xm3, xm0
    add                 dxd, dxd
    pavgw               xm3, xm0
    vpcmpgtb         k2{k2}, xm2, [base+z_filter_t0+angleq*8]
    pminsw              xm3, xm1
    paddw                m8, m8
    jmp .filter_left16b
.upsample_left: ; h4/h8
    lea                 r3d, [hq-1]
    palignr             xm2, xm7, xm12, 14
    vpbroadcastw        xm0, r3d
    palignr             xm1, xm7, xm12, 12
    pminuw              xm0, xm9
    paddw               xm2, xm7 ; b+c
    vpermw              xm0, xm0, xm7
    add                 dyd, dyd
    paddw               xm0, xm1 ; a+d
    vpbroadcastw        xm1, r9m ; pixel_max
    psubw               xm0, xm2, xm0
    psraw               xm0, 3
    paddw               xm2, xm0
    pxor                xm0, xm0
    pmaxsw              xm2, xm0
    pavgw               xm2, xm0
    pminsw              xm2, xm1
    punpckhwd           xm0, xm2, xm7
    punpcklwd           xm7, xm2, xm7
    vinserti32x4        ym7, xm0, 1
    ret
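; Edge filtering for z2: the same z_filter_k kernels as z1, applied
; separately to the row above (.filter_above*) and the left column
; (.filter_left*). The max_w/max_h stack arguments bound which lanes
; get rewritten (via the vpcmpgtw merge masks) so samples beyond the
; visible edge are left untouched.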
1337.filter_above:
1338    sub              angled, 90
1339.filter_above2:
1340    vpbroadcastb        ym1, r3d
1341    vpbroadcastb       ym10, angled
1342    mov                 r3d, angled
1343    shr                 r3d, 8
1344    vpcmpeqb             k2, ym1, [base+z_filter_wh]
1345    mova               xm11, [base+z_filter_t0+r3*8]
1346    vpcmpgtb         k1{k2}, ym10, ym11
1347    mova                 m9, [base+pw_1to32]
1348    kmovd               r3d, k1
1349    test                r3d, r3d
1350    jz .filter_end
1351    pminuw              ym0, ym9
1352    popcnt              r3d, r3d
1353    vpbroadcastd        ym6, r7m      ; max_w
1354    kxnorw               k1, k1, k1
1355    vpbroadcastd        ym5, [base+z_filter_k+(r3-1)*4+12*0]
1356    kaddw                k1, k1, k1   ; ~1
1357    vpbroadcastd       ym13, [base+z_filter_k+(r3-1)*4+12*1]
1358    vpermw              ym2, ym0, ym4 ; +1
1359    pmullw              ym5, ym4
1360    paddw               ym1, ym2, ym3
1361    vmovdqu16        m3{k1}, [tlq-2]  ; -2
1362    vpermw              ym2, ym0, ym2 ; +2
1363    vpbroadcastd        ym0, [base+z_filter_k+(r3-1)*4+12*2]
1364    pmullw              ym1, ym13
1365    movu                m13, [base+pw_0to31]
1366    paddw               ym2, ym3
1367    packssdw            ym6, ym6
1368    pmullw              ym2, ym0
1369    paddw               ym1, ym5
1370    vpcmpgtw             k1, ym6, ym13
1371    paddw               ym1, ym2
1372    pxor                ym2, ym2
1373    psrlw               ym1, 3
1374    pavgw           ym4{k1}, ym1, ym2
1375.filter_end:
1376    ret
1377.filter_left16:
1378    vpbroadcastd        ym1, [base+pb_90]
1379    psubb               ym1, ym10
1380    vpcmpgtb         k2{k2}, ym1, ym11
1381.filter_left16b:
1382    kmovd               r3d, k2
1383    test                r3d, r3d
1384    jz .filter_end
1385    lea                 r5d, [hq-1]
1386    vinserti32x4        ym0, ym12, xm7, 1
1387    vpbroadcastw        ym1, r5d
1388    popcnt              r3d, r3d
1389    vpbroadcastd        ym6, r8m          ; max_h
1390    pminuw              ym9, ym1
1391    vpbroadcastd        ym5, [base+z_filter_k+(r3-1)*4+12*0]
1392    vpermw              ym2, ym9, ym7     ; +1
1393    vpbroadcastd       ym10, [base+z_filter_k+(r3-1)*4+12*1]
1394    palignr             ym1, ym7, ym0, 14 ; -1
1395    pmullw              ym5, ym7
1396    palignr             ym0, ym7, ym0, 12 ; -2
1397    paddw               ym1, ym2
1398    vpermw              ym2, ym9, ym2     ; +2
1399    vpbroadcastd        ym9, [base+z_filter_k+(r3-1)*4+12*2]
1400    pmullw              ym1, ym10
1401    paddw               ym2, ym0
1402    packssdw            ym6, ym6
1403    pmullw              ym2, ym9
1404    paddw               ym1, ym5
1405    vpcmpgtw             k1, ym6, [base+pw_0to31]
1406    paddw               ym1, ym2
1407    pxor                ym2, ym2
1408    psrlw               ym1, 3
1409    pavgw           ym7{k1}, ym1, ym2
1410    ret
1411.filter_left:
1412    cmp                  hd, 32
1413    jl .filter_left16
1414    vpbroadcastd         m5, [base+pw_3]
1415    pminud               m0, m9, [base+pw_31] {1to16}
1416.filter_left32:
1417    vpbroadcastd         m6, r8m         ; max_h
1418    valignq              m2, m7, m12, 6
1419    packssdw             m6, m6
1420    palignr              m1, m7, m2, 14  ; -1
1421    paddw                m1, m7
1422    palignr              m2, m7, m2, 12  ; -2
1423    vpcmpgtw             k1, m6, m13
1424    paddw                m2, m5
1425    cmp                  hd, 64
1426    je .filter_left64
1427    lea                 r3d, [hq-1]
1428    vpbroadcastw        m10, r3d
1429    pminuw               m0, m10
1430    vpermw              m10, m0, m7      ; +1
1431    paddw                m1, m10
1432    vpermw              m10, m0, m10     ; +2
1433    pavgw                m2, m10
1434    paddw                m1, m2
1435    vpsrlw           m7{k1}, m1, 2
1436    ret
1437.filter_left64:
1438    valignq             m10, m8, m7, 2
1439    vpaddd              m13, [base+pw_32] {1to16}
1440    palignr             m11, m10, m7, 2  ; +1
1441    paddw                m1, m11
1442    palignr             m11, m10, m7, 4  ; +2
1443    valignq             m10, m8, m7, 6
1444    pavgw               m11, m2
1445    vpermw               m2, m0, m8      ; 32+1
1446    paddw                m1, m11
1447    vpsrlw           m7{k1}, m1, 2
1448    palignr              m1, m8, m10, 14 ; 32-1
1449    paddw                m1, m8
1450    palignr             m10, m8, m10, 12 ; 32-2
1451    paddw                m1, m2
1452    vpermw               m2, m0, m2      ; 32+2
1453    paddw               m10, m5
1454    vpcmpgtw             k1, m6, m13
1455    pavgw                m2, m10
1456    paddw                m1, m2
1457    vpsrlw           m8{k1}, m1, 2
1458    ret
.w8:
    mova                xm3, [tlq]
    vbroadcasti32x4      m8, [base+pw_1to32]
    test             angled, 0x400
    jnz .w8_main
    lea                 r3d, [angleq+126]
    mov                 r3b, hb
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    psrldq              xm0, xm4, 2
    sub              angled, 53
    pshufhw             xm0, xm0, q2210
    lea                 r3d, [hq+7]
    call .upsample_above
    punpcklwd           xm0, xm3, xm4
    punpckhwd           xm4, xm3, xm4
    vinserti32x4        ym3, ym12, xm0, 1
    vinserti32x4        ym4, ym0, xm4, 1
    palignr             ym3, ym4, ym3, 14
    jmp .w8_main
.w8_upsample_left:
    call .upsample_left
    movshdup             m1, [base+z_xpos_mul]
    psllw               m15, 3
    paddw                m1, m1
    jmp .w8_main2
.w8_no_upsample_above:
    lea                 r3d, [hq+7]
    vpbroadcastd        ym0, [base+pw_7]
    call .filter_above
    lea                 r3d, [angleq-51]
    mov                 r3b, hb
    palignr             xm3, xm4, xm12, 14
    cmp                 r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    call .filter_left
.w8_main:
    movshdup             m1, [base+z_xpos_mul]
    psllw               m15, 2
.w8_main2:
    vbroadcasti32x4      m0, [base+pw_1to32]
    vpbroadcastw        m11, dxd
    movshdup             m2, [base+z_xpos_mul]
    vpbroadcastw        m13, dyd
    psllw               m10, m8, 6
    valignq              m5, m7, m12, 6
    pmullw               m2, m11
    psubw               m10, m2       ; xpos
    pmullw              m13, m0       ; ypos
    palignr              m5, m7, m5, 14
    psrlw               m12, m13, 6
    psllw               m13, 9
    mov                 r2d, 1<<6
    paddw               m12, m1       ; base_y
    lea                 r3d, [dxq-(8<<6)] ; left-only threshold
    pand                m13, m14      ; frac_y << 9
    shl                 dxd, 2
    psllw               m11, 2
    lea                  r5, [strideq*3]
.w8_loop:
    psrlw                m1, m10, 6
    pand                 m2, m14, m10
    vpermw               m0, m1, m3
    vpermw               m1, m1, m4
    psllw                m2, 9
    sub                 r2d, dxd
    jge .w8_toponly
    vpmovw2m             k1, m10
    vpermw           m0{k1}, m12, m5
    vpermw           m1{k1}, m12, m7
    vmovdqu16        m2{k1}, m13
.w8_toponly:
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r5       ], m0, 3
    sub                  hd, 4
    jz .w8_end
    psubw               m10, m11      ; base_x -= dx
    lea                dstq, [dstq+strideq*4]
    paddw               m12, m15      ; base_y++
    cmp                 r2d, r3d
    jge .w8_loop
.w8_leftonly_loop:
    vpermw               m0, m12, m5
    vpermw               m1, m12, m7
    psubw                m1, m0
    pmulhrsw             m1, m13
    paddw               m12, m15
    paddw                m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r5       ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_leftonly_loop
.w8_end:
    RET
.w16:
    mova                ym3, [tlq]
    vpermw               m8, m0, [tlq-64*2]
    test             angled, 0x400
    jnz .w16_main
    lea                 r3d, [hq+15]
    vpbroadcastd        ym0, [base+pw_15]
    call .filter_above
    call .filter_left
    vinserti32x4        ym3, ym12, xm4, 1
    palignr             ym3, ym4, ym3, 14
.w16_main:
    vbroadcasti32x8      m0, [base+pw_1to32]
    vpbroadcastw        m11, dxd
    vpbroadcastw        m13, dyd
    kxnorw               k2, k2, k2    ; k2 = 0xffff
    psllw               m10, m0, 6
    valignq              m5, m7, m12, 6
    psubw               m10, m11      ; xpos
    valignq              m6, m8, m7, 6
    pmullw              m13, m0       ; ypos
    knotd                k1, k2        ; k1 = 0xffff0000
    palignr              m5, m7, m5, 14
    palignr              m6, m8, m6, 14
    vpsubw          m10{k1}, m11
    psrlw               m12, m13, 6
    psllw               m13, 9
    mov                 r2d, 1<<6
    vpsubw          m12{k2}, m15      ; base_y
    pand                m13, m14      ; frac_y << 9
    lea                 r3d, [dxq-(16<<6)]
    paddw               m11, m11
    add                 dxd, dxd
    paddw               m15, m15
.w16_loop:
    psrlw                m1, m10, 6
    pand                 m2, m14, m10
    vpermw               m0, m1, m3
    vpermw               m1, m1, m4
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw               m12, m15      ; base_y++
    paddw                m0, m1
    sub                 r2d, dxd
    jge .w16_toponly
    mova                 m1, m5
    vpermt2w             m1, m12, m6
    mova                 m2, m7
    vpermt2w             m2, m12, m8
    vpmovw2m             k1, m10
    psubw                m2, m1
    pmulhrsw             m2, m13
    vpaddw           m0{k1}, m1, m2
.w16_toponly:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w16_end
    psubw               m10, m11      ; base_x -= dx
    lea                dstq, [dstq+strideq*2]
    cmp                 r2d, r3d
    jge .w16_loop
    paddw               m12, m15
    vpermt2w             m5, m12, m6
    mova                 m1, m7
    vpermt2w             m1, m12, m8
    jmp .w16_leftonly_loop_start
.w16_leftonly_loop:
    mova                 m1, m7
    vpermt2w             m1, m12, m8
    vshufi32x4           m5, m1, q1032
.w16_leftonly_loop_start:
    psubw                m0, m1, m5
    pmulhrsw             m0, m13
    paddw               m12, m15
    paddw                m0, m5
    mova                 m5, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_leftonly_loop
.w16_end:
    RET
.w32:
    mova                 m3, [tlq]
    vpermw               m8, m0, [tlq-64*2]
    mova                 m9, [base+pw_1to32]
    test             angled, 0x400
    jnz .w32_main
    pminud               m0, m9, [base+pw_31] {1to16}
    mov                 r3d, ~1
    kmovd                k1, r3d
    vpbroadcastd         m5, [base+pw_3]
    vpbroadcastd         m6, r6m     ; max_w
    vpermw               m2, m0, m4  ; +1
    movu                m13, [base+pw_0to31]
    paddw                m1, m4, m3
    vmovdqu16        m3{k1}, [tlq-2] ; -2
    packssdw             m6, m6
    paddw                m1, m2
    vpermw               m2, m0, m2  ; +2
    paddw                m3, m5
    vpcmpgtw             k1, m6, m13
    pavgw                m2, m3
    paddw                m1, m2
    psrlw            m4{k1}, m1, 2
    call .filter_left32
.w32_main:
    sub                 rsp, 64*2
    call .w32_main1
    add                 rsp, 64*2
    RET
.w32_main1:
    vpbroadcastw        m11, dxd
    movu           [rsp+64], m4
    vpbroadcastw         m4, dyd
    movd           [rsp+60], xm12
    valignq              m5, m7, m12, 6
    psllw                m3, m9, 6    ; xpos
    valignq              m6, m8, m7, 6
    pmullw               m9, m4       ; ypos
    palignr              m5, m7, m5, 14
    mov                 r2d, 33<<6
    palignr              m6, m8, m6, 14
    mova                m10, m3
.w32_main2:
    psllw               m13, m9, 9
    sub                 r2d, dxd
    psrlw               m12, m9, 6    ; base_y
    mov                 r8d, hd
    pand                m13, m14      ; frac_y << 9
.w32_loop:
    mov                 r3d, r2d
    shr                 r3d, 6
    psubw               m10, m11      ; base_x -= dx
    movu                 m0, [rsp+r3*2-2]
    pand                 m2, m10, m14 ; frac_x
    movu                 m1, [rsp+r3*2]
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw               m12, m15      ; base_y++
    paddw                m0, m1
    cmp                 r2d, 32<<6
    jge .w32_toponly
    mova                 m1, m5
    vpermt2w             m1, m12, m6
    mova                 m2, m7
    vpermt2w             m2, m12, m8
    vpmovw2m             k1, m10
    psubw                m2, m1
    pmulhrsw             m2, m13
    vpaddw           m0{k1}, m1, m2
.w32_toponly:
    mova             [dstq], m0
    dec                 r8d
    jz .w32_end
    add                dstq, strideq
    sub                 r2d, dxd
    jge .w32_loop
    paddw               m12, m15
    mova                 m2, m5
    vpermt2w             m2, m12, m6
.w32_leftonly_loop:
    mova                 m1, m7
    vpermt2w             m1, m12, m8
    psubw                m0, m1, m2
    pmulhrsw             m0, m13
    paddw               m12, m15
    paddw                m0, m2
    mova                 m2, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                 r8d
    jg .w32_leftonly_loop
.w32_end:
    ret
.w64:
    movu                 m3, [tlq+66]
    vpermw               m8, m0, [tlq-64*2]
    mova                 m9, [base+pw_1to32]
    test             angled, 0x400
    jnz .w64_main
    mova                 m2, [tlq]        ; -1
    mov                 r3d, ~1
    vpbroadcastd         m5, [base+pw_3]
    kmovd                k1, r3d
    movu                m13, [base+pw_0to31]
    vpbroadcastd         m6, r6m          ; max_w
    pminud               m0, m9, [base+pw_31] {1to16}
    paddw                m1, m4, m2
    vmovdqu16        m2{k1}, [tlq-2]      ; -2
    packssdw             m6, m6
    paddw                m1, [tlq+4]      ; +1
    paddw                m2, m5
    vpcmpgtw             k1, m6, m13
    pavgw                m2, [tlq+6]      ; +2
    paddw                m1, m2
    vpermw               m2, m0, m3       ; 32+1
    psrlw            m4{k1}, m1, 2
    paddw                m1, m3, [tlq+64] ; 32-1
    vpaddd              m11, m13, [base+pw_32] {1to16}
    paddw                m1, m2
    vpermw               m2, m0, m2       ; 32+2
    paddw               m10, m5, [tlq+62] ; 32-2
    vpcmpgtw             k1, m6, m11
    pavgw                m2, m10
    paddw                m1, m2
    psrlw            m3{k1}, m1, 2
    call .filter_left32
.w64_main:
    sub                 rsp, 64*3
    movu [rsp+64*2-gprsize], m3
    mov                  r5, dstq
    call .w32_main1
    psllw                m4, 5
    mov                 r2d, 65<<6
    vpaddd              m10, m3, [base+pw_2048] {1to16} ; xpos
    lea                dstq, [r5+64]
    paddw                m9, m4 ; ypos
    call .w32_main2
    add                 rsp, 64*3
    RET

cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea                  r7, [z_filter_t0]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    lea                  t0, [dr_intra_derivative+45*2-1]
    movsxd               wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
    sub              angled, 180
    mov                 dyd, angled
    neg                 dyd
    xor              angled, 0x400
    or                  dyq, ~0x7e
    mova                 m0, [base+pw_31to0]
    movzx               dyd, word [t0+dyq]
    lea                  wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
    movifnidn            hd, hm
    vpbroadcastd        m14, [base+pw_31806]
    vpbroadcastd        m15, [base+pw_1]
    jmp                  wq
.w4:
    lea                 r3d, [hq+3]
    xor                 r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw         m7, r3d
    pmaxuw               m7, m0
    vpermw               m6, m7, [tlq-64*1]
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    cmp              angleb, 40
    jae .w4_filter
    lea                 r3d, [angleq-1024]
    sar                 r3d, 7
    add                 r3d, hd
    jg .w4_filter ; h > 8 || (h == 8 && is_sm)
    call .upsample
    movsldup             m1, [base+z_ypos_mul]
    paddw                m1, m1
    jmp .w4_main2
.w4_filter:
    lea                 r3d, [hq+3]
    call .filter32
.w4_main:
    movsldup             m1, [base+z_ypos_mul]
.w4_main2:
    vpbroadcastq         m0, [base+pw_1to32]
    vpbroadcastw         m4, dyd
    lea                 r2d, [hq+4]
    shr                 r2d, 3
    pmullw               m4, m0      ; ypos
    vpbroadcastw         m0, r2d
    imul                 r2, strideq ; stride * imax(height / 8, 1)
    pmullw               m1, m0
    lea                  r3, [r2*3]
    paddd                m1, [base+pw_32736] {1to16}
    psrlw                m2, m4, 6
    psllw                m4, 9
    paddsw               m2, m1      ; base+0
    vpandd               m4, m14     ; frac << 9
    vpermw               m3, m2, m6  ; left[base+0]
.w4_loop:
    paddsw               m2, m15     ; base+1
    vpermw               m1, m2, m6  ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddw                m0, m3
    movq        [dstq+r2*0], xm0
    movhps      [dstq+r2*1], xm0
    vextracti32x4       xm3, ym0, 1
    movq        [dstq+r2*2], xm3
    movhps      [dstq+r3  ], xm3
    sub                  hd, 8
    jl .w4_end
    lea                  r5, [dstq+r2*4]
    vextracti32x8       ym0, m0, 1
    mova                 m3, m1
    movq          [r5+r2*0], xm0
    movhps        [r5+r2*1], xm0
    vextracti32x4       xm1, ym0, 1
    movq          [r5+r2*2], xm1
    movhps        [r5+r3  ], xm1
    add                dstq, strideq
    test                 hd, hd
    jnz .w4_loop
.w4_end:
    RET
.upsample:
    vinserti32x4         m6, [tlq-14], 3
    mova                 m3, [base+z_upsample]
    vpbroadcastd         m4, [base+pd_65536]
    add                 dyd, dyd
    vpermw               m0, m3, m6
    paddw                m3, m4
    vpermw               m1, m3, m6
    paddw                m3, m4
    vpermw               m2, m3, m6
    paddw                m3, m4
    vpermw               m3, m3, m6
    vpbroadcastw         m6, r9m     ; pixel_max
    paddw                m1, m2      ; b+c
    paddw                m0, m3      ; a+d
    psubw                m0, m1, m0
    psraw                m0, 3
    pxor                 m2, m2
    paddw                m0, m1
    pmaxsw               m0, m2
    pavgw                m0, m2
    pminsw               m6, m0
    ret
.w8:
    mova                 m6, [tlq-64*1]
    cmp                  hd, 32
    je .w8_h32
    mov                 r3d, 8
    cmp                  hd, 4
    cmove               r3d, hd
    lea                 r3d, [r3+hq-1]
    xor                 r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw         m1, r3d
    vpermw               m7, m1, m6
    pmaxuw               m1, m0
    vpermw               m6, m1, m6
    test             angled, 0x400
    jnz .w8_main
    lea                 r3d, [angleq+216]
    mov                 r3b, hb
    cmp                 r3d, 8
    ja .w8_filter ; is_sm || d >= 40 || h > 8
    call .upsample
    movshdup             m1, [base+z_ypos_mul]
    paddw                m1, m1
    call .w8_main_setup
.w8_upsample_loop:
    vpermw               m3, m2, m6  ; left[base+0]
    paddw                m2, m15     ; base+1
    vpermw               m1, m2, m6  ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddw                m2, m15     ; base+2
    paddw                m0, m3
    mova                 m3, m1
    mova          [dstq+r2*0], xm0
    vextracti32x4 [dstq+r2*1], ym0, 1
    vextracti32x4 [dstq+r2*2], m0, 2
    vextracti32x4 [dstq+r3  ], m0, 3
    add                dstq, strideq
    sub                  hd, 4
    jg .w8_upsample_loop
    RET
.w8_main_setup:
    vbroadcasti32x4      m0, [base+pw_1to32]
    vpbroadcastw         m4, dyd
    rorx                r2d, hd, 2
    pmullw               m4, m0      ; ypos
    vpbroadcastw         m0, r2d
    imul                 r2, strideq ; stride * height / 4
    lea                  r3, [r2*3]
    pmullw               m1, m0      ; 0 1 2 3
    paddd                m1, [base+pw_32704] {1to16}
    psrlw                m2, m4, 6
    psllw                m4, 9
    paddsw               m2, m1      ; base+0
    vpandd               m4, m14     ; frac << 9
    ret
.w8_h32:
    pmaxud               m7, m0, [base+pw_24] {1to16}
    vpermw               m6, m0, m6
    vpermw               m7, m7, [tlq-64*2]
    test             angled, 0x400
    jnz .w8_main
    call .filter64
    vpbroadcastd         m0, [base+pw_7]
    pminuw               m0, [base+pw_0to31]
    vpermw               m7, m0, m7
    jmp .w8_main
.w8_filter:
    lea                 r3d, [hq+7]
    call .filter32
.w8_main:
    movshdup             m1, [base+z_ypos_mul]
    call .w8_main_setup
    mova                 m3, m6
    vpermt2w             m3, m2, m7  ; left[base+0]
.w8_loop:
    paddsw               m2, m15     ; base+1
    mova                 m1, m6
    vpermt2w             m1, m2, m7  ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddw                m0, m3
    mova                 m3, m1
    mova          [dstq+r2*0], xm0
    vextracti32x4 [dstq+r2*1], ym0, 1
    vextracti32x4 [dstq+r2*2], m0, 2
    vextracti32x4 [dstq+r3  ], m0, 3
    add                dstq, strideq
    sub                  hd, 4
    jg .w8_loop
    RET
.filter32:
    vpbroadcastb       ym10, r3d
    vpbroadcastb        ym1, angled
    shr              angled, 8
    vpcmpeqb             k1, ym10, [base+z_filter_wh]
    mova                xm2, [base+z_filter_t0+angleq*8]
    vpcmpgtb         k1{k1}, ym1, ym2
    kmovd               r5d, k1
    test                r5d, r5d
    jz .filter32_end
    vpbroadcastw         m2, [tlq]
    popcnt              r5d, r5d
    vpbroadcastd         m5, [base+z_filter_k+(r5-1)*4+12*0]
    valignq              m2, m6, m2, 6
    vpbroadcastd         m8, [base+z_filter_k+(r5-1)*4+12*1]
    valignq              m4, m7, m6, 2
    vpbroadcastd         m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr              m1, m6, m2, 14
    pmullw               m5, m6
    palignr              m3, m4, m6, 2
    paddw                m1, m3
    palignr              m2, m6, m2, 12
    pmullw               m1, m8
    palignr              m4, m6, 4
    paddw                m2, m4
    pmullw               m2, m9
    pmovzxbw            m10, ym10
    pxor                 m6, m6
    paddw                m5, m1
    pminuw               m1, m10, [base+pw_0to31]
    paddw                m5, m2
    psrlw                m5, 3
    pavgw                m6, m5
    vpermw               m7, m10, m6
    vpermw               m6, m1, m6
.filter32_end:
    ret
.w16:
    mova                 m6, [tlq-64*1]
    cmp                  hd, 32
    jl .w16_h16
    pmaxud               m8, m0, [base+pw_16] {1to16}
    mova                 m7, [tlq-64*2]
    vpermw               m6, m0, m6
    jg .w16_h64
    vpermw               m7, m8, m7
    test             angled, 0x400
    jnz .w16_main
    call .filter64
    vpbroadcastd         m0, [base+pw_15]
    vinserti32x8         m0, [base+pw_0to31], 0
    vpermw               m7, m0, m7
    jmp .w16_main
.w16_h16:
    lea                 r3d, [hq*2-1]
    xor                 r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw         m1, r3d
    vpermw               m7, m1, m6
    pmaxuw               m1, m0
    vpermw               m6, m1, m6
    test             angled, 0x400
    jnz .w16_main
    lea                 r3d, [hq+15]
    call .filter32
.w16_main:
    vbroadcasti32x8      m0, [base+pw_1to32]
    vpbroadcastw         m4, dyd
    rorx                r2d, hd, 1
    pmullw               m4, m0      ; ypos
    vpbroadcastw        ym1, r2d
    imul                 r2, strideq ; stride * height / 2
    paddd                m1, [base+pw_32704] {1to16}
    lea                  r3, [r2+strideq]
    psrlw                m2, m4, 6
    psllw                m4, 9
    paddsw               m2, m1      ; base+0
    vpandd               m4, m14     ; frac << 9
    mova                 m3, m6
    vpermt2w             m3, m2, m7  ; left[base+0]
.w16_loop:
    paddsw               m1, m2, m15 ; base+1
    paddsw               m2, m1, m15 ; base+2
    vpermi2w             m1, m6, m7  ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddw                m0, m3
    mova                 m3, m6
    vpermt2w             m3, m2, m7  ; left[base+2]
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova          [dstq+r2       ], ym0
    psubw                m0, m3, m1
    pmulhrsw             m0, m4
    paddw                m0, m1
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+r3       ], ym0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w16_loop
    RET
.w16_h64:
    vpermw               m7, m0, m7
    vpermw               m8, m8, [tlq-64*3]
    test             angled, 0x400
    jnz .w16_h64_main
    valignq             m11, m8, m7, 6
    call .filter64
    vshufi32x4           m2, m8, m8, q3321
    vpbroadcastd         m0, [base+pw_15]
    palignr             ym3, ym8, ym11, 12
    vinserti32x8         m0, [base+pw_0to31], 0
    palignr             ym4, ym8, ym11, 14
    palignr             ym1, ym2, ym8, 4
    paddw               ym3, ym5
    palignr             ym2, ym8, 2
    paddw               ym8, ym4
    pavgw               ym3, ym1
    paddw               ym8, ym2
    paddw               ym8, ym3
    psrlw               ym8, 2
    vpermw               m8, m0, m8
.w16_h64_main:
    vbroadcasti32x8      m0, [base+pw_1to32]
    vpbroadcastw         m4, dyd
    pmullw               m4, m0    ; ypos
    vpbroadcastd        ym1, [base+pw_32]
    paddd                m1, [base+pw_32672] {1to16}
    mov                  r2, strideq
    shl                  r2, 5      ; stride*32
    vpbroadcastd         m9, [base+pw_32735]
    lea                  r3, [r2+strideq]
    psrlw                m2, m4, 6
    psllw                m4, 9
    paddsw               m2, m1     ; base+0
    vpandd               m4, m14    ; frac << 9
    mova                 m3, m7
    vpermt2w             m3, m2, m6
    vpcmpgtw             k1, m2, m9
    vpermw           m3{k1}, m2, m8 ; left[base+0]
.w16_h64_loop:
    paddsw               m2, m15    ; base+1
    mova                 m1, m7
    vpermt2w             m1, m2, m6
    vpcmpgtw             k1, m2, m9
    vpermw           m1{k1}, m2, m8 ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddsw               m2, m15    ; base+2
    paddw                m0, m3
    mova                 m3, m7
    vpermt2w             m3, m2, m6
    vpcmpgtw             k1, m2, m9
    vpermw           m3{k1}, m2, m8 ; left[base+2]
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova          [dstq+r2       ], ym0
    psubw                m0, m3, m1
    pmulhrsw             m0, m4
    paddw                m0, m1
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+r3       ], ym0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w16_h64_loop
    RET
.filter64:
    vpbroadcastw         m2, [tlq]
    vpbroadcastd         m5, [base+pw_3]
    valignq              m2, m6, m2, 6
    valignq              m4, m7, m6, 2
    valignq             m10, m7, m6, 6
    palignr              m1, m6, m2, 12
    palignr              m2, m6, m2, 14
    palignr              m3, m4, m6, 4
    paddw                m1, m5
    palignr              m4, m6, 2
    paddw                m6, m2
    valignq              m2, m8, m7, 2
    pavgw                m1, m3
    palignr              m3, m7, m10, 12
    paddw                m6, m4
    palignr              m4, m7, m10, 14
    paddw                m6, m1
    palignr              m1, m2, m7, 4
    psrlw                m6, 2
    palignr              m2, m7, 2
    paddw                m3, m5
    paddw                m7, m4
    pavgw                m3, m1
    paddw                m7, m2
    paddw                m7, m3
    psrlw                m7, 2
    ret
.w32:
    mova                 m6, [tlq-64*1]
    cmp                  hd, 32
    jl .w32_h16
    mova                 m8, [tlq-64*2]
    vpermw               m6, m0, m6
    vpermw               m7, m0, m8
    jg .w32_h64
    test             angled, 0x400
    jnz .w32_main
    vpbroadcastw        xm8, xm8
    jmp .w32_filter
.w32_h16:
    lea                 r3d, [hq*2-1]
    xor                 r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw         m1, r3d
    vpermw               m7, m1, m6
    pmaxuw               m1, m0
    vpermw               m6, m1, m6
    test             angled, 0x400
    jnz .w32_main
    vextracti32x4       xm8, m7, 3
.w32_filter:
    call .filter64
.w32_main:
    vpbroadcastw         m4, dyd
    vpbroadcastd         m1, [base+pw_32704]
    pmullw               m4, [base+pw_1to32] ; ypos
    psrlw                m2, m4, 6
    psllw                m4, 9
    paddsw               m2, m1      ; base+0
    vpandd               m4, m14     ; frac << 9
    mova                 m3, m6
    vpermt2w             m3, m2, m7  ; left[base+0]
.w32_loop:
    paddsw               m1, m2, m15 ; base+1
    paddsw               m2, m1, m15 ; base+2
    vpermi2w             m1, m6, m7  ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddw                m0, m3
    mova                 m3, m6
    vpermt2w             m3, m2, m7  ; left[base+2]
    mova   [dstq+strideq*0], m0
    psubw                m0, m3, m1
    pmulhrsw             m0, m4
    paddw                m0, m1
    mova   [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w32_h64:
    mova                 m9, [tlq-64*3]
    vpermw               m8, m0, m9
    test             angled, 0x400
    jnz .w32_h64_main
    vpbroadcastw        xm9, xm9
    call .filter96
.w32_h64_main:
    vpbroadcastw         m4, dyd
    vpbroadcastd         m1, [base+pw_32672]
    pmullw               m4, [base+pw_1to32] ; ypos
    vpbroadcastd         m9, [base+pw_32735]
    psrlw                m2, m4, 6
    psllw                m4, 9
    paddsw               m2, m1     ; base+0
    vpandd               m4, m14    ; frac << 9
    mova                 m3, m7
    vpermt2w             m3, m2, m6
    vpcmpgtw             k1, m2, m9
    vpermw           m3{k1}, m2, m8 ; left[base+0]
.w32_h64_loop:
    paddsw               m2, m15    ; base+1
    mova                 m1, m7
    vpermt2w             m1, m2, m6
    vpcmpgtw             k1, m2, m9
    vpermw           m1{k1}, m2, m8 ; left[base+1]
    psubw                m0, m1, m3
    pmulhrsw             m0, m4
    paddsw               m2, m15    ; base+2
    paddw                m0, m3
    mova                 m3, m7
    vpermt2w             m3, m2, m6
    vpcmpgtw             k1, m2, m9
    vpermw           m3{k1}, m2, m8 ; left[base+2]
    mova   [dstq+strideq*0], m0
    psubw                m0, m3, m1
    pmulhrsw             m0, m4
    paddw                m0, m1
    mova   [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_h64_loop
    RET
.filter96:
    valignq             m11, m8, m7, 6
    call .filter64
    valignq              m2, m9, m8, 2
    palignr              m3, m8, m11, 12
    palignr              m4, m8, m11, 14
    palignr              m1, m2, m8, 4
    paddw                m3, m5
    palignr              m2, m8, 2
    paddw                m8, m4
    pavgw                m3, m1
    paddw                m8, m2
    paddw                m8, m3
    psrlw                m8, 2
    ret
.w64:
    mova                 m7, [tlq-64*1]
    vpermw               m6, m0, m7
    cmp                  hd, 32
    jl .w64_h16
    mova                 m8, [tlq-64*2]
    vpermw               m7, m0, m8
    jg .w64_h64
    test             angled, 0x400
    jnz .w64_main
    vpbroadcastw         m8, xm8
    mova                 m9, m8
    call .filter96
    vshufi32x4           m9, m8, m8, q3333
    jmp .w64_h64_main
.w64_h16:
    vpbroadcastw         m7, xm7
    test             angled, 0x400
    jnz .w64_main
    mova                 m8, m7
    call .filter64
.w64_main:
    vpbroadcastw        m11, dyd
    vpbroadcastd         m1, [base+pw_32704]
    pmullw              m10, m11, [base+pw_1to32] ; ypos
    psllw               m11, 5
    psrlw                m8, m10, 6
    paddw               m11, m10
    psllw               m10, 9
    psrlw                m9, m11, 6
    psllw               m11, 9
    psubw                m9, m8
    paddsw               m8, m1     ; base+0
    vpandd              m10, m14    ; frac << 9
    vpandd              m11, m14    ; frac << 9
    mova                 m4, m6
    vpermt2w             m4, m8, m7 ; left[base+0] ( 0..31)
    paddsw               m5, m8, m9
    vpermi2w             m5, m6, m7 ; left[base+0] (32..63)
.w64_loop:
    paddsw               m8, m15    ; base+1      ( 0..31)
    mova                 m2, m6
    vpermt2w             m2, m8, m7 ; left[base+1] ( 0..31)
    paddsw               m3, m8, m9 ; base+1      (32..63)
    vpermi2w             m3, m6, m7 ; left[base+1] (32..63)
    psubw                m0, m2, m4
    psubw                m1, m3, m5
    pmulhrsw             m0, m10
    pmulhrsw             m1, m11
    paddw                m0, m4
    paddw                m1, m5
    mova                 m4, m2
    mova        [dstq+64*0], m0
    mova                 m5, m3
    mova        [dstq+64*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
.w64_h64:
    vpermw               m8, m0, [tlq-64*3]
    mova                m13, [tlq-64*4]
    vpermw               m9, m0, m13
    test             angled, 0x400
    jnz .w64_h64_main
    valignq             m12, m9, m8, 6
    call .filter96
    vpbroadcastw        xm2, xm13
    valignq              m2, m9, 2
    palignr              m3, m9, m12, 12
    palignr              m4, m9, m12, 14
    palignr              m1, m2, m9, 4
    paddw                m3, m5
    palignr              m2, m9, 2
    paddw                m9, m4
    pavgw                m3, m1
    paddw                m9, m2
    paddw                m9, m3
    psrlw                m9, 2
.w64_h64_main:
    vpbroadcastw        m11, dyd
    vpbroadcastd         m1, [base+pw_32640]
    pmullw              m10, m11, [base+pw_1to32] ; ypos
    psllw               m11, 5
    psrlw               m12, m10, 6
    paddw               m11, m10
    psllw               m10, 9
    psrlw               m13, m11, 6
    psllw               m11, 9
    psubw               m13, m12
    paddsw              m12, m1     ; base+0
    vpandd              m10, m14    ; frac << 9
    vpandd              m11, m14    ; frac << 9
    vpbroadcastd        m14, [base+pw_64]
    mova                 m4, m6
    vpermt2w             m4, m12, m7
    vptestmw             k1, m12, m14
    mova                 m0, m8
    vpermt2w             m0, m12, m9
    paddsw               m1, m12, m13
    mova                 m5, m6
    vpermt2w             m5, m1, m7
    vptestmw             k2, m1, m14
    vpermi2w             m1, m8, m9
    vmovdqu16        m4{k1}, m0     ; left[base+0] ( 0..31)
    vmovdqu16        m5{k2}, m1     ; left[base+0] (32..63)
.w64_h64_loop:
    paddsw              m12, m15    ; base+1
    mova                 m2, m6
    vpermt2w             m2, m12, m7
    vptestmw             k1, m12, m14
    mova                 m0, m8
    vpermt2w             m0, m12, m9
    paddsw               m1, m12, m13
    mova                 m3, m6
    vpermt2w             m3, m1, m7
    vptestmw             k2, m1, m14
    vpermi2w             m1, m8, m9
    vmovdqu16        m2{k1}, m0     ; left[base+1] ( 0..31)
    vmovdqu16        m3{k2}, m1     ; left[base+1] (32..63)
    psubw                m0, m2, m4
    psubw                m1, m3, m5
    pmulhrsw             m0, m10
    pmulhrsw             m1, m11
    paddw                m0, m4
    paddw                m1, m5
    mova                 m4, m2
    mova        [dstq+64*0], m0
    mova                 m5, m3
    mova        [dstq+64*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w64_h64_loop
    RET

cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
    lea                  r6, [pal_pred_16bpc_avx512icl_table]
    tzcnt                wd, wm
    mova                 m3, [pal_pred_perm]
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]
    vpbroadcastq         m4, [pal_unpack+0]
    vpbroadcastq         m5, [pal_unpack+8]
    add                  wq, r6
    vbroadcasti32x4      m6, [palq]
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    pmovzxbd            ym0, [idxq]
    add                idxq, 8
    vpmultishiftqb      ym0, ym4, ym0
    vpermw              ym0, ym0, ym6
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    pmovzxbd             m0, [idxq]
    add                idxq, 16
    vpmultishiftqb       m0, m4, m0
    vpermw               m0, m0, m6
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    movu                ym1, [idxq]
    add                idxq, 32
    vpermb               m1, m3, m1
    vpmultishiftqb       m1, m4, m1
    vpermw               m0, m1, m6
    psrlw                m1, 8
    vpermw               m1, m1, m6
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32:
    vpermb               m2, m3, [idxq]
    add                idxq, 64
    vpmultishiftqb       m1, m4, m2
    vpmultishiftqb       m2, m5, m2
    vpermw               m0, m1, m6
    psrlw                m1, 8
    vpermw               m1, m1, m6
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    vpermw               m0, m2, m6
    psrlw                m2, 8
    vpermw               m1, m2, m6
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32
    RET
.w64:
    vpermb               m2, m3, [idxq]
    add                idxq, 64
    vpmultishiftqb       m1, m4, m2
    vpmultishiftqb       m2, m5, m2
    vpermw               m0, m1, m6
    psrlw                m1, 8
    vpermw               m1, m1, m6
    mova          [dstq+ 0], m0
    mova          [dstq+64], m1
    vpermw               m0, m2, m6
    psrlw                m2, 8
    vpermw               m1, m2, m6
    mova  [dstq+strideq+ 0], m0
    mova  [dstq+strideq+64], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET

; The ipred_filter SIMD processes 4x2 blocks in the following order, which
; increases parallelism compared to doing things row by row.
;     w4     w8       w16             w32
;     1     1 2     1 2 5 6     1 2 5 6 9 a d e
;     2     2 3     2 3 6 7     2 3 6 7 a b e f
;     3     3 4     3 4 7 8     3 4 7 8 b c f g
;     4     4 5     4 5 8 9     4 5 8 9 c d g h
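;
; As a rough scalar sketch of the same traversal (illustrative C inside
; comments only; process_4x2() is a hypothetical helper, not part of this
; file): within each 8-pixel-wide column strip, 4x2 blocks are visited in
; anti-diagonal waves, so the pair of blocks handled together already has
; its top and left neighbors computed.
;
;     int bw = w / 4, bh = h / 2;                   // grid of 4x2 blocks
;     for (int strip = 0; strip < bw; strip += 2)   // 8-pixel-wide strips
;         for (int step = 0; step <= bh; step++)    // one anti-diagonal per step
;             for (int i = 0; i < 2 && strip + i < bw; i++) {
;                 int r = step - i;                 // block rows where r + i == step
;                 if (r >= 0 && r < bh)
;                     process_4x2(r, strip + i);
;             }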

cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
%define base r6-$$
    lea                  r6, [$$]
%ifidn filterd, filterm
    movzx           filterd, filterb
%else
    movzx           filterd, byte filterm
%endif
    shl             filterd, 6
    movifnidn            hd, hm
    movu                xm0, [tlq-6]
    pmovsxbw             m7, [base+filter_intra_taps+filterq+32*0]
    pmovsxbw             m8, [base+filter_intra_taps+filterq+32*1]
    mov                 r5d, r8m ; bitdepth_max
    movsldup             m9, [base+filter_permA]
    movshdup            m10, [base+filter_permA]
    shr                 r5d, 11  ; is_12bpc
    jnz .12bpc
    psllw                m7, 2   ; upshift multipliers so that packusdw
    psllw                m8, 2   ; will perform clipping for free
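    ; worked example of the free clip (the filter_shift values are assumed
    ; from context): packusdw saturates each dword sum to [0, 65535] and
    ; clamps negative sums to 0, so the per-bitdepth down-shift that
    ; follows can never exceed pixel_max: 65535 >> 4 == 4095 for 12-bit,
    ; and with the 10-bit taps (hence sums) pre-scaled by 4 above,
    ; 65535 >> 6 == 1023.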
.12bpc:
    vpbroadcastd         m5, [base+filter_rnd+r5*8]
    vpbroadcastd         m6, [base+filter_shift+r5*8]
    sub                  wd, 8
    jl .w4
.w8:
    call .main4
    movsldup            m11, [filter_permB]
    lea                 r5d, [hq*2+2]
    movshdup            m12, [filter_permB]
    lea                topq, [tlq+2]
    mova                m13, [filter_permC]
    sub                  hd, 4
    vinserti32x4        ym0, [topq], 1 ; a0 b0   t0 t1
    sub                 tlq, r5
%if WIN64
    push                 r7
    push                 r8
%endif
    mov                  r7, dstq
    mov                 r8d, hd
.w8_loop:
    movlps              xm4, xm0, [tlq+hq*2]
    call .main8
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jge .w8_loop
    test                 wd, wd
    jz .end
    mov                 r2d, 0x0d
    kmovb                k1, r2d
    lea                  r2, [strideq*3]
.w16:
    movd               xmm0, [r7+strideq*1+12]
    vpblendd           xmm0, [topq+8], 0x0e ; t1 t2
    pinsrw              xm4, xmm0, [r7+strideq*0+14], 2
    call .main8
    add                  r7, 16
    vinserti32x4        ym0, [topq+16], 1   ; a2 b2   t2 t3
    mov                  hd, r8d
    mov                dstq, r7
    add                topq, 16
.w16_loop:
    movd               xmm1, [dstq+strideq*2-4]
    punpcklwd           xm4, xmm1, xmm0
    movd               xmm0, [dstq+r2-4]
    shufps          xm4{k1}, xmm0, xm0, q3210
    call .main8
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jge .w16_loop
    sub                  wd, 8
    jg .w16
.end:
    vpermb               m2, m11, m0
    mova                ym1, ym5
    vpdpwssd             m1, m2, m7
    vpermb               m2, m12, m0
    vpdpwssd             m1, m2, m8
%if WIN64
    pop                  r8
    pop                  r7
%endif
    vextracti32x8       ym2, m1, 1
    paddd               ym1, ym2
    packusdw            ym1, ym1
    vpsrlvw             ym1, ym6
    vpermt2q             m0, m13, m1
    vextracti32x4 [dstq+strideq*0], m0, 2
    vextracti32x4 [dstq+strideq*1], ym0, 1
    RET
.w4_loop:
    movlps              xm0, [tlq-10]
    lea                dstq, [dstq+strideq*2]
    sub                 tlq, 4
.w4:
    call .main4
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    sub                  hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.main4:
    vpermb               m2, m9, m0
    mova                ym1, ym5
    vpdpwssd             m1, m2, m7
    vpermb               m0, m10, m0
    vpdpwssd             m1, m0, m8
    vextracti32x8       ym0, m1, 1
    paddd               ym0, ym1
    vextracti32x4       xm1, ym0, 1
    packusdw            xm0, xm1     ; clip
    vpsrlvw             xm0, xm6
    ret
ALIGN function_align
.main8:
    vpermb               m3, m11, m0
    mova                ym2, ym5
    vpdpwssd             m2, m3, m7
    vpermb               m3, m9, m4
    mova                ym1, ym5
    vpdpwssd             m1, m3, m7
    vpermb               m3, m12, m0
    vpdpwssd             m2, m3, m8
    vpermb               m3, m10, m4
    vpdpwssd             m1, m3, m8
    vextracti32x8       ym4, m2, 1
    vextracti32x8       ym3, m1, 1
    paddd               ym2, ym4
    paddd               ym1, ym3
    packusdw            ym1, ym2     ; clip
    vpsrlvw             ym1, ym6
    vpermt2q             m0, m13, m1 ; c0 d0   b0 b1   a0 a1
    vextracti32x4 [dstq+strideq*0], m0, 2
    vextracti32x4 [dstq+strideq*1], ym0, 1
    ret

%endif