; xref: /aosp_15_r20/external/libdav1d/src/x86/mc16_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
SECTION_RODATA 64

; dav1d_obmc_masks[] * -512
const obmc_masks_avx2
            dw      0,      0,  -9728,      0, -12800,  -7168,  -2560,      0
            dw -14336, -11264,  -8192,  -5632,  -3584,  -1536,      0,      0
            dw -15360, -13824, -12288, -10752,  -9216,  -7680,  -6144,  -5120
            dw  -4096,  -3072,  -2048,  -1536,      0,      0,      0,      0
            dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
            dw  -9728,  -8704,  -8192,  -7168,  -6656,  -6144,  -5632,  -4608
            dw  -4096,  -3584,  -3072,  -2560,  -2048,  -2048,  -1536,  -1024
            dw      0,      0,      0,      0,      0,      0,      0,      0

; pshufb/vpermd control vectors for the subpel and resize paths
deint_shuf:     dd 0,  4,  1,  5,  2,  6,  3,  7
subpel_h_shufA: db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
subpel_h_shufB: db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
subpel_h_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
subpel_s_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
rescale_mul:    dd 0,  1,  2,  3,  4,  5,  6,  7
rescale_mul2:   dd 0,  1,  4,  5,  2,  3,  6,  7
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
wswap:          db 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

; Bitdepth-dependent constant pairs/tables, indexed by bitdepth_max >> 11
; (0 for 10-bit, 1 for 12-bit; see the `shr r6d, 11` lookups in the code).
prep_mul:         dw 16, 16, 4, 4        ; [0]=10bpc scale, [1]=12bpc scale
put_bilin_h_rnd:  dw 8, 8, 10, 10
put_8tap_h_rnd:   dd 34, 40
s_8tap_h_rnd:     dd 2, 8
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_rnd: dd 512, 128
put_s_8tap_v_sh:  dd 10, 8
prep_8tap_1d_rnd: dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: dd    32 - (8192 <<  5)
warp8x8t_rnd:     dd 16384 - (8192 << 15)
warp8x8_shift:    dd  5,  3
warp8x8_rnd:      dw   4096,   4096,  16384,  16384
bidir_rnd:        dw -16400, -16400, -16388, -16388
bidir_mul:        dw   2048,   2048,   8192,   8192

; Aliases: the first entries of these tables double as plain constants.
%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

pw_2:          times 2 dw 2
pw_64:         times 2 dw 64
pw_2048:       times 2 dw 2048
pw_8192:       times 2 dw 8192
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
pw_m512:       times 2 dw -512
pd_32:         dd 32
pd_63:         dd 63
pd_64:         dd 64
pd_32768:      dd 32768
pd_65538:      dd 65538
pd_m524256:    dd -524256 ; -8192 << 6 + 32
pd_0x3ff:      dd 0x3ff
pq_0x40000000: dq 0x40000000
               dd 0
95
; BIDIR_JMP_TABLE name, isa, w0, w1, ...
; Emits a jump table of 32-bit offsets to the .w<N> entry points of the
; mangled 16bpc function <name>_<isa>.  The exported symbol
; <name>_<isa>_table is biased by 2*<w0> so that lookups indexed by the
; (tzcnt'd) width land on the first entry without an extra subtraction.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)    ; bias base by 2 * first width
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2                              ; one entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
106
; Width-dispatch tables for the bidirectional averaging/masking functions.
BIDIR_JMP_TABLE avg,        avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx2,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx2, 2, 4, 8, 16, 32, 64, 128
116
; BASE_JMP_TABLE name, isa, w0, w1, ...
; Like BIDIR_JMP_TABLE but emits 16-bit offsets relative to the label
; <name>_<isa> (defined below via %xdefine), covering the ._w<N> copy
; entry points.  The table symbol is biased by the first width.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)      ; bias base by first width
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2                              ; one entry per width argument
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
126
; Base labels the BASE_JMP_TABLE offsets are relative to: the .put/.prep
; dispatch points inside the bilin functions below.
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128
132
; HV_JMP_TABLE name, filter, isa, types, w0, w1, ...
; Emits up to three jump tables (16-bit offsets relative to <name>_<isa>)
; for the .h_w<N>, .v_w<N> and .hv_w<N> entry points of
; <name>_<filter>_16bpc_<isa>.  %4 is a bitmask selecting which tables to
; emit: bit 0 = h, bit 1 = v, bit 2 = hv.  Each %rep rotates through the
; width arguments; the trailing "%rotate 4" puts the fixed arguments back
; in front so the next table sees the widths again.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1                          ; horizontal table
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2                          ; vertical table
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4                          ; 2-D (hv) table
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
164
; 7 = h|v|hv tables for both bilin functions.
HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
167
; SCALED_JMP_TABLE name, isa, w0, w1, ...
; Emits three width-dispatch tables for the scaled MC functions:
;   <name>_<isa>_table      -> .w<N>     (generic dy)
;   <name>_<isa>_dy1_table  -> .dy1_w<N> (dy == 1024, i.e. 1:1 vertical step)
;   <name>_<isa>_dy2_table  -> .dy2_w<N> (dy == 2048, i.e. 2:1 vertical step)
; All entries are 16-bit offsets relative to the mangled function symbol;
; each table base is biased by the first width.  The %rotate 2 after each
; %rep restores the original argument order for the next table.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
 %%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
 %%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro
191
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,   4, 8, 16, 32, 64, 128

; Byte offset of a function's jump table from its base dispatch label.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
; -8 bias: filter ids start at 1, each entry is 8 bytes.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text
204
INIT_XMM avx2
;-----------------------------------------------------------------------
; void put_bilin_16bpc(pixel *dst, ptrdiff_t dst_stride,
;                      const pixel *src, ptrdiff_t src_stride,
;                      int w, int h, int mx, int my, int bitdepth_max)
; Copy or bilinearly filter a w x h block of 16-bit pixels.
; mx (r6m) and my (r7m) select the path: both zero -> plain copy (.put),
; otherwise .h, .v or .hv.  Each path dispatches on log2(w) through its
; HV_JMP_TABLE/BASE_JMP_TABLE relative to put_avx2 (held in r7).
;-----------------------------------------------------------------------
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    mov                mxyd, r6m ; mx
    lea                  r7, [put_avx2]
%if UNIX64
    DECLARE_REG_TMP 8
    %define org_w r8d                ; keep the pre-tzcnt width in r8d
    mov                 r8d, wd
%else
    DECLARE_REG_TMP 7
    %define org_w wm                 ; Win64: reload the width from the stack
%endif
    tzcnt                wd, wm      ; wd = log2(w), used as table index
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    ; mx == my == 0: straight copy, dispatch on width.
    movzx                wd, word [r7+wq*2+table_offset(put,)]
    add                  wq, r7
    jmp                  wq
.put_w2:
    mov                 r6d, [srcq+ssq*0]
    mov                 r7d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6d
    mov        [dstq+dsq*1], r7d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    mov                  r6, [srcq+ssq*0]
    mov                  r7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6
    mov        [dstq+dsq*1], r7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
INIT_YMM avx2
.put_w16:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    movu                 m0, [srcq+ssq*0+32*0]
    movu                 m1, [srcq+ssq*0+32*1]
    movu                 m2, [srcq+ssq*1+32*0]
    movu                 m3, [srcq+ssq*1+32*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+32*0], m0
    mova  [dstq+dsq*0+32*1], m1
    mova  [dstq+dsq*1+32*0], m2
    mova  [dstq+dsq*1+32*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
    movu                 m2, [srcq+32*2]
    movu                 m3, [srcq+32*3]
    add                srcq, ssq
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
    movu                 m2, [srcq+32*2]
    movu                 m3, [srcq+32*3]
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    movu                 m0, [srcq+32*4]
    movu                 m1, [srcq+32*5]
    movu                 m2, [srcq+32*6]
    movu                 m3, [srcq+32*7]
    add                srcq, ssq
    mova        [dstq+32*4], m0
    mova        [dstq+32*5], m1
    mova        [dstq+32*6], m2
    mova        [dstq+32*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128
    RET
.h:
    ; Horizontal filter: dst = ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4
    ; m4 = 16-mx, m5 = mx (broadcast words).
    movd                xm5, mxyd
    mov                mxyd, r7m ; my
    vpbroadcastd         m4, [pw_16]
    vpbroadcastw         m5, xm5
    psubw                m4, m5
    test               mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    mov                 r6d, r8m ; bitdepth_max
    add                  wq, r7
    shr                 r6d, 11  ; 0 for 10-bit, 1 for 12-bit
    vpbroadcastd         m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
    jmp                  wq
.h_w2:
    movq                xm1, [srcq+ssq*0]
    movhps              xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmullw              xm0, xm4, xm1
    psrlq               xm1, 16          ; shift in the x+1 neighbors
    pmullw              xm1, xm5
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 4
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    movq                xm1, [srcq+ssq*0+2]  ; x+1 neighbors
    movhps              xm1, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw              xm0, xm4
    pmullw              xm1, xm5
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 4
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                xm0, [srcq+ssq*0]
    vinserti128          m0, [srcq+ssq*1], 1
    movu                xm1, [srcq+ssq*0+2]
    vinserti128          m1, [srcq+ssq*1+2], 1
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    pmullw               m0, m4, [srcq+ssq*0]
    pmullw               m1, m5, [srcq+ssq*0+2]
    paddw                m0, m3
    paddw                m0, m1
    pmullw               m1, m4, [srcq+ssq*1]
    pmullw               m2, m5, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw               m0, m4, [srcq+32*0]
    pmullw               m1, m5, [srcq+32*0+2]
    paddw                m0, m3
    paddw                m0, m1
    pmullw               m1, m4, [srcq+32*1]
    pmullw               m2, m5, [srcq+32*1+2]
    add                srcq, ssq
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w32
    RET
.h_w64:
.h_w128:
    ; Wide rows: walk each row right-to-left, 32 pixels (2x32 bytes)
    ; per iteration, with r6 = remaining pixel count for this row.
    movifnidn           t0d, org_w
.h_w64_loop0:
    mov                 r6d, t0d
.h_w64_loop:
    pmullw               m0, m4, [srcq+r6*2-32*1]
    pmullw               m1, m5, [srcq+r6*2-32*1+2]
    paddw                m0, m3
    paddw                m0, m1
    pmullw               m1, m4, [srcq+r6*2-32*2]
    pmullw               m2, m5, [srcq+r6*2-32*2+2]
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova   [dstq+r6*2-32*1], m0
    mova   [dstq+r6*2-32*2], m1
    sub                 r6d, 32
    jg .h_w64_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w64_loop0
    RET
.v:
    ; Vertical filter: dst = a + ((b-a)*my + 8 >> 4), computed as
    ; pmulhrsw(b-a, my << 11) + a, since (x*(my<<11) + 0x4000) >> 15
    ; == (x*my + 8) >> 4.
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    shl                mxyd, 11
    movd                xm5, mxyd
    add                  wq, r7
    vpbroadcastw         m5, xm5
    jmp                  wq
.v_w2:
    movd                xm0, [srcq+ssq*0]
.v_w2_loop:
    movd                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpckldq           xm2, xm0, xm1    ; rows 0,1
    movd                xm0, [srcq+ssq*0]
    punpckldq           xm1, xm0         ; rows 1,2
    psubw               xm1, xm2
    pmulhrsw            xm1, xm5
    paddw               xm1, xm2
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq                xm0, [srcq+ssq*0]
.v_w4_loop:
    movq                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq          xm2, xm0, xm1    ; rows 0,1
    movq                xm0, [srcq+ssq*0]
    punpcklqdq          xm1, xm0         ; rows 1,2
    psubw               xm1, xm2
    pmulhrsw            xm1, xm5
    paddw               xm1, xm2
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu                xm0, [srcq+ssq*0]
.v_w8_loop:
    vbroadcasti128       m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd             m2, m0, m1, 0xf0 ; rows 0,1
    vbroadcasti128       m0, [srcq+ssq*0]
    vpblendd             m1, m0, 0xf0     ; rows 1,2
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w32:
    movu                 m0, [srcq+ssq*0+32*0]
    movu                 m1, [srcq+ssq*0+32*1]
.v_w32_loop:
    movu                 m2, [srcq+ssq*1+32*0]
    movu                 m3, [srcq+ssq*1+32*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m4, m2, m0
    pmulhrsw             m4, m5
    paddw                m4, m0
    movu                 m0, [srcq+ssq*0+32*0]
    mova  [dstq+dsq*0+32*0], m4
    psubw                m4, m3, m1
    pmulhrsw             m4, m5
    paddw                m4, m1
    movu                 m1, [srcq+ssq*0+32*1]
    mova  [dstq+dsq*0+32*1], m4
    psubw                m4, m0, m2
    pmulhrsw             m4, m5
    paddw                m4, m2
    mova  [dstq+dsq*1+32*0], m4
    psubw                m4, m1, m3
    pmulhrsw             m4, m5
    paddw                m4, m3
    mova  [dstq+dsq*1+32*1], m4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    RET
.v_w16:
.v_w64:
.v_w128:
    ; Process in 16-pixel-wide columns; r6d packs the remaining column
    ; count (minus one) in its upper bits and h in its low byte.
    movifnidn           t0d, org_w
    add                 t0d, t0d            ; t0d = 2*w (bytes per row)
    mov                  r4, srcq
    lea                 r6d, [hq+t0*8-256]  ; (w/16-1)<<8 | h
    mov                  r7, dstq
.v_w16_loop0:
    movu                 m0, [srcq+ssq*0]
.v_w16_loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m1, m3, m0
    pmulhrsw             m1, m5
    paddw                m1, m0
    movu                 m0, [srcq+ssq*0]
    psubw                m2, m0, m3
    pmulhrsw             m2, m5
    paddw                m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    add                  r4, 32             ; next 16-pixel column
    add                  r7, 32
    movzx                hd, r6b            ; reload h from low byte
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8           ; one fewer column remaining
    jg .v_w16_loop0
    RET
.hv:
    ; 2-D filter: horizontal pass to >>2 precision, then vertical
    ; interpolation between successive filtered rows, with the final
    ; rounding done by pmulhrsw against m7 (pw_2048 for 10-bit,
    ; pw_8192 for 12-bit).  m3 = pw_2 is the horizontal rounding bias,
    ; m6 = my << 11.  m4/m5 still hold 16-mx / mx from .h.
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM       8
    shl                mxyd, 11
    vpbroadcastd         m3, [pw_2]
    movd                xm6, mxyd
    vpbroadcastd         m7, [pw_8192]
    add                  wq, r7
    vpbroadcastw         m6, xm6
    test          dword r8m, 0x800          ; 12-bit bitdepth_max?
    jnz .hv_12bpc
    psllw                m4, 2               ; 10-bit: scale taps up so the
    psllw                m5, 2               ; intermediate uses full range
    vpbroadcastd         m7, [pw_2048]
.hv_12bpc:
    jmp                  wq
.hv_w2:
    vpbroadcastq        xm1, [srcq+ssq*0]
    pmullw              xm0, xm4, xm1
    psrlq               xm1, 16
    pmullw              xm1, xm5
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 2               ; first filtered row
.hv_w2_loop:
    movq                xm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps              xm2, [srcq+ssq*0]
    pmullw              xm1, xm4, xm2
    psrlq               xm2, 16
    pmullw              xm2, xm5
    paddw               xm1, xm3
    paddw               xm1, xm2
    psrlw               xm1, 2              ; 1 _ 2 _
    shufpd              xm2, xm0, xm1, 0x01 ; 0 _ 1 _
    mova                xm0, xm1
    psubw               xm1, xm2
    paddw               xm1, xm1            ; 2*(b-a), for pmulhw >>16
    pmulhw              xm1, xm6            ; (b-a)*my >> 4
    paddw               xm1, xm2
    pmulhrsw            xm1, xm7            ; final round/shift
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    pmullw              xm0, xm4, [srcq+ssq*0-8] ; row 0 in the high half
    pmullw              xm1, xm5, [srcq+ssq*0-6]
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 2
.hv_w4_loop:
    movq                xm1, [srcq+ssq*1]
    movq                xm2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    movhps              xm1, [srcq+ssq*0]
    movhps              xm2, [srcq+ssq*0+2]
    pmullw              xm1, xm4
    pmullw              xm2, xm5
    paddw               xm1, xm3
    paddw               xm1, xm2
    psrlw               xm1, 2              ; 1 2
    shufpd              xm2, xm0, xm1, 0x01 ; 0 1
    mova                xm0, xm1
    psubw               xm1, xm2
    paddw               xm1, xm1
    pmulhw              xm1, xm6
    paddw               xm1, xm2
    pmulhrsw            xm1, xm7
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw              xm0, xm4, [srcq+ssq*0]
    pmullw              xm1, xm5, [srcq+ssq*0+2]
    paddw               xm0, xm3
    paddw               xm0, xm1
    psrlw               xm0, 2
    vinserti128          m0, xm0, 1          ; duplicate row 0 into both lanes
.hv_w8_loop:
    movu                xm1, [srcq+ssq*1]
    movu                xm2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    vinserti128          m1, [srcq+ssq*0], 1
    vinserti128          m2, [srcq+ssq*0+2], 1
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 2
    vperm2i128           m2, m0, m1, 0x21 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
.hv_w32:
.hv_w64:
.hv_w128:
    ; 16-pixel-wide columns; r6d packs column count in its upper bits
    ; and h in its low byte (same scheme as .v_w16).
%if UNIX64
    lea                 r6d, [r8*2-32]
%else
    mov                 r6d, wm
    lea                 r6d, [r6*2-32]
%endif
    mov                  r4, srcq
    lea                 r6d, [hq+r6*8]
    mov                  r7, dstq
.hv_w16_loop0:
    pmullw               m0, m4, [srcq+ssq*0]
    pmullw               m1, m5, [srcq+ssq*0+2]
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2               ; filtered row 0
.hv_w16_loop:
    pmullw               m1, m4, [srcq+ssq*1]
    pmullw               m2, m5, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2
    psubw                m2, m1, m0
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m0
    pmulhrsw             m2, m7
    mova       [dstq+dsq*0], m2
    pmullw               m0, m4, [srcq+ssq*0]
    pmullw               m2, m5, [srcq+ssq*0+2]
    paddw                m0, m3
    paddw                m0, m2
    psrlw                m0, 2
    psubw                m2, m0, m1
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m1
    pmulhrsw             m2, m7
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                  r4, 32
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
    RET
727
728cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
729    movifnidn          mxyd, r5m ; mx
730    lea                  r6, [prep_avx2]
731%if UNIX64
732    DECLARE_REG_TMP 7
733    %define org_w r7d
734%else
735    DECLARE_REG_TMP 6
736    %define org_w r5m
737%endif
738    mov               org_w, wd
739    tzcnt                wd, wm
740    movifnidn            hd, hm
741    test               mxyd, mxyd
742    jnz .h
743    mov                mxyd, r6m ; my
744    test               mxyd, mxyd
745    jnz .v
746.prep:
747    movzx                wd, word [r6+wq*2+table_offset(prep,)]
748    mov                 r5d, r7m ; bitdepth_max
749    vpbroadcastd         m5, [r6-prep_avx2+pw_8192]
750    add                  wq, r6
751    shr                 r5d, 11
752    vpbroadcastd         m4, [r6-prep_avx2+prep_mul+r5*4]
753    lea            stride3q, [strideq*3]
754    jmp                  wq
755.prep_w4:
756    movq                xm0, [srcq+strideq*0]
757    movhps              xm0, [srcq+strideq*1]
758    vpbroadcastq         m1, [srcq+strideq*2]
759    vpbroadcastq         m2, [srcq+stride3q ]
760    lea                srcq, [srcq+strideq*4]
761    vpblendd             m0, m1, 0x30
762    vpblendd             m0, m2, 0xc0
763    pmullw               m0, m4
764    psubw                m0, m5
765    mova             [tmpq], m0
766    add                tmpq, 32
767    sub                  hd, 4
768    jg .prep_w4
769    RET
770.prep_w8:
771    movu                xm0, [srcq+strideq*0]
772    vinserti128          m0, [srcq+strideq*1], 1
773    movu                xm1, [srcq+strideq*2]
774    vinserti128          m1, [srcq+stride3q ], 1
775    lea                srcq, [srcq+strideq*4]
776    pmullw               m0, m4
777    pmullw               m1, m4
778    psubw                m0, m5
779    psubw                m1, m5
780    mova        [tmpq+32*0], m0
781    mova        [tmpq+32*1], m1
782    add                tmpq, 32*2
783    sub                  hd, 4
784    jg .prep_w8
785    RET
786.prep_w16:
787    pmullw               m0, m4, [srcq+strideq*0]
788    pmullw               m1, m4, [srcq+strideq*1]
789    pmullw               m2, m4, [srcq+strideq*2]
790    pmullw               m3, m4, [srcq+stride3q ]
791    lea                srcq, [srcq+strideq*4]
792    psubw                m0, m5
793    psubw                m1, m5
794    psubw                m2, m5
795    psubw                m3, m5
796    mova        [tmpq+32*0], m0
797    mova        [tmpq+32*1], m1
798    mova        [tmpq+32*2], m2
799    mova        [tmpq+32*3], m3
800    add                tmpq, 32*4
801    sub                  hd, 4
802    jg .prep_w16
803    RET
804.prep_w32:
805    pmullw               m0, m4, [srcq+strideq*0+32*0]
806    pmullw               m1, m4, [srcq+strideq*0+32*1]
807    pmullw               m2, m4, [srcq+strideq*1+32*0]
808    pmullw               m3, m4, [srcq+strideq*1+32*1]
809    lea                srcq, [srcq+strideq*2]
810    psubw                m0, m5
811    psubw                m1, m5
812    psubw                m2, m5
813    psubw                m3, m5
814    mova        [tmpq+32*0], m0
815    mova        [tmpq+32*1], m1
816    mova        [tmpq+32*2], m2
817    mova        [tmpq+32*3], m3
818    add                tmpq, 32*4
819    sub                  hd, 2
820    jg .prep_w32
821    RET
822.prep_w64:
823    pmullw               m0, m4, [srcq+32*0]
824    pmullw               m1, m4, [srcq+32*1]
825    pmullw               m2, m4, [srcq+32*2]
826    pmullw               m3, m4, [srcq+32*3]
827    add                srcq, strideq
828    psubw                m0, m5
829    psubw                m1, m5
830    psubw                m2, m5
831    psubw                m3, m5
832    mova        [tmpq+32*0], m0
833    mova        [tmpq+32*1], m1
834    mova        [tmpq+32*2], m2
835    mova        [tmpq+32*3], m3
836    add                tmpq, 32*4
837    dec                  hd
838    jg .prep_w64
839    RET
840.prep_w128:
841    pmullw               m0, m4, [srcq+32*0]
842    pmullw               m1, m4, [srcq+32*1]
843    pmullw               m2, m4, [srcq+32*2]
844    pmullw               m3, m4, [srcq+32*3]
845    psubw                m0, m5
846    psubw                m1, m5
847    psubw                m2, m5
848    psubw                m3, m5
849    mova        [tmpq+32*0], m0
850    mova        [tmpq+32*1], m1
851    mova        [tmpq+32*2], m2
852    mova        [tmpq+32*3], m3
853    pmullw               m0, m4, [srcq+32*4]
854    pmullw               m1, m4, [srcq+32*5]
855    pmullw               m2, m4, [srcq+32*6]
856    pmullw               m3, m4, [srcq+32*7]
857    add                tmpq, 32*8
858    add                srcq, strideq
859    psubw                m0, m5
860    psubw                m1, m5
861    psubw                m2, m5
862    psubw                m3, m5
863    mova        [tmpq-32*4], m0
864    mova        [tmpq-32*3], m1
865    mova        [tmpq-32*2], m2
866    mova        [tmpq-32*1], m3
867    dec                  hd
868    jg .prep_w128
869    RET
; Horizontal-only bilinear prep ("prep" = write intermediate 16-bit
; predictions to tmp, no clamping to pixel range).
; On entry mxyd = mx (horizontal fraction).  Registers after setup:
;   m5 = mx broadcast, m4 = 16-mx, m3 = pw_32766 bias.
; r7m bit 0x800 distinguishes 12bpc from 10bpc; in the 10bpc path the
; coefficients are scaled by 4 (<<2) so the shared >>2 at the end yields
; consistent intermediate precision for both bitdepths.
870.h:
871    movd                xm5, mxyd
872    mov                mxyd, r6m ; my
873    vpbroadcastd         m4, [pw_16]
874    vpbroadcastw         m5, xm5
875    vpbroadcastd         m3, [pw_32766]
876    psubw                m4, m5    ; m4 = 16-mx
877    test          dword r7m, 0x800
878    jnz .h_12bpc
879    psllw                m4, 2     ; 10bpc: scale coefs by 4
880    psllw                m5, 2
881.h_12bpc:
882    test               mxyd, mxyd  ; my != 0 -> combined h+v path
883    jnz .hv
884    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
885    add                  wq, r6
886    lea            stride3q, [strideq*3]
887    jmp                  wq         ; dispatch on width
; 4 rows per iteration; two registers hold rows {0,2} and {1,3}.
; The shifted/blended copy supplies the src+1-pixel operand for the
; second bilinear tap.
888.h_w4:
889    movu                xm1, [srcq+strideq*0]
890    vinserti128          m1, [srcq+strideq*2], 1
891    movu                xm2, [srcq+strideq*1]
892    vinserti128          m2, [srcq+stride3q ], 1
893    lea                srcq, [srcq+strideq*4]
894    punpcklqdq           m0, m1, m2       ; rows 0-3, left-aligned
895    psrldq               m1, 2            ; rows 0/2 shifted by 1 px
896    pslldq               m2, 6            ; rows 1/3 shifted into place
897    pmullw               m0, m4           ; (16-mx)*px[x]
898    vpblendd             m1, m2, 0xcc     ; merged px[x+1] lanes
899    pmullw               m1, m5           ; mx*px[x+1]
900    psubw                m0, m3           ; subtract bias (keeps range signed)
901    paddw                m0, m1
902    psraw                m0, 2
903    mova             [tmpq], m0
904    add                tmpq, 32
905    sub                  hd, 4
906    jg .h_w4
907    RET
; 2 rows per iteration, 8 pixels wide.
908.h_w8:
909    movu                xm0, [srcq+strideq*0]
910    vinserti128          m0, [srcq+strideq*1], 1
911    movu                xm1, [srcq+strideq*0+2]  ; same rows, +1 pixel
912    vinserti128          m1, [srcq+strideq*1+2], 1
913    lea                srcq, [srcq+strideq*2]
914    pmullw               m0, m4
915    pmullw               m1, m5
916    psubw                m0, m3
917    paddw                m0, m1
918    psraw                m0, 2
919    mova             [tmpq], m0
920    add                tmpq, 32
921    sub                  hd, 2
922    jg .h_w8
923    RET
; 2 rows per iteration, 16 pixels (one full ymm per row).
924.h_w16:
925    pmullw               m0, m4, [srcq+strideq*0]
926    pmullw               m1, m5, [srcq+strideq*0+2]
927    psubw                m0, m3
928    paddw                m0, m1
929    pmullw               m1, m4, [srcq+strideq*1]
930    pmullw               m2, m5, [srcq+strideq*1+2]
931    lea                srcq, [srcq+strideq*2]
932    psubw                m1, m3
933    paddw                m1, m2
934    psraw                m0, 2
935    psraw                m1, 2
936    mova        [tmpq+32*0], m0
937    mova        [tmpq+32*1], m1
938    add                tmpq, 32*2
939    sub                  hd, 2
940    jg .h_w16
941    RET
; Wide cases share one loop: r3 counts pixels down from the row width
; (org_w), processing 32 pixels (two ymm) per inner iteration, one row
; per outer iteration.
942.h_w32:
943.h_w64:
944.h_w128:
945    movifnidn           t0d, org_w
946.h_w32_loop0:
947    mov                 r3d, t0d
948.h_w32_loop:
949    pmullw               m0, m4, [srcq+r3*2-32*1]
950    pmullw               m1, m5, [srcq+r3*2-32*1+2]
951    psubw                m0, m3
952    paddw                m0, m1
953    pmullw               m1, m4, [srcq+r3*2-32*2]
954    pmullw               m2, m5, [srcq+r3*2-32*2+2]
955    psubw                m1, m3
956    paddw                m1, m2
957    psraw                m0, 2
958    psraw                m1, 2
959    mova   [tmpq+r3*2-32*1], m0
960    mova   [tmpq+r3*2-32*2], m1
961    sub                 r3d, 32
962    jg .h_w32_loop
963    add                srcq, strideq
964    lea                tmpq, [tmpq+t0*2]
965    dec                  hd
966    jg .h_w32_loop0
967    RET
; Vertical-only bilinear prep: output = ((16-my)*row[y] + my*row[y+1]
; - bias) >> 2, with the same 10bpc coefficient scaling as the .h path.
;   m5 = my broadcast, m4 = 16-my, m3 = pw_32766 bias.
968.v:
969    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
970    movd                xm5, mxyd
971    vpbroadcastd         m4, [pw_16]
972    vpbroadcastw         m5, xm5
973    vpbroadcastd         m3, [pw_32766]
974    add                  wq, r6
975    lea            stride3q, [strideq*3]
976    psubw                m4, m5     ; m4 = 16-my
977    test          dword r7m, 0x800  ; 12bpc?
978    jnz .v_12bpc
979    psllw                m4, 2      ; 10bpc: scale coefs by 4
980    psllw                m5, 2
981.v_12bpc:
982    jmp                  wq          ; dispatch on width
; w4: keep last row in m0 across iterations; blends build the
; "rows 0-3" and "rows 1-4" operand pairs for 4 outputs per loop.
983.v_w4:
984    movq                xm0, [srcq+strideq*0]
985.v_w4_loop:
986    vpbroadcastq         m2, [srcq+strideq*2]
987    vpbroadcastq        xm1, [srcq+strideq*1]
988    vpblendd             m2, m0, 0x03 ; 0 2 2 2
989    vpbroadcastq         m0, [srcq+stride3q ]
990    lea                srcq, [srcq+strideq*4]
991    vpblendd             m1, m0, 0xf0 ; 1 1 3 3
992    vpbroadcastq         m0, [srcq+strideq*0]
993    vpblendd             m1, m2, 0x33 ; 0 1 2 3
994    vpblendd             m0, m2, 0x0c ; 4 2 4 4
995    punpckhqdq           m2, m1, m0   ; 1 2 3 4
996    pmullw               m1, m4       ; (16-my)*row[y]
997    pmullw               m2, m5       ; my*row[y+1]
998    psubw                m1, m3
999    paddw                m1, m2
1000    psraw                m1, 2
1001    mova             [tmpq], m1
1002    add                tmpq, 32
1003    sub                  hd, 4
1004    jg .v_w4_loop
1005    RET
; w8: m0 carries the previous row; each iteration forms row pairs
; (0,1) and (1,2) in the two ymm lanes.
1006.v_w8:
1007    movu                xm0, [srcq+strideq*0]
1008.v_w8_loop:
1009    vbroadcasti128       m2, [srcq+strideq*1]
1010    lea                srcq, [srcq+strideq*2]
1011    vpblendd             m1, m0, m2, 0xf0 ; 0 1
1012    vbroadcasti128       m0, [srcq+strideq*0]
1013    vpblendd             m2, m0, 0xf0     ; 1 2
1014    pmullw               m1, m4
1015    pmullw               m2, m5
1016    psubw                m1, m3
1017    paddw                m1, m2
1018    psraw                m1, 2
1019    mova             [tmpq], m1
1020    add                tmpq, 32
1021    sub                  hd, 2
1022    jg .v_w8_loop
1023    RET
; w16: two rows per iteration; the shared middle row (m2) is reused
; as both second tap of row 0 and first tap of row 1.
1024.v_w16:
1025    movu                 m0, [srcq+strideq*0]
1026.v_w16_loop:
1027    movu                 m2, [srcq+strideq*1]
1028    lea                srcq, [srcq+strideq*2]
1029    pmullw               m0, m4
1030    pmullw               m1, m5, m2
1031    psubw                m0, m3
1032    paddw                m1, m0
1033    movu                 m0, [srcq+strideq*0]
1034    psraw                m1, 2
1035    pmullw               m2, m4
1036    mova        [tmpq+32*0], m1
1037    pmullw               m1, m5, m0
1038    psubw                m2, m3
1039    paddw                m1, m2
1040    psraw                m1, 2
1041    mova        [tmpq+32*1], m1
1042    add                tmpq, 32*2
1043    sub                  hd, 2
1044    jg .v_w16_loop
1045    RET
; Wide cases: process the image in 16-pixel columns. r6d packs the
; remaining column count in its high bits and the row count (h) in its
; low byte, so "movzx hd, r6b" restores h for each new column strip.
; r7 = row pitch in bytes (2*org_w); r3/r5 save the strip base pointers.
1046.v_w32:
1047.v_w64:
1048.v_w128:
1049%if WIN64
1050    PUSH                 r7
1051%endif
1052    movifnidn           r7d, org_w
1053    add                 r7d, r7d          ; r7 = org_w*2 (bytes per row)
1054    mov                  r3, srcq
1055    lea                 r6d, [hq+r7*8-256] ; pack (cols<<8 | h), biased
1056    mov                  r5, tmpq
1057.v_w32_loop0:
1058    movu                 m0, [srcq+strideq*0]
1059.v_w32_loop:
1060    movu                 m2, [srcq+strideq*1]
1061    lea                srcq, [srcq+strideq*2]
1062    pmullw               m0, m4
1063    pmullw               m1, m5, m2
1064    psubw                m0, m3
1065    paddw                m1, m0
1066    movu                 m0, [srcq+strideq*0]
1067    psraw                m1, 2
1068    pmullw               m2, m4
1069    mova        [tmpq+r7*0], m1
1070    pmullw               m1, m5, m0
1071    psubw                m2, m3
1072    paddw                m1, m2
1073    psraw                m1, 2
1074    mova        [tmpq+r7*1], m1
1075    lea                tmpq, [tmpq+r7*2]
1076    sub                  hd, 2
1077    jg .v_w32_loop
1078    add                  r3, 32            ; next 16-pixel column strip
1079    add                  r5, 32
1080    movzx                hd, r6b           ; restore row count
1081    mov                srcq, r3
1082    mov                tmpq, r5
1083    sub                 r6d, 1<<8          ; one strip done
1084    jg .v_w32_loop0
1085%if WIN64
1086    POP                  r7
1087%endif
1088    RET
; Combined horizontal+vertical bilinear prep.  The horizontal pass is
; done first (same math as .h: m4/m5/m3 are already set up); the
; vertical pass then blends consecutive h-filtered rows with
; pmulhrsw: out = a + (((b-a) * (my<<11) * 2 + 0x8000) >> 16),
; i.e. a rounded a + (b-a)*my/16, since my<<11 = my*2048 in Q15.
1089.hv:
1090    WIN64_SPILL_XMM       7
1091    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
1092    shl                mxyd, 11           ; my -> Q15-style pmulhrsw factor
1093    movd                xm6, mxyd
1094    add                  wq, r6
1095    lea            stride3q, [strideq*3]
1096    vpbroadcastw         m6, xm6
1097    jmp                  wq                ; dispatch on width
; w4: prologue h-filters row 0 into m0, then the loop produces rows
; 1-4 h-filtered (same shuffle trick as .h_w4) and combines each with
; the previous row.  xmm7 is manually spilled because only 7 xmm regs
; were reserved via WIN64_SPILL_XMM above.
1098.hv_w4:
1099    movu                xm1, [srcq+strideq*0]
1100%if WIN64
1101    movaps         [rsp+24], xmm7
1102%endif
1103    pmullw              xm0, xm4, xm1
1104    psrldq              xm1, 2             ; +1 pixel
1105    pmullw              xm1, xm5
1106    psubw               xm0, xm3
1107    paddw               xm0, xm1
1108    psraw               xm0, 2             ; h-filtered row 0
1109    vpbroadcastq         m0, xm0
1110.hv_w4_loop:
1111    movu                xm1, [srcq+strideq*1]
1112    vinserti128          m1, [srcq+stride3q ], 1
1113    movu                xm2, [srcq+strideq*2]
1114    lea                srcq, [srcq+strideq*4]
1115    vinserti128          m2, [srcq+strideq*0], 1
1116    punpcklqdq           m7, m1, m2
1117    psrldq               m1, 2
1118    pslldq               m2, 6
1119    pmullw               m7, m4
1120    vpblendd             m1, m2, 0xcc
1121    pmullw               m1, m5
1122    psubw                m7, m3
1123    paddw                m1, m7
1124    psraw                m1, 2         ; 1 2 3 4
1125    vpblendd             m0, m1, 0x3f
1126    vpermq               m2, m0, q2103 ; 0 1 2 3
1127    mova                 m0, m1        ; keep row 4 for next iter
1128    psubw                m1, m2        ; b - a
1129    pmulhrsw             m1, m6        ; *(my<<11), rounded
1130    paddw                m1, m2        ; a + frac*(b-a)
1131    mova             [tmpq], m1
1132    add                tmpq, 32
1133    sub                  hd, 4
1134    jg .hv_w4_loop
1135%if WIN64
1136    movaps             xmm7, [rsp+24]
1137%endif
1138    RET
; w8: one h-filtered row per xmm lane; vperm2i128 pairs row y with y+1.
1139.hv_w8:
1140    pmullw              xm0, xm4, [srcq+strideq*0]
1141    pmullw              xm1, xm5, [srcq+strideq*0+2]
1142    psubw               xm0, xm3
1143    paddw               xm0, xm1
1144    psraw               xm0, 2
1145    vinserti128          m0, xm0, 1        ; row 0 in both lanes
1146.hv_w8_loop:
1147    movu                xm1, [srcq+strideq*1]
1148    movu                xm2, [srcq+strideq*1+2]
1149    lea                srcq, [srcq+strideq*2]
1150    vinserti128          m1, [srcq+strideq*0], 1
1151    vinserti128          m2, [srcq+strideq*0+2], 1
1152    pmullw               m1, m4
1153    pmullw               m2, m5
1154    psubw                m1, m3
1155    paddw                m1, m2
1156    psraw                m1, 2            ; 1 2
1157    vperm2i128           m2, m0, m1, 0x21 ; 0 1
1158    mova                 m0, m1
1159    psubw                m1, m2
1160    pmulhrsw             m1, m6
1161    paddw                m1, m2
1162    mova             [tmpq], m1
1163    add                tmpq, 32
1164    sub                  hd, 2
1165    jg .hv_w8_loop
1166    RET
; Wide cases: 16-pixel column strips, same r6d (cols<<8 | h) packing
; and r3/r5 base-pointer bookkeeping as .v_w32.
1167.hv_w16:
1168.hv_w32:
1169.hv_w64:
1170.hv_w128:
1171%if WIN64
1172    PUSH                 r7
1173%endif
1174    movifnidn           r7d, org_w
1175    add                 r7d, r7d          ; bytes per output row
1176    mov                  r3, srcq
1177    lea                 r6d, [hq+r7*8-256]
1178    mov                  r5, tmpq
1179.hv_w16_loop0:
1180    pmullw               m0, m4, [srcq]   ; h-filter first row of strip
1181    pmullw               m1, m5, [srcq+2]
1182    psubw                m0, m3
1183    paddw                m0, m1
1184    psraw                m0, 2
1185.hv_w16_loop:
1186    pmullw               m1, m4, [srcq+strideq*1]
1187    pmullw               m2, m5, [srcq+strideq*1+2]
1188    lea                srcq, [srcq+strideq*2]
1189    psubw                m1, m3
1190    paddw                m1, m2
1191    psraw                m1, 2
1192    psubw                m2, m1, m0
1193    pmulhrsw             m2, m6
1194    paddw                m2, m0
1195    mova        [tmpq+r7*0], m2
1196    pmullw               m0, m4, [srcq+strideq*0]
1197    pmullw               m2, m5, [srcq+strideq*0+2]
1198    psubw                m0, m3
1199    paddw                m0, m2
1200    psraw                m0, 2
1201    psubw                m2, m0, m1
1202    pmulhrsw             m2, m6
1203    paddw                m2, m1
1204    mova        [tmpq+r7*1], m2
1205    lea                tmpq, [tmpq+r7*2]
1206    sub                  hd, 2
1207    jg .hv_w16_loop
1208    add                  r3, 32
1209    add                  r5, 32
1210    movzx                hd, r6b
1211    mov                srcq, r3
1212    mov                tmpq, r5
1213    sub                 r6d, 1<<8
1214    jg .hv_w16_loop0
1215%if WIN64
1216    POP                  r7
1217%endif
1218    RET
1219
1219
1220; int8_t subpel_filters[5][15][8]
; FILTER_* pack two byte offsets into subpel_filters[] rows:
; low 16 bits = horizontal filter row offset, high 16 bits = vertical.
; NOTE(review): the 3*15 / 4*15 row mapping follows the subpel_filters
; table layout (defined elsewhere) -- confirm against that table.
1221%assign FILTER_REGULAR (0*15 << 16) | 3*15
1222%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1223%assign FILTER_SHARP   (2*15 << 16) | 3*15
1224
; FN emits a public entry point %1_%2_16bpc that loads the packed
; h/v filter offsets into t0d/t1d, then either falls through into the
; next cglobal (when called with 4 args) or tail-jumps to the shared
; implementation named by the optional 5th arg.
1225%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
1226cglobal %1_%2_16bpc
1227    mov                 t0d, FILTER_%3
1228%ifidn %3, %4
1229    mov                 t1d, t0d            ; same type both directions
1230%else
1231    mov                 t1d, FILTER_%4
1232%endif
1233%if %0 == 5 ; skip the jump in the last filter
1234    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1235%endif
1236%endmacro
1237
; t0/t1 must be registers that survive into the shared entry point:
; volatile arg registers on WIN64, r7/r8 on SysV.
1238%if WIN64
1239DECLARE_REG_TMP 4, 5
1240%else
1241DECLARE_REG_TMP 7, 8
1242%endif
1243
; put_8tap_16bpc filter-type entry points; all but the last tail-jump
; into put_6tap_16bpc, the final one falls through into its cglobal.
1244%define PUT_8TAP_FN FN put_8tap,
1245PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
1246PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
1247PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
1248PUT_8TAP_FN regular,        REGULAR, REGULAR
1249
; put_6tap_16bpc(dst, dst_stride, src, src_stride, w, h, mx, my)
; 6-tap subpel interpolation, 10/12bpc.  t0d/t1d arrive holding the
; packed FILTER_* offsets; mx*0x010101 replicates mx into three bytes
; so a single add produces both the 6tap and 4tap horizontal table
; offsets at once (likewise for my vertically).
1250cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
1251%define base r8-put_avx2
1252    imul                mxd, mxm, 0x010101
1253    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1254    imul                myd, mym, 0x010101
1255    add                 myd, t1d ; 6tap_v, my, 4tap_v
1256    lea                  r8, [put_avx2]
1257    movifnidn            wd, wm
1258    movifnidn            hd, hm
1259    test                mxd, 0xf00          ; any horizontal fraction?
1260    jnz .h
1261    test                myd, 0xf00          ; any vertical fraction?
1262    jnz .v
; No subpel offset: plain copy, dispatched by width.
1263.put:
1264    tzcnt                wd, wd
1265    movzx                wd, word [r8+wq*2+table_offset(put,)]
1266    add                  wq, r8
1267%if WIN64
1268    pop                  r8
1269%endif
1270    jmp                  wq
; Horizontal-only, width 2: 4-tap path (mxb = low byte = 4tap offset).
; xm4 = rounding constant, xm5 = pixel max (set up at .h).
1271.h_w2:
1272    movzx               mxd, mxb
1273    sub                srcq, 2               ; center the 4-tap window
1274    mova                xm2, [subpel_h_shuf2]
1275    vpbroadcastd        xm3, [base+subpel_filters+mxq*8+2]
1276    pmovsxbw            xm3, xm3             ; int8 coefs -> int16
1277.h_w2_loop:
1278    movu                xm0, [srcq+ssq*0]
1279    movu                xm1, [srcq+ssq*1]
1280    lea                srcq, [srcq+ssq*2]
1281    pshufb              xm0, xm2
1282    pshufb              xm1, xm2
1283    pmaddwd             xm0, xm3
1284    pmaddwd             xm1, xm3
1285    phaddd              xm0, xm1
1286    paddd               xm0, xm4             ; + rounding
1287    psrad               xm0, 6
1288    packusdw            xm0, xm0
1289    pminsw              xm0, xm5             ; clamp to bitdepth max
1290    movd       [dstq+dsq*0], xm0
1291    pextrd     [dstq+dsq*1], xm0, 1
1292    lea                dstq, [dstq+dsq*2]
1293    sub                  hd, 2
1294    jg .h_w2_loop
1295    RET
; Horizontal-only, width 4: 4-tap, two rows per iteration.
1296.h_w4:
1297    movzx               mxd, mxb
1298    sub                srcq, 2
1299    pmovsxbw            xm3, [base+subpel_filters+mxq*8]
1300    WIN64_SPILL_XMM       8
1301    vbroadcasti128       m6, [subpel_h_shufA]
1302    vbroadcasti128       m7, [subpel_h_shufB]
1303    pshufd              xm3, xm3, q2211
1304    vpbroadcastq         m2, xm3             ; taps 0-1 pairs
1305    vpermq               m3, m3, q1111       ; taps 2-3 pairs
1306.h_w4_loop:
1307    movu                xm1, [srcq+ssq*0]
1308    vinserti128          m1, [srcq+ssq*1], 1
1309    lea                srcq, [srcq+ssq*2]
1310    pshufb               m0, m1, m6 ; 0 1 1 2 2 3 3 4
1311    pshufb               m1, m7     ; 2 3 3 4 4 5 5 6
1312    pmaddwd              m0, m2
1313    pmaddwd              m1, m3
1314    paddd                m0, m4
1315    paddd                m0, m1
1316    psrad                m0, 6
1317    vextracti128        xm1, m0, 1
1318    packusdw            xm0, xm1
1319    pminsw              xm0, xm5
1320    movq       [dstq+dsq*0], xm0
1321    movhps     [dstq+dsq*1], xm0
1322    lea                dstq, [dstq+dsq*2]
1323    sub                  hd, 2
1324    jg .h_w4_loop
1325    RET
; Horizontal entry: r8m = bitdepth_max; its bit 11 selects the 10 vs
; 12bpc rounding constant (put_8tap_h_rnd).  m5 = pixel max clamp.
1326.h:
1327    test                myd, 0xf00           ; vertical fraction too?
1328    jnz .hv
1329    mov                 r7d, r8m
1330    vpbroadcastw         m5, r8m
1331    shr                 r7d, 11
1332    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4]
1333    cmp                  wd, 4
1334    je .h_w4
1335    jl .h_w2
1336    WIN64_SPILL_XMM      11
1337    shr                 mxd, 16              ; 6tap table offset
1338    sub                srcq, 4               ; center the 6-tap window
1339    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
1340    vbroadcasti128       m6, [base+subpel_h_shufA]
1341    punpcklbw            m0, m0
1342    psraw                m0, 8 ; sign-extend
1343    pshufd               m7, m0, q0000        ; coef pair 0
1344    pshufd               m8, m0, q1111        ; coef pair 1
1345    pshufd               m9, m0, q2222        ; coef pair 2
1346    sub                  wd, 16
1347    jge .h_w16
; 6-tap horizontal filter of 16 output pixels (two rows of 8 for .h_w8,
; one row of 16 for .h_w16) from three overlapping source loads.
1348.h_w8:
%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb              m%1, m6        ; 01 12 23 34
    pshufb              m%2, m6        ; 45 56 67 78
    pmaddwd             m%4, m7, m%1   ; a0
    pshufb              m%3, m6        ; 89 9a ab bc
    pmaddwd             m%5, m9, m%2   ; a2
    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
    paddd               m%4, m%5       ; a0+a2
    pmaddwd             m%5, m7, m%2   ; b0
    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd             m%3, m9        ; b2
    pmaddwd             m%1, m8        ; a1
    pmaddwd             m%2, m8        ; b1
    paddd               m%3, m%5       ; b0+b2
    paddd               m%4, m4        ; + rounding
    paddd               m%3, m4
    paddd               m%1, m%4
    paddd               m%2, m%3
    psrad               m%1, 6
    psrad               m%2, 6
    packusdw            m%1, m%2
    pminsw              m%1, m5        ; clamp to bitdepth max
%endmacro
1372    movu                xm0, [srcq+ssq*0+ 0]
1373    vinserti128          m0, [srcq+ssq*1+ 0], 1
1374    movu                xm2, [srcq+ssq*0+16]
1375    vinserti128          m2, [srcq+ssq*1+16], 1
1376    shufpd               m1, m0, m2, 0x05     ; middle 8 pixels
1377    lea                srcq, [srcq+ssq*2]
1378    PUT_6TAP_H            0, 1, 2, 3, 10
1379    mova         [dstq+dsq*0], xm0
1380    vextracti128 [dstq+dsq*1], m0, 1
1381    lea                dstq, [dstq+dsq*2]
1382    sub                  hd, 2
1383    jg .h_w8
1384    RET
; w >= 16: r6 walks the row right-to-left in 16-pixel steps.
1385.h_w16:
1386    mov                 r6d, wd
1387.h_w16_loop:
1388    movu                 m0, [srcq+r6*2+ 0]
1389    movu                 m1, [srcq+r6*2+ 8]
1390    movu                 m2, [srcq+r6*2+16]
1391    PUT_6TAP_H            0, 1, 2, 3, 10
1392    mova        [dstq+r6*2], m0
1393    sub                 r6d, 16
1394    jge .h_w16_loop
1395    add                srcq, ssq
1396    add                dstq, dsq
1397    dec                  hd
1398    jg .h_w16
1399    RET
; Vertical entry.  h < 6 falls back to the 4-tap filter (myb); r6 is
; the negated stride so [srcq+r6*1]/[srcq+r6*2] address rows -1/-2.
; m7/m8/m9 = the three sign-extended coefficient pairs, m5 = pd_32
; rounding, m6 = pixel max.
1400.v:
1401    movzx               mxd, myb
1402    shr                 myd, 16
1403    cmp                  hd, 6
1404    cmovs               myd, mxd             ; short block -> 4-tap
1405    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
1406    WIN64_SPILL_XMM      10, 12
1407    vpbroadcastd         m5, [pd_32]
1408    vpbroadcastw         m6, r8m
1409    punpcklbw            m0, m0
1410    mov                  r6, ssq
1411    psraw                m0, 8 ; sign-extend
1412    neg                  r6                   ; r6 = -stride
1413    pshufd               m7, m0, q0000
1414    pshufd               m8, m0, q1111
1415    pshufd               m9, m0, q2222
1416    cmp                  wd, 4
1417    jg .v_w8
1418    je .v_w4
; w2: rows interleaved pairwise (01/12, 23/34, 45/56) so each pmaddwd
; computes two output rows' partial sums at once.
1419.v_w2:
1420    movd                xm2, [srcq+r6 *2]
1421    pinsrd              xm2, [srcq+r6 *1], 1
1422    pinsrd              xm2, [srcq+ssq*0], 2
1423    pinsrd              xm2, [srcq+ssq*1], 3 ; 0 1 2 3
1424    lea                srcq, [srcq+ssq*2]
1425    movd                xm0, [srcq+ssq*0]
1426    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
1427    punpcklwd           xm1, xm2, xm3        ; 01 12
1428    punpckhwd           xm2, xm3             ; 23 34
1429.v_w2_loop:
1430    movd                xm3, [srcq+ssq*1]
1431    pmaddwd             xm4, xm7, xm1        ; a0 b0
1432    mova                xm1, xm2
1433    pmaddwd             xm2, xm8             ; a1 b1
1434    lea                srcq, [srcq+ssq*2]
1435    paddd               xm4, xm2
1436    punpckldq           xm2, xm0, xm3        ; 4 5
1437    movd                xm0, [srcq+ssq*0]
1438    punpckldq           xm3, xm0             ; 5 6
1439    punpcklwd           xm2, xm3             ; 45 56
1440    pmaddwd             xm3, xm9, xm2        ; a2 b2
1441    paddd               xm4, xm5             ; + 32
1442    paddd               xm4, xm3
1443    psrad               xm4, 6
1444    packusdw            xm4, xm4
1445    pminsw              xm4, xm6             ; clamp
1446    movd       [dstq+dsq*0], xm4
1447    pextrd     [dstq+dsq*1], xm4, 1
1448    lea                dstq, [dstq+dsq*2]
1449    sub                  hd, 2
1450    jg .v_w2_loop
1451    RET
; w4: same two-rows-at-once scheme using ymm lanes; m0 carries the
; newest row between iterations.
1452.v_w4:
1453    movq                xm1, [srcq+r6 *2]
1454    vpbroadcastq         m3, [srcq+r6 *1]
1455    vpbroadcastq         m2, [srcq+ssq*0]
1456    vpbroadcastq         m4, [srcq+ssq*1]
1457    lea                srcq, [srcq+ssq*2]
1458    vpbroadcastq         m0, [srcq+ssq*0]
1459    vpblendd             m1, m3, 0x30
1460    vpblendd             m3, m2, 0x30
1461    punpcklwd            m1, m3     ; 01 12
1462    vpblendd             m2, m4, 0x30
1463    vpblendd             m4, m0, 0x30
1464    punpcklwd            m2, m4     ; 23 34
1465.v_w4_loop:
1466    vpbroadcastq         m3, [srcq+ssq*1]
1467    pmaddwd              m4, m7, m1 ; a0 b0
1468    mova                 m1, m2
1469    pmaddwd              m2, m8     ; a1 b1
1470    lea                srcq, [srcq+ssq*2]
1471    paddd                m4, m2
1472    vpblendd             m2, m0, m3, 0x30
1473    vpbroadcastq         m0, [srcq+ssq*0]
1474    vpblendd             m3, m0, 0x30
1475    punpcklwd            m2, m3     ; 45 56
1476    pmaddwd              m3, m9, m2 ; a2 b2
1477    paddd                m4, m5
1478    paddd                m4, m3
1479    psrad                m4, 6
1480    vextracti128        xm3, m4, 1
1481    packusdw            xm4, xm3
1482    pminsw              xm4, xm6
1483    movq       [dstq+dsq*0], xm4
1484    movhps     [dstq+dsq*1], xm4
1485    lea                dstq, [dstq+dsq*2]
1486    sub                  hd, 2
1487    jg .v_w4_loop
1488    RET
; w >= 8: process the image in 8-pixel column strips.  wd packs
; (strips<<8 | h) like the wide prep loops; r7/r8 walk src/dst within
; a strip while srcq/dstq keep the strip base.
1489.v_w8:
1490    shl                  wd, 5
1491    WIN64_PUSH_XMM       12
1492    lea                  wd, [hq+wq-256]     ; (strips<<8 | h), biased
1493.v_w8_loop0:
1494    vbroadcasti128       m3, [srcq+r6 *2]    ; row -2
1495    vbroadcasti128       m4, [srcq+r6 *1]    ; row -1
1496    lea                  r7, [srcq+ssq*2]
1497    vbroadcasti128       m0, [srcq+ssq*0]
1498    vbroadcasti128       m1, [srcq+ssq*1]
1499    mov                  r8, dstq
1500    vbroadcasti128       m2, [r7+ssq*0]
1501    shufpd               m3, m0, 0x0c
1502    shufpd               m4, m1, 0x0c
1503    punpcklwd            m1, m3, m4 ; 01
1504    punpckhwd            m3, m4     ; 23
1505    shufpd               m0, m2, 0x0c
1506    punpcklwd            m2, m4, m0 ; 12
1507    punpckhwd            m4, m0     ; 34
1508.v_w8_loop:
1509    vbroadcasti128       m5, [r7+ssq*1]
1510    pmaddwd             m10, m7, m1 ; a0
1511    lea                  r7, [r7+ssq*2]
1512    pmaddwd             m11, m7, m2 ; b0
1513    mova                 m1, m3
1514    pmaddwd              m3, m8     ; a1
1515    mova                 m2, m4
1516    pmaddwd              m4, m8     ; b1
1517    paddd               m10, m3
1518    vbroadcasti128       m3, [r7+ssq*0]
1519    paddd               m11, m4
1520    shufpd               m4, m0, m5, 0x0d
1521    shufpd               m0, m5, m3, 0x0c
1522    punpcklwd            m3, m4, m0 ; 45
1523    punpckhwd            m4, m0     ; 56
1524    pmaddwd              m5, m9, m3 ; a2
1525    paddd               m10, m5
1526    pmaddwd              m5, m9, m4 ; b2
1527    paddd                m5, m11
; Rounded >>6 without a separate +32: shift by 5, then pavgw against
; zero performs (x+1)>>1 on the packed result.
1528    psrad               m10, 5
1529    psrad                m5, 5
1530    packusdw            m10, m5
1531    pxor                 m5, m5
1532    pavgw                m5, m10
1533    pminsw               m5, m6     ; clamp to bitdepth max
1534    vpermq               m5, m5, q3120
1535    mova         [r8+dsq*0], xm5
1536    vextracti128 [r8+dsq*1], m5, 1
1537    lea                  r8, [r8+dsq*2]
1538    sub                  hd, 2
1539    jg .v_w8_loop
1540    add                srcq, 16              ; next 8-pixel strip
1541    add                dstq, 16
1542    movzx                hd, wb              ; restore row count
1543    sub                  wd, 1<<8
1544    jg .v_w8_loop0
1545    RET
; Combined h+v entry.  m10 = pd_512 (rounding for the 10-bit-shift
; intermediate), m11 = pixel max.  For 12bpc the horizontal coefs are
; pre-shifted right and the vertical left so total scaling matches the
; 10bpc path (both stages shift by 10 here instead of 6).
1546.hv:
1547    WIN64_SPILL_XMM      12, 16
1548    vpbroadcastd        m10, [pd_512]
1549    vpbroadcastw        m11, r8m
1550    cmp                  wd, 4
1551    jg .hv_w8
1552    movzx               mxd, mxb             ; 4-tap h for w <= 4
1553    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
1554    movzx               mxd, myb
1555    shr                 myd, 16
1556    cmp                  hd, 6
1557    cmovs               myd, mxd             ; short block -> 4-tap v
1558    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
1559    mov                  r6, ssq
1560    sub                srcq, 2
1561    neg                  r6                   ; r6 = -stride
1562    pxor                 m6, m6
1563    punpcklbw            m6, m0               ; h coefs in high bytes (x256)
1564    punpcklbw            m1, m1
1565    psraw                m1, 8 ; sign-extend
1566    test          dword r8m, 0x800            ; 12bpc?
1567    jz .hv_10bit
1568    psraw                m6, 2
1569    psllw                m1, 2
1570.hv_10bit:
1571    pshufd               m7, m1, q0000        ; v coef pairs
1572    pshufd               m8, m1, q1111
1573    pshufd               m9, m1, q2222
1574    cmp                  wd, 4
1575    je .hv_w4
; w2: horizontal pass via pshufb+pmaddwd+phaddd; rows are kept two at
; a time ("a"/"b") for the vertical pmaddwd pairs.
1576    vbroadcasti128       m5, [subpel_h_shuf2]
1577    vbroadcasti128       m0, [srcq+ssq*0]
1578    vinserti128          m2, m0, [srcq+r6*2], 1 ; 2 0
1579    movu                xm1, [srcq+ssq*1]
1580    vinserti128          m1, [srcq+r6 *1], 1    ; 3 1
1581    lea                srcq, [srcq+ssq*2]
1582    vinserti128          m0, [srcq+ssq*0], 0    ; 4 2
1583    REPX    {pshufb  x, m5}, m2, m1, m0
1584    REPX    {pmaddwd x, m6}, m2, m1, m0
1585    phaddd               m2, m1
1586    phaddd               m1, m0
1587    paddd                m2, m10
1588    paddd                m1, m10
1589    psrad                m2, 10
1590    psrad                m1, 10
1591    packssdw             m2, m1       ; 2 3 3 4   0 1 1 2
1592    punpckhqdq           m0, m2, m2
1593    punpcklwd            m2, m0       ; 23 34
1594    vextracti128        xm1, m2, 1    ; 01 12
1595.hv_w2_loop:
1596    movu                xm3, [srcq+ssq*1]
1597    lea                srcq, [srcq+ssq*2]
1598    movu                xm4, [srcq+ssq*0]
1599    pshufb              xm3, xm5
1600    pshufb              xm4, xm5
1601    pmaddwd             xm3, xm6
1602    pmaddwd             xm4, xm6
1603    phaddd              xm3, xm4
1604    pmaddwd             xm4, xm7, xm1 ; a0 b0
1605    mova                xm1, xm2
1606    pmaddwd             xm2, xm8      ; a1 b1
1607    paddd               xm4, xm2
1608    paddd               xm3, xm10
1609    psrad               xm3, 10       ; new h-filtered rows 5 6
1610    packssdw            xm3, xm3
1611    palignr             xm2, xm3, xm0, 12
1612    mova                xm0, xm3
1613    punpcklwd           xm2, xm0      ; 45 56
1614    pmaddwd             xm3, xm9, xm2 ; a2 b2
1615    paddd               xm4, xm10
1616    paddd               xm4, xm3
1617    psrad               xm4, 10
1618    packusdw            xm4, xm4
1619    pminsw              xm4, xm11     ; clamp to bitdepth max
1620    movd       [dstq+dsq*0], xm4
1621    pextrd     [dstq+dsq*1], xm4, 1
1622    lea                dstq, [dstq+dsq*2]
1623    sub                  hd, 2
1624    jg .hv_w2_loop
1625    RET
; hv, width 4.  m5/m6 = the two 4-tap horizontal coef pairs, m12/m13 =
; the matching pshufb patterns.  The prologue h-filters rows -2..2
; (5 rows) and arranges them as interleaved row pairs 01/12 and 23/34
; for the vertical pmaddwd stage.
1626.hv_w4:
1627    WIN64_PUSH_XMM       14
1628    vbroadcasti128      m12, [subpel_h_shufA]
1629    pshufd               m5, m6, q0000
1630    vbroadcasti128      m13, [subpel_h_shufB]
1631    pshufd               m6, m6, q1111
1632    movu                xm2, [srcq+r6 *2]
1633    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
1634    movu                xm0, [srcq+ssq*0]
1635    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
1636    lea                srcq, [srcq+ssq*2]
1637    movu                xm3, [srcq+ssq*0]    ; 4
1638    pshufb               m1, m2, m12
1639    pmaddwd              m1, m5
1640    pshufb               m2, m13
1641    pmaddwd              m2, m6
1642    pshufb               m4, m0, m12
1643    pmaddwd              m4, m5
1644    pshufb               m0, m13
1645    pmaddwd              m0, m6
1646    paddd                m2, m1
1647    pshufb              xm1, xm3, xm12
1648    pmaddwd             xm1, xm5
1649    pshufb              xm3, xm13
1650    pmaddwd             xm3, xm6
1651    paddd                m0, m4
1652    paddd                m2, m10          ; + 512
1653    paddd               xm1, xm10
1654    paddd                m0, m10
1655    paddd               xm3, xm1
1656    REPX      {psrad x, 10}, m2, m0, xm3
1657    packssdw             m2, m0     ; 0 2   1 3
1658    packssdw            xm0, xm3    ; 2 4
1659    vperm2i128           m0, m2, 0x03
1660    punpcklwd            m1, m2, m0 ; 01 12
1661    punpckhwd            m2, m0     ; 23 34
1662.hv_w4_loop:
1663    movu                xm3, [srcq+ssq*1]
1664    lea                srcq, [srcq+ssq*2]
1665    vinserti128          m3, [srcq+ssq*0], 1
1666    pmaddwd              m4, m7, m1 ; a0 b0
1667    mova                 m1, m2
1668    pmaddwd              m2, m8     ; a1 b1
1669    paddd                m4, m2
1670    pshufb               m2, m3, m12
1671    pmaddwd              m2, m5
1672    pshufb               m3, m13
1673    pmaddwd              m3, m6
1674    paddd                m2, m10
1675    paddd                m3, m2
1676    psrad                m3, 10
1677    packssdw             m3, m3     ; 5 5   6 6
1678    vperm2i128           m2, m0, m3, 0x21
1679    mova                 m0, m3     ; keep newest rows for next iter
1680    punpckhwd            m2, m3     ; 45 56
1681    pmaddwd              m3, m9, m2 ; a2 b2
1682    paddd                m4, m10
1683    paddd                m4, m3
1684    psrad                m4, 10
1685    vextracti128        xm3, m4, 1
1686    packusdw            xm4, xm3
1687    pminsw              xm4, xm11   ; clamp to bitdepth max
1688    movq       [dstq+dsq*0], xm4
1689    movhps     [dstq+dsq*1], xm4
1690    lea                dstq, [dstq+dsq*2]
1691    sub                  hd, 2
1692    jg .hv_w4_loop
1693    RET
1694.hv_w8:
1695    WIN64_PUSH_XMM       16, 12
1696    shr                 mxd, 16
1697    vbroadcasti128      m12, [subpel_h_shufA]
1698    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
1699    movzx               mxd, myb
1700    shr                 myd, 16
1701    cmp                  hd, 6
1702    cmovs               myd, mxd
1703    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
1704    shl                  wd, 5
1705    mov                  r6, ssq
1706    sub                srcq, 4
1707    pxor                 m0, m0
1708    neg                  r6
1709    punpcklbw            m0, m2
1710    lea                  wd, [hq+wq-256]
1711    test          dword r8m, 0x800
1712    jz .hv_w8_10bit
1713    psraw                m0, 2
1714    psllw               xm1, 2
1715.hv_w8_10bit:
1716    pshufd               m7, m0, q0000
1717    pshufd               m8, m0, q1111
1718%if WIN64
1719    %define v_mul (rsp+stack_offset+40) ; r4m
1720%else
1721    %define v_mul (rsp+stack_offset+ 8) ; r6m
1722%endif
1723    mova            [v_mul], xm1
1724    pshufd               m9, m0, q2222
1725.hv_w8_loop0:
1726    vbroadcasti128       m0, [srcq+ssq*0+ 0]
1727    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
1728    lea                  r7, [srcq+ssq*2]
1729    vbroadcasti128       m2, [srcq+ssq*0+16]
1730    vinserti128          m1, m2, [srcq+r6*2+16], 0
1731    mov                  r8, dstq
1732    vinserti128          m0, [r7  +ssq*0+ 0], 1
1733    vinserti128          m2, [r7  +ssq*0+16], 1
1734    shufpd               m4, m3, m1, 0x05
1735%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
1736    pshufb              m%1, m12       ; 01 12 23 34
1737    pshufb              m%2, m12       ; 45 56 67 78
1738    pmaddwd             m%4, m7, m%1   ; a0
1739    pshufb              m%3, m12       ; 89 9a ab bc
1740    pmaddwd             m%5, m9, m%2   ; a2
1741    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
1742    paddd               m%4, m%5       ; a0+a2
1743    pmaddwd             m%5, m7, m%2   ; b0
1744    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
1745    pmaddwd             m%3, m9        ; b2
1746    pmaddwd             m%1, m8        ; a1
1747    pmaddwd             m%2, m8        ; b1
1748    paddd               m%3, m%5       ; b0+b2
1749    paddd               m%4, m10
1750    paddd               m%3, m10
1751    paddd               m%1, m%4
1752    paddd               m%2, m%3
1753    psrad               m%1, 10
1754    psrad               m%2, 10
1755    packssdw            m%1, m%2
1756%endmacro
1757    PUT_6TAP_HV_H         3, 4, 1, 5, 6  ; 0 2
1758    movu                xm4, [srcq+r6 *1+ 0]
1759    vinserti128          m4, [srcq+ssq*1+ 0], 1
1760    shufpd               m1, m0, m2, 0x05
1761    PUT_6TAP_HV_H         0, 1, 2, 5, 6  ; 2 4
1762    movu                xm2, [srcq+r6 *1+16]
1763    vinserti128          m2, [srcq+ssq*1+16], 1
1764    shufpd               m1, m4, m2, 0x05
1765    PUT_6TAP_HV_H         4, 1, 2, 5, 6  ; 1 3
1766    vpermq               m3, m3, q3120
1767    vpermq               m4, m4, q3120
1768    vpermq               m0, m0, q3120
1769    punpcklwd            m1, m3, m4      ; 01
1770    punpckhwd            m3, m4          ; 23
1771    punpcklwd            m2, m4, m0      ; 12
1772    punpckhwd            m4, m0          ; 34
1773.hv_w8_loop:
1774    vpbroadcastd        m15, [v_mul+4*0]
1775    vpbroadcastd        m13, [v_mul+4*1]
1776    movu                xm5, [r7+ssq*1+ 0]
1777    movu                xm6, [r7+ssq*1+16]
1778    lea                  r7, [r7+ssq*2]
1779    pmaddwd             m14, m15, m1     ; a0
1780    pmaddwd             m15, m2          ; b0
1781    vinserti128          m5, [r7+ssq*0+ 0], 1
1782    vinserti128          m6, [r7+ssq*0+16], 1
1783    mova                 m1, m3
1784    pmaddwd              m3, m13         ; a1
1785    mova                 m2, m4
1786    pmaddwd              m4, m13         ; b1
1787    paddd               m14, m3
1788    shufpd               m3, m5, m6, 0x05
1789    paddd               m15, m4
1790    PUT_6TAP_HV_H         5, 3, 6, 4, 13 ; 5 6
1791    vpbroadcastd         m6, [v_mul+4*2]
1792    vpermq               m5, m5, q3120
1793    shufpd               m4, m0, m5, 0x05
1794    mova                 m0, m5
1795    punpcklwd            m3, m4, m5      ; 45
1796    punpckhwd            m4, m5          ; 56
1797    pmaddwd              m5, m6, m3      ; a2
1798    pmaddwd              m6, m4          ; b2
1799    paddd               m14, m10
1800    paddd               m15, m10
1801    paddd                m5, m14
1802    paddd                m6, m15
1803    psrad                m5, 10
1804    psrad                m6, 10
1805    packusdw             m5, m6
1806    pminsw               m5, m11
1807    vpermq               m5, m5, q3120
1808    mova         [r8+dsq*0], xm5
1809    vextracti128 [r8+dsq*1], m5, 1
1810    lea                  r8, [r8+dsq*2]
1811    sub                  hd, 2
1812    jg .hv_w8_loop
1813    add                srcq, 16
1814    add                dstq, 16
1815    movzx                hd, wb
1816    sub                  wd, 1<<8
1817    jg .hv_w8_loop0
1818    RET
1819
; Entry-point wrappers for the filter combinations that require the full
; 8-tap path. Each PUT_8TAP_FN emits a small stub that loads the two
; filter-type selectors (t0d/t1d) and jumps to the named base function.
; The last one omits the target so it falls through into the cglobal below.
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP

; void put_8tap_16bpc(pixel *dst, ptrdiff_t ds, const pixel *src,
;                     ptrdiff_t ss, int w, int h, int mx, int my,
;                     int bitdepth_max /* r8m */)
; Dispatches on the subpel fractions: pure-horizontal (.h), pure-vertical
; (.v), combined (.hv), or plain copy (delegated to the 6-tap variant).
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
    ; Combine the filter-type selector (t0d/t1d, set by the FN stubs) with
    ; the subpel fraction into a single table index per direction.
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r8, [put_avx2]
    movifnidn            wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00 ; any horizontal subpel fraction?
    jnz .h
    test                myd, 0xf00 ; any vertical subpel fraction?
    ; No filtering needed at all: reuse the 6-tap function's copy path.
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
; Vertical-only 8-tap filtering.
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd   ; h < 6: use the reduced-tap filter index
    vpbroadcastq         m0, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM      12, 15
    vpbroadcastd         m6, [pd_32] ; rounding bias for the >>6 below
    vpbroadcastw         m7, r8m     ; bitdepth_max, used as output clamp
    lea                  r6, [ssq*3]
    sub                srcq, r6      ; start 3 rows above the output row
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    ; Split the 8 coefficients into 4 pairs, one register per tap pair.
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    cmp                  wd, 4
    jg .v_w8
    je .v_w4
; w == 2: two pixels per row, two rows per iteration.
.v_w2:
    ; Load the first 7 rows and interleave them into adjacent-row word
    ; pairs (01 12 / 23 34 / 45 56) for pmaddwd with the paired taps.
    movd                xm2, [srcq+ssq*0]
    pinsrd              xm2, [srcq+ssq*1], 1
    pinsrd              xm2, [srcq+ssq*2], 2
    pinsrd              xm2, [srcq+r6   ], 3 ; 0 1 2 3
    lea                srcq, [srcq+ssq*4]
    movd                xm3, [srcq+ssq*0]
    vpbroadcastd        xm1, [srcq+ssq*1]
    vpbroadcastd        xm0, [srcq+ssq*2]
    add                srcq, r6
    vpblendd            xm3, xm1, 0x02       ; 4 5
    vpblendd            xm1, xm0, 0x02       ; 5 6
    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
    punpcklwd           xm3, xm1             ; 45 56
    punpcklwd           xm1, xm2, xm4        ; 01 12
    punpckhwd           xm2, xm4             ; 23 34
.v_w2_loop:
    vpbroadcastd        xm4, [srcq+ssq*0]
    pmaddwd             xm5, xm8, xm1        ; a0 b0
    mova                xm1, xm2             ; rotate the row-pair pipeline
    pmaddwd             xm2, xm9             ; a1 b1
    paddd               xm5, xm6             ; + rounding bias
    paddd               xm5, xm2
    mova                xm2, xm3
    pmaddwd             xm3, xm10            ; a2 b2
    paddd               xm5, xm3
    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
    vpbroadcastd        xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm4, xm0, 0x02       ; 7 8
    punpcklwd           xm3, xm4             ; 67 78
    pmaddwd             xm4, xm11, xm3       ; a3 b3
    paddd               xm5, xm4
    psrad               xm5, 6               ; (sum + 32) >> 6
    packusdw            xm5, xm5
    pminsw              xm5, xm7             ; clamp to bitdepth_max
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
; w == 4: one 64-bit row per lane half, two output rows per iteration.
.v_w4:
    ; Load rows 0-6 and build the interleaved adjacent-row pairs
    ; 01/12, 23/34, 45/56 across the two 128-bit lanes.
    movq                xm1, [srcq+ssq*0]
    vpbroadcastq         m0, [srcq+ssq*1]
    vpbroadcastq         m2, [srcq+ssq*2]
    vpbroadcastq         m4, [srcq+r6   ]
    lea                srcq, [srcq+ssq*4]
    vpbroadcastq         m3, [srcq+ssq*0]
    vpbroadcastq         m5, [srcq+ssq*1]
    vpblendd             m1, m0, 0x30
    vpblendd             m0, m2, 0x30
    punpcklwd            m1, m0      ; 01 12
    vpbroadcastq         m0, [srcq+ssq*2]
    add                srcq, r6
    vpblendd             m2, m4, 0x30
    vpblendd             m4, m3, 0x30
    punpcklwd            m2, m4      ; 23 34
    vpblendd             m3, m5, 0x30
    vpblendd             m5, m0, 0x30
    punpcklwd            m3, m5      ; 45 56
.v_w4_loop:
    vpbroadcastq         m4, [srcq+ssq*0]
    pmaddwd              m5, m8, m1  ; a0 b0
    mova                 m1, m2      ; rotate the row-pair pipeline
    pmaddwd              m2, m9      ; a1 b1
    paddd                m5, m6      ; + rounding bias (pd_32)
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, m10     ; a2 b2
    paddd                m5, m3
    vpblendd             m3, m0, m4, 0x30
    vpbroadcastq         m0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd             m4, m0, 0x30
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m11, m3 ; a3 b3
    paddd                m5, m4
    psrad                m5, 6       ; (sum + 32) >> 6
    vextracti128        xm4, m5, 1
    packusdw            xm5, xm4
    pminsw              xm5, xm7     ; clamp to bitdepth_max
    movq       [dstq+dsq*0], xm5
    movhps     [dstq+dsq*1], xm5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
; w >= 8: process 8-pixel columns, two rows per iteration, iterating over
; columns in an outer loop.
.v_w8:
    shl                  wd, 5
    WIN64_PUSH_XMM       15
    ; Pack the loop counters into wd: low byte = h (restored via movzx
    ; hd, wb each column), upper bits = remaining-column count, counted
    ; down in steps of 1<<8.
    lea                  wd, [hq+wq-256]
.v_w8_loop0:
    ; Prime the pipeline with rows 0-6, paired as 01/12/23/34/45/56.
    vbroadcasti128       m4, [srcq+ssq*0]
    vbroadcasti128       m5, [srcq+ssq*1]
    lea                  r7, [srcq+ssq*4]
    vbroadcasti128       m0, [srcq+r6   ]
    vbroadcasti128       m6, [srcq+ssq*2]
    mov                  r8, dstq
    vbroadcasti128       m1, [r7+ssq*0]
    vbroadcasti128       m2, [r7+ssq*1]
    vbroadcasti128       m3, [r7+ssq*2]
    add                  r7, r6
    shufpd               m4, m0, 0x0c
    shufpd               m5, m1, 0x0c
    punpcklwd            m1, m4, m5 ; 01
    punpckhwd            m4, m5     ; 34
    shufpd               m6, m2, 0x0c
    punpcklwd            m2, m5, m6 ; 12
    punpckhwd            m5, m6     ; 45
    shufpd               m0, m3, 0x0c
    punpcklwd            m3, m6, m0 ; 23
    punpckhwd            m6, m0     ; 56
.v_w8_loop:
    vbroadcasti128      m14, [r7+ssq*0]
    pmaddwd             m12, m8, m1  ; a0
    pmaddwd             m13, m8, m2  ; b0
    mova                 m1, m3      ; rotate the row-pair pipeline
    mova                 m2, m4
    pmaddwd              m3, m9      ; a1
    pmaddwd              m4, m9      ; b1
    paddd               m12, m3
    paddd               m13, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m10     ; a2
    pmaddwd              m6, m10     ; b2
    paddd               m12, m5
    vbroadcasti128       m5, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    paddd               m13, m6
    shufpd               m6, m0, m14, 0x0d
    shufpd               m0, m14, m5, 0x0c
    punpcklwd            m5, m6, m0  ; 67
    punpckhwd            m6, m0      ; 78
    pmaddwd             m14, m11, m5 ; a3
    paddd               m12, m14
    pmaddwd             m14, m11, m6 ; b3
    paddd               m13, m14
    ; Final scaling: >>5, then pavgw against zero adds the remaining
    ; rounding bit and >>1, for a net rounded >>6.
    psrad               m12, 5
    psrad               m13, 5
    packusdw            m12, m13
    pxor                m13, m13
    pavgw               m12, m13
    pminsw              m12, m7      ; clamp to bitdepth_max
    vpermq              m12, m12, q3120
    mova         [r8+dsq*0], xm12
    vextracti128 [r8+dsq*1], m12, 1
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    ; Advance to the next 8-pixel column.
    add                srcq, 16
    add                dstq, 16
    movzx                hd, wb      ; restore h from the packed counter
    sub                  wd, 1<<8    ; one fewer column remaining
    jg .v_w8_loop0
    RET
; Horizontal-only 8-tap filtering.
.h:
    RESET_STACK_STATE
    test                myd, 0xf00
    jnz .hv             ; both directions fractional -> combined path
    mov                 r7d, r8m
    vpbroadcastw         m5, r8m  ; bitdepth_max, used as output clamp
    shr                 r7d, 11   ; 0 for 10-bit, 1 for 12-bit
    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4] ; per-bpc rounding
    cmp                  wd, 4
    ; Narrow widths only need <= 4 taps worth of width handling; reuse
    ; the 6-tap function's .h_w2/.h_w4 code.
    jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
    je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
    WIN64_SPILL_XMM      13
    shr                 mxd, 16
    sub                srcq, 6    ; step back 3 pixels (6 bytes) for taps 0-2
    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
    vbroadcasti128       m6, [subpel_h_shufA]
    vbroadcasti128       m7, [subpel_h_shufB]
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    ; One register per coefficient pair.
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    sub                  wd, 16
    jge .h_w16
.h_w8:
; Filter 16 output pixels (abcd.. in the low dwords, efgh.. in the high)
; from 24 input pixels spread over three source registers.
%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
    pmaddwd             m%5, m9, m%4   ; abcd1
    pmaddwd             m%1, m8        ; abcd0
    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd               m%5, m4        ; + rounding bias
    paddd               m%1, m%5
    pmaddwd             m%5, m11, m%2  ; abcd3
    paddd               m%1, m%5
    pmaddwd             m%5, m10, m%4  ; abcd2
    pshufb              m%3, m7        ; a b b c c d d e
    pmaddwd             m%4, m8        ; efgh0
    paddd               m%1, m%5
    pmaddwd             m%5, m9, m%2   ; efgh1
    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd             m%3, m11       ; efgh3
    pmaddwd             m%2, m10       ; efgh2
    paddd               m%4, m4        ; + rounding bias
    paddd               m%4, m%5
    paddd               m%3, m%4
    paddd               m%2, m%3
    psrad               m%1, 6
    psrad               m%2, 6
    packusdw            m%1, m%2
    pminsw              m%1, m5        ; clamp to bitdepth_max
%endmacro
    ; w == 8: two rows per iteration, one lane per row.
    movu                xm0, [srcq+ssq*0+ 0]
    vinserti128          m0, [srcq+ssq*1+ 0], 1
    movu                xm2, [srcq+ssq*0+16]
    vinserti128          m2, [srcq+ssq*1+16], 1
    lea                srcq, [srcq+ssq*2]
    shufpd               m1, m0, m2, 0x05
    PUT_8TAP_H            0, 1, 2, 3, 12
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
; w >= 16: one row per outer iteration, 16 pixels per inner iteration,
; walking the row right-to-left (r6 counts down to 0).
.h_w16:
    mov                 r6d, wd
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 8]
    movu                 m2, [srcq+r6*2+16]
    PUT_8TAP_H            0, 1, 2, 3, 12
    mova        [dstq+r6*2], m0
    sub                 r6d, 16
    jge .h_w16_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w16
    RET
; Combined horizontal + vertical 8-tap filtering.
.hv:
    WIN64_SPILL_XMM      16
    vpbroadcastw        m15, r8m   ; bitdepth_max, used as output clamp
    cmp                  wd, 4
    jg .hv_w8
    ; w <= 4: 4-tap horizontal filter (subpel_filters+2 skips the outer
    ; taps), full 8-tap vertical filter.
    movzx               mxd, mxb
    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd   ; h < 6: use the reduced-tap filter index
    vpbroadcastq         m1, [base+subpel_filters+myq*8]
    vpbroadcastd         m6, [pd_512] ; rounding bias for the >>10 stages
    lea                  r6, [ssq*3]
    sub                srcq, 2      ; 1 pixel left for the 4-tap h-filter
    sub                srcq, r6     ; 3 rows up for the 8-tap v-filter
    pxor                 m7, m7
    punpcklbw            m7, m0     ; h-coefs as words, scaled by 256
    punpcklbw            m1, m1
    psraw                m1, 8 ; sign-extend
    ; For 12-bit content, rebalance precision between the two passes.
    test          dword r8m, 0x800
    jz .hv_10bit
    psraw                m7, 2
    psllw                m1, 2
.hv_10bit:
    ; Vertical coefficient pairs, one register per tap pair.
    pshufd              m11, m1, q0000
    pshufd              m12, m1, q1111
    pshufd              m13, m1, q2222
    pshufd              m14, m1, q3333
    cmp                  wd, 4
    je .hv_w4
    ; w == 2: horizontally filter the 7 setup rows (phaddd combines the
    ; two pixels per row), then run the vertical pipeline two rows at a
    ; time.
    vbroadcasti128       m9, [subpel_h_shuf2]
    vbroadcasti128       m1, [srcq+r6   ]    ; 3 3
    movu                xm3, [srcq+ssq*2]
    movu                xm0, [srcq+ssq*0]
    movu                xm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*4]
    vinserti128          m3, [srcq+ssq*0], 1 ; 2 4
    vinserti128          m0, [srcq+ssq*1], 1 ; 0 5
    vinserti128          m2, [srcq+ssq*2], 1 ; 1 6
    add                srcq, r6
    pshufb               m1, m9
    pshufb               m3, m9
    pshufb               m0, m9
    pshufb               m2, m9
    pmaddwd              m1, m7
    pmaddwd              m3, m7
    pmaddwd              m0, m7
    pmaddwd              m2, m7
    phaddd               m1, m3
    phaddd               m0, m2
    paddd                m1, m6
    paddd                m0, m6
    psrad                m1, 10
    psrad                m0, 10
    packssdw             m1, m0         ; 3 2 0 1
    vextracti128        xm0, m1, 1      ; 3 4 5 6
    pshufd              xm2, xm1, q1301 ; 2 3 1 2
    pshufd              xm3, xm0, q2121 ; 4 5 4 5
    punpckhwd           xm1, xm2        ; 01 12
    punpcklwd           xm2, xm0        ; 23 34
    punpckhwd           xm3, xm0        ; 45 56
.hv_w2_loop:
    ; Horizontal filter for the next two rows...
    movu                xm4, [srcq+ssq*0]
    movu                xm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm4, xm9
    pshufb              xm5, xm9
    pmaddwd             xm4, xm7
    pmaddwd             xm5, xm7
    phaddd              xm4, xm5
    ; ...interleaved with the vertical accumulation of rows already in
    ; the pipeline.
    pmaddwd             xm5, xm11, xm1 ; a0 b0
    mova                xm1, xm2
    pmaddwd             xm2, xm12      ; a1 b1
    paddd               xm5, xm2
    mova                xm2, xm3
    pmaddwd             xm3, xm13      ; a2 b2
    paddd               xm5, xm3
    paddd               xm4, xm6
    psrad               xm4, 10        ; finish the horizontal pass
    packssdw            xm4, xm4
    palignr             xm3, xm4, xm0, 12
    mova                xm0, xm4
    punpcklwd           xm3, xm0       ; 67 78
    pmaddwd             xm4, xm14, xm3 ; a3 b3
    paddd               xm5, xm6
    paddd               xm5, xm4
    psrad               xm5, 10        ; finish the vertical pass
    packusdw            xm5, xm5
    pminsw              xm5, xm15      ; clamp to bitdepth_max
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
; w == 4 combined filter: 4-tap horizontal, 8-tap vertical.
.hv_w4:
    vbroadcasti128       m9, [subpel_h_shufA]
    vbroadcasti128      m10, [subpel_h_shufB]
    pshufd               m8, m7, q1111  ; h-coef pair 1
    pshufd               m7, m7, q0000  ; h-coef pair 0
    ; Load and horizontally filter the 7 setup rows (two per register).
    movu                xm1, [srcq+ssq*0]
    vinserti128          m1, [srcq+ssq*1], 1     ; 0 1
    vbroadcasti128       m0, [srcq+r6   ]
    vinserti128          m2, m0, [srcq+ssq*2], 0 ; 2 3
    lea                srcq, [srcq+ssq*4]
    vinserti128          m0, [srcq+ssq*0], 1     ; 3 4
    movu                xm3, [srcq+ssq*1]
    vinserti128          m3, [srcq+ssq*2], 1     ; 5 6
    add                srcq, r6
    pshufb               m4, m1, m9
    pshufb               m1, m10
    pmaddwd              m4, m7
    pmaddwd              m1, m8
    pshufb               m5, m2, m9
    pshufb               m2, m10
    pmaddwd              m5, m7
    pmaddwd              m2, m8
    paddd                m4, m6       ; + rounding bias (pd_512)
    paddd                m1, m4
    pshufb               m4, m0, m9
    pshufb               m0, m10
    pmaddwd              m4, m7
    pmaddwd              m0, m8
    paddd                m5, m6
    paddd                m2, m5
    pshufb               m5, m3, m9
    pshufb               m3, m10
    pmaddwd              m5, m7
    pmaddwd              m3, m8
    paddd                m4, m6
    paddd                m4, m0
    paddd                m5, m6
    paddd                m5, m3
    ; Reduce the 32-bit sums to words and interleave adjacent rows into
    ; the vertical pipeline pairs via shift + blend.
    vperm2i128           m0, m1, m2, 0x21
    psrld                m1, 10
    psrld                m2, 10
    vperm2i128           m3, m4, m5, 0x21
    pslld                m4, 6
    pslld                m5, 6
    pblendw              m2, m4, 0xaa ; 23 34
    pslld                m0, 6
    pblendw              m1, m0, 0xaa ; 01 12
    psrld                m3, 10
    pblendw              m3, m5, 0xaa ; 45 56
    psrad                m0, m5, 16
.hv_w4_loop:
    ; Vertical accumulation of the pipeline rows...
    movu                xm4, [srcq+ssq*0]
    vinserti128          m4, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m11, m1   ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m12       ; a1 b1
    paddd                m5, m6
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, m13       ; a2 b2
    paddd                m5, m3
    ; ...plus the horizontal filter of the two new rows.
    pshufb               m3, m4, m9
    pshufb               m4, m10
    pmaddwd              m3, m7
    pmaddwd              m4, m8
    paddd                m3, m6
    paddd                m4, m3
    psrad                m4, 10
    packssdw             m0, m4        ; _ 7 6 8
    vpermq               m3, m0, q1122 ; _ 6 _ 7
    punpckhwd            m3, m0        ; 67 78
    mova                 m0, m4
    pmaddwd              m4, m14, m3   ; a3 b3
    paddd                m4, m5
    psrad                m4, 10
    vextracti128        xm5, m4, 1
    packusdw            xm4, xm5
    pminsw              xm4, xm15      ; clamp to bitdepth_max
    movq       [dstq+dsq*0], xm4
    movhps     [dstq+dsq*1], xm4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
; w >= 8 combined filter: full 8-tap in both directions, processed in
; 8-pixel columns.
.hv_w8:
    shr                 mxd, 16
    vpbroadcastq         m2, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd   ; h < 6: use the reduced-tap filter index
    pmovsxbw            xm1, [base+subpel_filters+myq*8]
    shl                  wd, 5
    lea                  r6, [ssq*3]
    sub                srcq, 6     ; 3 pixels left for the h-filter
    pxor                 m0, m0
    sub                srcq, r6    ; 3 rows up for the v-filter
    punpcklbw            m0, m2    ; h-coefs as words, scaled by 256
    ; Pack the loop counters into wd: low byte = h, upper bits = column
    ; count (see movzx hd, wb / sub wd, 1<<8 at the bottom).
    lea                  wd, [hq+wq-256]
    ; For 12-bit content, rebalance precision between the two passes.
    test          dword r8m, 0x800
    jz .hv_w8_10bit
    psraw                m0, 2
    psllw               xm1, 2
.hv_w8_10bit:
    pshufd              m11, m0, q0000
    pshufd              m12, m0, q1111
    ; Vertical coefficients are kept in a stack scratch slot (v_mul,
    ; defined earlier in the file) and rebroadcast per loop iteration,
    ; because all 16 ymm registers are needed for data.
    mova            [v_mul], xm1
    pshufd              m13, m0, q2222
    pshufd              m14, m0, q3333
.hv_w8_loop0:
; Horizontally filter 16 pixels from 24 input pixels; same structure as
; PUT_8TAP_H but with the hv register assignment and >>10 scaling.
%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
    pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
    pshufb              m%1, m8        ; 0 1 1 2 2 3 3 4
    pmaddwd              m3, m12, m2
    pmaddwd             m%1, m11
    pshufb              m%2, m9        ; 6 7 7 8 8 9 9 a
    shufpd               m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd                m3, m10       ; + rounding bias (pd_512)
    paddd               m%1, m3
    pmaddwd              m3, m14, m%2
    paddd               m%1, m3
    pmaddwd              m3, m13, m2
    pshufb              m%3, m9        ; a b b c c d d e
    pmaddwd              m2, m11
    paddd               m%1, m3
    pmaddwd              m3, m12, m%2
    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd             m%3, m14
    pmaddwd             m%2, m13
    paddd                m2, m10
    paddd                m2, m3
    paddd               m%3, m2
    paddd               m%2, m%3
    psrad               m%1, 10
    psrad               m%2, 10
    packssdw            m%1, m%2
%endmacro
    ; Horizontally filter the 7 setup rows, two rows per ymm register
    ; (row 3 alone, as xmm, via the INIT_XMM invocation below).
    movu                xm4, [srcq+r6 *1+ 0]
    vbroadcasti128       m8, [subpel_h_shufA]
    lea                  r7, [srcq+ssq*4]
    movu                xm6, [srcq+r6 *1+ 8]
    vbroadcasti128       m9, [subpel_h_shufB]
    mov                  r8, dstq
    movu                xm0, [srcq+r6 *1+16]
    vpbroadcastd        m10, [pd_512]
    movu                xm5, [srcq+ssq*0+ 0]
    vinserti128          m5, [r7  +ssq*0+ 0], 1
    movu                xm1, [srcq+ssq*0+16]
    vinserti128          m1, [r7  +ssq*0+16], 1
    shufpd               m7, m5, m1, 0x05
    INIT_XMM avx2
    PUT_8TAP_HV_H         4, 6, 0    ; 3
    INIT_YMM avx2
    PUT_8TAP_HV_H         5, 7, 1    ; 0 4
    movu                xm0, [srcq+ssq*2+ 0]
    vinserti128          m0, [srcq+r6 *2+ 0], 1
    movu                xm1, [srcq+ssq*2+16]
    vinserti128          m1, [srcq+r6 *2+16], 1
    shufpd               m7, m0, m1, 0x05
    PUT_8TAP_HV_H         0, 7, 1    ; 2 6
    movu                xm6, [srcq+ssq*1+ 0]
    movu                xm1, [srcq+ssq*1+16]
    vinserti128          m6, [r7  +ssq*1+ 0], 1
    vinserti128          m1, [r7  +ssq*1+16], 1
    add                  r7, r6
    shufpd               m7, m6, m1, 0x05
    PUT_8TAP_HV_H         6, 7, 1    ; 1 5
    ; Interleave the filtered rows into adjacent-row pairs 01..56.
    vpermq               m4, m4, q1100
    vpermq               m5, m5, q3120
    vpermq               m6, m6, q3120
    vpermq               m7, m0, q3120
    punpcklwd            m3, m7, m4  ; 23
    punpckhwd            m4, m5      ; 34
    punpcklwd            m1, m5, m6  ; 01
    punpckhwd            m5, m6      ; 45
    punpcklwd            m2, m6, m7  ; 12
    punpckhwd            m6, m7      ; 56
.hv_w8_loop:
    ; Rebroadcast the vertical coefficient pairs from the stack slot.
    vpbroadcastd         m9, [v_mul+4*0]
    vpbroadcastd         m7, [v_mul+4*1]
    vpbroadcastd        m10, [v_mul+4*2]
    pmaddwd              m8, m9, m1  ; a0
    pmaddwd              m9, m2      ; b0
    mova                 m1, m3      ; rotate the row-pair pipeline
    mova                 m2, m4
    pmaddwd              m3, m7      ; a1
    pmaddwd              m4, m7      ; b1
    paddd                m8, m3
    paddd                m9, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m10     ; a2
    pmaddwd              m6, m10     ; b2
    paddd                m8, m5
    paddd                m9, m6
    ; Horizontal filter for the next two rows, with shuffle masks and
    ; rounding constant reloaded into the freed registers.
    movu                xm5, [r7+ssq*0]
    vinserti128          m5, [r7+ssq*1], 1
    vbroadcasti128       m7, [subpel_h_shufA]
    vbroadcasti128      m10, [subpel_h_shufB]
    movu                xm6, [r7+ssq*0+16]
    vinserti128          m6, [r7+ssq*1+16], 1
    ; Registers are exhausted: spill row 6 (high half of m0) into the
    ; dst row that is about to be overwritten; reloaded below.
    vextracti128       [r8], m0, 1
    pshufb               m0, m5, m7  ; 01
    pshufb               m5, m10     ; 23
    pmaddwd              m0, m11
    pmaddwd              m5, m12
    paddd                m0, m5
    pshufb               m5, m6, m7  ; 89
    pshufb               m6, m10     ; ab
    pmaddwd              m5, m13
    pmaddwd              m6, m14
    paddd                m6, m5
    movu                xm5, [r7+ssq*0+8]
    vinserti128          m5, [r7+ssq*1+8], 1
    lea                  r7, [r7+ssq*2]
    pshufb               m7, m5, m7
    pshufb               m5, m10
    pmaddwd             m10, m13, m7
    pmaddwd              m7, m11
    paddd                m0, m10
    vpbroadcastd        m10, [pd_512]
    paddd                m6, m7
    pmaddwd              m7, m14, m5
    pmaddwd              m5, m12
    paddd                m0, m7
    paddd                m5, m6
    vbroadcasti128       m6, [r8]    ; reload the spilled row 6
    paddd                m8, m10
    paddd                m9, m10
    paddd                m0, m10
    paddd                m5, m10
    vpbroadcastd        m10, [v_mul+4*3]
    psrad                m0, 10
    psrad                m5, 10
    packssdw             m0, m5
    vpermq               m7, m0, q3120 ; 7 8
    shufpd               m6, m7, 0x04  ; 6 7
    punpcklwd            m5, m6, m7    ; 67
    punpckhwd            m6, m7        ; 78
    pmaddwd              m7, m10, m5   ; a3
    pmaddwd             m10, m6        ; b3
    paddd                m7, m8
    paddd                m9, m10
    psrad                m7, 10
    psrad                m9, 10
    packusdw             m7, m9
    pminsw               m7, m15       ; clamp to bitdepth_max
    vpermq               m7, m7, q3120
    mova         [r8+dsq*0], xm7
    vextracti128 [r8+dsq*1], m7, 1
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    ; Advance to the next 8-pixel column.
    add                srcq, 16
    add                dstq, 16
    movzx                hd, wb        ; restore h from the packed counter
    sub                  wd, 1<<8      ; one fewer column remaining
    jg .hv_w8_loop0
    RET
2454
; Scratch register aliases (t0/t1) for the prep entry stubs; they differ
; per ABI because the argument registers differ.
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Entry-point wrappers for the filter combinations servable by the 6-tap
; prep path; the last one omits the target and falls through into the
; cglobal below.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR
2466
2467cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
2468%define base r7-prep_avx2
2469    imul                mxd, mxm, 0x010101
2470    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
2471    imul                myd, mym, 0x010101
2472    add                 myd, t1d ; 6tap_v, my, 4tap_v
2473    lea                  r7, [prep_avx2]
2474    movifnidn            hd, hm
2475    test                mxd, 0xf00
2476    jnz .h
2477    test                myd, 0xf00
2478    jnz .v
2479.prep:
2480    tzcnt                wd, wd
2481    mov                 r6d, r7m ; bitdepth_max
2482    movzx                wd, word [r7+wq*2+table_offset(prep,)]
2483    vpbroadcastd         m5, [r7-prep_avx2+pw_8192]
2484    shr                 r6d, 11
2485    add                  wq, r7
2486    vpbroadcastd         m4, [base+prep_mul+r6*4]
2487    lea                  r6, [ssq*3]
2488%if WIN64
2489    pop                  r7
2490%endif
2491    jmp                  wq
2492.h_w4:
2493    movzx               mxd, mxb
2494    sub                srcq, 2
2495    pmovsxbw            xm0, [base+subpel_filters+mxq*8]
2496    vbroadcasti128       m3, [subpel_h_shufA]
2497    lea                  r6, [ssq*3]
2498    vbroadcasti128       m4, [subpel_h_shufB]
2499    WIN64_SPILL_XMM       8
2500    pshufd              xm0, xm0, q2211
2501    test          dword r7m, 0x800
2502    jnz .h_w4_12bpc
2503    psllw               xm0, 2
2504.h_w4_12bpc:
2505    vpbroadcastq         m6, xm0
2506    vpermq               m7, m0, q1111
2507.h_w4_loop:
2508    movu                xm1, [srcq+ssq*0]
2509    vinserti128          m1, [srcq+ssq*2], 1
2510    movu                xm2, [srcq+ssq*1]
2511    vinserti128          m2, [srcq+r6 *1], 1
2512    lea                srcq, [srcq+ssq*4]
2513    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
2514    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
2515    pmaddwd              m0, m6
2516    pmaddwd              m1, m7
2517    paddd                m0, m5
2518    paddd                m0, m1
2519    pshufb               m1, m2, m3
2520    pshufb               m2, m4
2521    pmaddwd              m1, m6
2522    pmaddwd              m2, m7
2523    paddd                m1, m5
2524    paddd                m1, m2
2525    psrad                m0, 4
2526    psrad                m1, 4
2527    packssdw             m0, m1
2528    mova             [tmpq], m0
2529    add                tmpq, 32
2530    sub                  hd, 4
2531    jg .h_w4_loop
2532    RET
2533.h:
    ; prep_6tap 16bpc, horizontal-only path (no fractional my).
2534    test                myd, 0xf00
2535    jnz .hv
2536    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
2537    cmp                  wd, 4
2538    je .h_w4
    ; w >= 8: select the 8-tap table row via the high half of mxd, then
    ; back srcq up 2 pixels (4 bytes at 16bpc) for the filter's left reach
2539    shr                 mxd, 16
2540    sub                srcq, 4
    ; +1 byte offset loads coefficient bytes starting at tap 1; only the
    ; first three word-pairs (6 taps, q0000/q1111/q2222 below) are used
2541    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
2542    WIN64_SPILL_XMM      10
2543    vbroadcasti128       m6, [subpel_h_shufA]
    ; sign-extend byte coefficients to words
2544    punpcklbw            m0, m0
2545    psraw                m0, 8 ; sign-extend
    ; 10bpc: pre-scale coefficients by 4 (see .h_w4 path); 12bpc skips it
2546    test          dword r7m, 0x800
2547    jnz .h_12bpc
2548    psllw                m0, 2
2549.h_12bpc:
    ; m7/m8/m9 = the three tap pairs of the 6-tap horizontal filter
2550    pshufd               m7, m0, q0000
2551    pshufd               m8, m0, q1111
2552    pshufd               m9, m0, q2222
2553    cmp                  wd, 8
2554    jg .h_w16
2555.h_w8:
    ; two rows per iteration, one per 128-bit lane; m1 is the 8-byte-
    ; shifted overlap of the two fetches (shufpd 0x05)
2556    movu                xm0, [srcq+ssq*0+ 0]
2557    vinserti128          m0, [srcq+ssq*1+ 0], 1
2558    movu                xm2, [srcq+ssq*0+16]
2559    vinserti128          m2, [srcq+ssq*1+16], 1
2560    lea                srcq, [srcq+ssq*2]
2561    shufpd               m1, m0, m2, 0x05
; 6-tap horizontal filter producing 8 packed output words per lane from
; three overlapping source fetches (a = pixels 0-3, b = pixels 4-7);
; rounds with m5, shifts right by 4 and packs to signed words.
2562%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
2563    pshufb              m%1, m6        ; 01 12 23 34
2564    pshufb              m%2, m6        ; 45 56 67 78
2565    pmaddwd             m%4, m7, m%1   ; a0
2566    pshufb              m%3, m6        ; 89 9a ab bc
2567    pmaddwd             m%5, m9, m%2   ; a2
2568    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
2569    paddd               m%4, m%5       ; a0+a2
2570    pmaddwd             m%5, m7, m%2   ; b0
2571    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
2572    pmaddwd             m%3, m9        ; b2
2573    pmaddwd             m%1, m8        ; a1
2574    pmaddwd             m%2, m8        ; b1
2575    paddd               m%3, m%5       ; b0+b2
2576    paddd               m%4, m5
2577    paddd               m%3, m5
2578    paddd               m%1, m%4
2579    paddd               m%2, m%3
2580    psrad               m%1, 4
2581    psrad               m%2, 4
2582    packssdw            m%1, m%2
2583%endmacro
2584    PREP_6TAP_H           0, 1, 2, 3, 4
2585    mova             [tmpq], m0
2586    add                tmpq, 32
2587    sub                  hd, 2
2588    jg .h_w8
2589    RET
2590.h_w16:
    ; w > 8: wd becomes a byte count (w*2); r6 walks each row from the
    ; right edge in 32-byte (16-pixel) chunks, one row per outer pass
2591    add                  wd, wd
2592.h_w16_loop0:
2593    mov                 r6d, wd
2594.h_w16_loop:
2595    movu                 m0, [srcq+r6-32]
2596    movu                 m1, [srcq+r6-24]
2597    movu                 m2, [srcq+r6-16]
2598    PREP_6TAP_H           0, 1, 2, 3, 4
2599    mova       [tmpq+r6-32], m0
2600    sub                 r6d, 32
2601    jg .h_w16_loop
2602    add                srcq, ssq
2603    add                tmpq, wq
2604    dec                  hd
2605    jg .h_w16_loop0
2606    RET
2607.v:
    ; prep_6tap 16bpc, vertical-only path. For h <= 4 the secondary filter
    ; index (low byte of myd) is used instead of the primary (cmove).
2608    movzx               mxd, myb
2609    shr                 myd, 16
2610    cmp                  hd, 4
2611    cmove               myd, mxd
    ; 6 central taps (+1 byte offset into the 8-tap table row)
2612    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
2613    WIN64_SPILL_XMM       9, 12
2614    vpbroadcastd         m5, [prep_8tap_1d_rnd]
    ; r6 = -stride, used to address the two rows above srcq
2615    mov                  r6, ssq
2616    punpcklbw            m0, m0
2617    neg                  r6
2618    psraw                m0, 8 ; sign-extend
    ; 10bpc: coefficients pre-scaled by 4, matching the psrad-by-4 output
2619    test          dword r7m, 0x800
2620    jnz .v_12bpc
2621    psllw                m0, 2
2622.v_12bpc:
    ; m6/m7/m8 = the three tap pairs of the 6-tap vertical filter
2623    pshufd               m6, m0, q0000
2624    pshufd               m7, m0, q1111
2625    pshufd               m8, m0, q2222
2626    cmp                  wd, 4
2627    jg .v_w8
2628.v_w4:
    ; prime the sliding row-pair registers (two output rows per iteration,
    ; one per 128-bit lane): m1 = rows 01|12, m2 = rows 23|34
2629    movq                xm1, [srcq+r6 *2]
2630    vpbroadcastq         m3, [srcq+r6 *1]
2631    vpbroadcastq         m2, [srcq+ssq*0]
2632    vpbroadcastq         m4, [srcq+ssq*1]
2633    lea                srcq, [srcq+ssq*2]
2634    vpbroadcastq         m0, [srcq+ssq*0]
2635    vpblendd             m1, m3, 0x30
2636    vpblendd             m3, m2, 0x30
2637    punpcklwd            m1, m3     ; 01 12
2638    vpblendd             m2, m4, 0x30
2639    vpblendd             m4, m0, 0x30
2640    punpcklwd            m2, m4     ; 23 34
2641.v_w4_loop:
2642    vpbroadcastq         m3, [srcq+ssq*1]
2643    lea                srcq, [srcq+ssq*2]
2644    pmaddwd              m4, m6, m1 ; a0 b0
2645    mova                 m1, m2
2646    pmaddwd              m2, m7     ; a1 b1
2647    paddd                m4, m2
2648    vpblendd             m2, m0, m3, 0x30
2649    vpbroadcastq         m0, [srcq+ssq*0]
2650    vpblendd             m3, m0, 0x30
2651    punpcklwd            m2, m3     ; 45 56
2652    pmaddwd              m3, m8, m2 ; a2 b2
2653    paddd                m4, m5
2654    paddd                m4, m3
2655    psrad                m4, 4
2656    vextracti128        xm3, m4, 1
2657    packssdw            xm4, xm3
2658    mova             [tmpq], xm4
2659    add                tmpq, 16
2660    sub                  hd, 2
2661    jg .v_w4_loop
2662    RET
2663.v_w8:
2664    WIN64_PUSH_XMM       12
2665%if WIN64
2666    push                 r8
2667%endif
    ; column-strip loop bookkeeping: r8d = w (row pitch in words; stores
    ; below use r8*2 bytes), wd = 32*w + h - 256 so "movzx hd, wb"
    ; restores h per strip and "sub wd, 1<<8" counts down the w/8 strips
2668    mov                 r8d, wd
2669    shl                  wd, 5
2670    lea                  wd, [hq+wq-256]
2671.v_w8_loop0:
    ; prime rows -2..2 for one 8-pixel-wide strip; rows live one per lane,
    ; interleaved into word pairs 01/23 (m1/m3) and 12/34 (m2/m4)
2672    vbroadcasti128       m3, [srcq+r6 *2]
2673    vbroadcasti128       m4, [srcq+r6 *1]
2674    lea                  r5, [srcq+ssq*2]
2675    vbroadcasti128       m0, [srcq+ssq*0]
2676    vbroadcasti128       m1, [srcq+ssq*1]
2677    mov                  r7, tmpq
2678    vbroadcasti128       m2, [r5+ssq*0]
2679    shufpd               m3, m0, 0x0c
2680    shufpd               m4, m1, 0x0c
2681    punpcklwd            m1, m3, m4 ; 01
2682    punpckhwd            m3, m4     ; 23
2683    shufpd               m0, m2, 0x0c
2684    punpcklwd            m2, m4, m0 ; 12
2685    punpckhwd            m4, m0     ; 34
2686.v_w8_loop:
2687    vbroadcasti128       m9, [r5+ssq*1]
2688    pmaddwd             m10, m6, m1 ; a0
2689    lea                  r5, [r5+ssq*2]
2690    pmaddwd             m11, m6, m2 ; b0
2691    mova                 m1, m3
2692    pmaddwd              m3, m7     ; a1
2693    mova                 m2, m4
2694    pmaddwd              m4, m7     ; b1
2695    paddd               m10, m5
2696    paddd               m11, m5
2697    paddd               m10, m3
2698    vbroadcasti128       m3, [r5+ssq*0]
2699    paddd               m11, m4
2700    shufpd               m4, m0, m9, 0x0d
2701    shufpd               m0, m9, m3, 0x0c
2702    punpcklwd            m3, m4, m0 ; 45
2703    punpckhwd            m4, m0     ; 56
2704    pmaddwd              m9, m8, m3 ; a2
2705    paddd               m10, m9
2706    pmaddwd              m9, m8, m4 ; b2
2707    paddd               m11, m9
2708    psrad               m10, 4
2709    psrad               m11, 4
2710    packssdw            m10, m11
    ; un-interleave the two output rows and store them r8*2 bytes apart
2711    vpermq              m10, m10, q3120
2712    mova          [r7+r8*0], xm10
2713    vextracti128  [r7+r8*2], m10, 1
2714    lea                  r7, [r7+r8*4]
2715    sub                  hd, 2
2716    jg .v_w8_loop
2717    add                srcq, 16
2718    add                tmpq, 16
2719    movzx                hd, wb
2720    sub                  wd, 1<<8
2721    jg .v_w8_loop0
2722%if WIN64
2723    pop                  r8
2724%endif
2725    RET
2726.hv:
    ; prep_6tap 16bpc, horizontal+vertical path. m7 = 2-D rounding
    ; constant; both the horizontal intermediate and the vertical result
    ; are shifted right by 6.
2727    WIN64_SPILL_XMM      13, 15
2728    vpbroadcastd         m7, [prep_8tap_2d_rnd]
2729    vbroadcasti128       m8, [subpel_h_shufA]
2730    cmp                  wd, 4
2731    jg .hv_w8
    ; w==4: 4-tap horizontal filter (table row offset +2), 6-tap vertical
2732    movzx               mxd, mxb
2733    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
2734    movzx               mxd, myb
2735    shr                 myd, 16
2736    cmp                  hd, 4
2737    cmove               myd, mxd
2738    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
2739    mov                  r6, ssq
2740    sub                srcq, 2
2741    pxor                 m6, m6
2742    neg                  r6
    ; horizontal coefs to words via the high byte (<<8) then >>4 (x16);
    ; 12bpc shifts down 2 more, i.e. 10bpc keeps an extra x4 scale
2743    punpcklbw            m6, m0
2744    punpcklbw            m1, m1
2745    psraw                m6, 4
2746    psraw                m1, 8
2747    test          dword r7m, 0x800
2748    jz .hv_w4_10bit
2749    psraw                m6, 2
2750.hv_w4_10bit:
    ; m10-m12 = the three tap pairs of the 6-tap vertical filter
2751    pshufd              m10, m1, q0000
2752    pshufd              m11, m1, q1111
2753    pshufd              m12, m1, q2222
2754.hv_w4:
    ; horizontally filter the first five rows (r6 = -stride): rows 0-1 in
    ; m2's lanes, rows 2-3 in m0, row 4 in xm3
2755    movu                xm2, [srcq+r6 *2]
2756    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
2757    pshufd               m5, m6, q0000
2758    vbroadcasti128       m9, [base+subpel_h_shufB]
2759    movu                xm0, [srcq+ssq*0]
2760    pshufd               m6, m6, q1111
2761    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
2762    lea                srcq, [srcq+ssq*2]
2763    movu                xm3, [srcq+ssq*0]    ; 4
2764    pshufb               m1, m2, m8
2765    pmaddwd              m1, m5
2766    pshufb               m2, m9
2767    pmaddwd              m2, m6
2768    pshufb               m4, m0, m8
2769    pmaddwd              m4, m5
2770    pshufb               m0, m9
2771    pmaddwd              m0, m6
2772    paddd                m2, m1
2773    pshufb              xm1, xm3, xm8
2774    pmaddwd             xm1, xm5
2775    pshufb              xm3, xm9
2776    pmaddwd             xm3, xm6
2777    paddd                m0, m4
2778    paddd                m2, m7
2779    paddd               xm1, xm7
2780    paddd                m0, m7
2781    paddd               xm3, xm1
2782    REPX       {psrad x, 6}, m2, m0, xm3
    ; build the rolling vertical row pairs 01|12 (m1) and 23|34 (m2)
2783    packssdw             m2, m0      ; 0 2   1 3
2784    packssdw            xm0, xm3     ; 2 4
2785    vperm2i128           m0, m2, 0x03
2786    punpcklwd            m1, m2, m0  ; 01 12
2787    punpckhwd            m2, m0      ; 23 34
2788.hv_w4_loop:
    ; two new rows per iteration: horizontal filter, then 6-tap vertical
    ; across the rolling 01/12, 23/34, 45/56 pairs
2789    movu                xm3, [srcq+ssq*1]
2790    lea                srcq, [srcq+ssq*2]
2791    vinserti128          m3, [srcq+ssq*0], 1
2792    pmaddwd              m4, m10, m1 ; a0 b0
2793    mova                 m1, m2
2794    pmaddwd              m2, m11     ; a1 b1
2795    paddd                m4, m2
2796    pshufb               m2, m3, m8
2797    pmaddwd              m2, m5
2798    pshufb               m3, m9
2799    pmaddwd              m3, m6
2800    paddd                m2, m7
2801    paddd                m3, m2
2802    psrad                m3, 6
2803    packssdw             m3, m3      ; 5 5   6 6
2804    vperm2i128           m2, m0, m3, 0x21
2805    mova                 m0, m3
2806    punpckhwd            m2, m3      ; 45 56
2807    pmaddwd              m3, m12, m2 ; a2 b2
2808    paddd                m4, m7
2809    paddd                m4, m3
2810    psrad                m4, 6
2811    vextracti128        xm3, m4, 1
2812    packssdw            xm4, xm3
2813    mova             [tmpq], xm4
2814    add                tmpq, 16
2815    sub                  hd, 2
2816    jg .hv_w4_loop
2817    RET
2818.hv_w8:
    ; w >= 8: 6-tap in both directions, processed in 8-pixel column strips
2819    shr                 mxd, 16
2820    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
2821    movzx               mxd, myb
2822    shr                 myd, 16
2823    cmp                  hd, 4
2824    cmove               myd, mxd
2825    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
2826    WIN64_PUSH_XMM       15
2827%if WIN64
2828    PUSH                 r8
2829%endif
    ; same strip-count packing as .v_w8: r8d = w, wd = 32*w + h - 256
2830    mov                 r8d, wd
2831    shl                  wd, 5
2832    mov                  r6, ssq
2833    sub                srcq, 4
2834    neg                  r6
2835    lea                  wd, [hq+wq-256]
    ; horizontal coefs: same x16 (10bpc) / x4 (12bpc) word scaling as w4
2836    pxor                 m0, m0
2837    punpcklbw            m0, m2
2838    psraw                m0, 4
2839    test          dword r7m, 0x800
2840    jz .hv_w8_10bit
2841    psraw                m0, 2
2842.hv_w8_10bit:
2843    pshufd              m10, m0, q0000
2844    pshufd              m11, m0, q1111
    ; v_mul: stack slot (declared in the cglobal prologue earlier in the
    ; file -- presumably) holding the vertical filter words for the loop
2845    mova            [v_mul], xm1
2846    pshufd              m12, m0, q2222
2847.hv_w8_loop0:
    ; load and horizontally filter rows -2..2 of this strip; the lane
    ; pairing (0 2 / 2 4 / 1 3) is resolved by the vpermq q3120 below
2848    vbroadcasti128       m0, [srcq+ssq*0+ 0]
2849    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
2850    lea                  r5, [srcq+ssq*2]
2851    vbroadcasti128       m2, [srcq+ssq*0+16]
2852    vinserti128          m1, m2, [srcq+r6*2+16], 0
2853    mov                  r7, tmpq
2854    vinserti128          m0, [r5  +ssq*0+ 0], 1
2855    vinserti128          m2, [r5  +ssq*0+16], 1
2856    shufpd               m4, m3, m1, 0x05
; 6-tap horizontal filter for the 2-D path: identical structure to
; PREP_6TAP_H but uses the hv coefficient registers (m10-m12), the 2-D
; rounding constant (m7) and a >> 6 intermediate shift.
2857%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
2858    pshufb              m%1, m8        ; 01 12 23 34
2859    pshufb              m%2, m8        ; 45 56 67 78
2860    pmaddwd             m%4, m10, m%1  ; a0
2861    pshufb              m%3, m8        ; 89 9a ab bc
2862    pmaddwd             m%5, m12, m%2  ; a2
2863    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
2864    paddd               m%4, m%5       ; a0+a2
2865    pmaddwd             m%5, m10, m%2  ; b0
2866    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
2867    pmaddwd             m%3, m12       ; b2
2868    pmaddwd             m%1, m11       ; a1
2869    pmaddwd             m%2, m11       ; b1
2870    paddd               m%3, m%5       ; b0+b2
2871    paddd               m%4, m7
2872    paddd               m%3, m7
2873    paddd               m%1, m%4
2874    paddd               m%2, m%3
2875    psrad               m%1, 6
2876    psrad               m%2, 6
2877    packssdw            m%1, m%2
2878%endmacro
2879    PREP_6TAP_H Ignored
2880    movu                xm4, [srcq+r6 *1+ 0]
2881    vinserti128          m4, [srcq+ssq*1+ 0], 1
2882    shufpd               m1, m0, m2, 0x05
2883    PREP_6TAP_HV_H        0, 1, 2, 5, 6  ; 2 4
2884    movu                xm2, [srcq+r6 *1+16]
2885    vinserti128          m2, [srcq+ssq*1+16], 1
2886    shufpd               m1, m4, m2, 0x05
2887    PREP_6TAP_HV_H        4, 1, 2, 5, 6  ; 1 3
    ; de-interleave rows and build the vertical word pairs 01/23/12/34
2888    vpermq               m3, m3, q3120
2889    vpermq               m4, m4, q3120
2890    vpermq               m0, m0, q3120
2891    punpcklwd            m1, m3, m4     ; 01
2892    punpckhwd            m3, m4         ; 23
2893    punpcklwd            m2, m4, m0     ; 12
2894    punpckhwd            m4, m0         ; 34
2895.hv_w8_loop:
    ; vertical 6-tap: taps 0/1 from v_mul, horizontal-filter the next two
    ; rows in-loop, then taps 2 complete a (even row) and b (odd row)
2896    vpbroadcastd        m14, [v_mul+4*0]
2897    vpbroadcastd         m9, [v_mul+4*1]
2898    movu                xm5, [r5+ssq*1+ 0]
2899    movu                xm6, [r5+ssq*1+16]
2900    lea                  r5, [r5+ssq*2]
2901    pmaddwd             m13, m14, m1    ; a0
2902    pmaddwd             m14, m2         ; b0
2903    vinserti128          m5, [r5+ssq*0+ 0], 1
2904    vinserti128          m6, [r5+ssq*0+16], 1
2905    mova                 m1, m3
2906    pmaddwd              m3, m9         ; a1
2907    mova                 m2, m4
2908    pmaddwd              m4, m9         ; b1
2909    paddd               m13, m3
2910    shufpd               m3, m5, m6, 0x05
2911    paddd               m14, m4
2912    PREP_6TAP_HV_H        5, 3, 6, 4, 9 ; 5 6
2913    vpbroadcastd         m6, [v_mul+4*2]
2914    vpermq               m5, m5, q3120
2915    shufpd               m4, m0, m5, 0x05
2916    mova                 m0, m5
2917    punpcklwd            m3, m4, m5     ; 45
2918    punpckhwd            m4, m5         ; 56
2919    pmaddwd              m5, m6, m3     ; a2
2920    pmaddwd              m6, m4         ; b2
2921    paddd               m13, m7
2922    paddd               m14, m7
2923    paddd                m5, m13
2924    paddd                m6, m14
2925    psrad                m5, 6
2926    psrad                m6, 6
2927    packssdw             m5, m6
2928    vpermq               m5, m5, q3120
2929    mova          [r7+r8*0], xm5
2930    vextracti128  [r7+r8*2], m5, 1
2931    lea                  r7, [r7+r8*4]
2932    sub                  hd, 2
2933    jg .hv_w8_loop
2934    add                srcq, 16
2935    add                tmpq, 16
2936    movzx                hd, wb
2937    sub                  wd, 1<<8
2938    jg .hv_w8_loop0
2939%if WIN64
2940    POP                  r8
2941%endif
2942    RET
2943
; Entry points for the mx/my filter combinations that need the full 8-tap
; code path. NOTE(review): PREP_8TAP_FN is defined earlier in the file;
; the final invocation omits the 4th (function) argument, presumably
; because it falls through into cglobal prep_8tap_16bpc below, while the
; others alias/jump to it -- confirm against the macro definition.
2944PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
2945PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
2946PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
2947PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
2948PREP_8TAP_FN sharp,          SHARP,   SHARP
2949
; prep_8tap_16bpc: write the 8-tap-filtered intermediate prediction into
; tmp. Dispatches on whether mx/my have fractional parts; the no-filter
; (copy) case is shared with the 6-tap function.
2950cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
2951%define base r7-prep_avx2
    ; fold the filter-type selector (t0d/t1d, presumably set by
    ; PREP_8TAP_FN -- confirm) into mx/my: the high bits pick the
    ; subpel_filters table row, the low byte keeps the subpel fraction
2952    imul                mxd, mxm, 0x010101
2953    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2954    imul                myd, mym, 0x010101
2955    add                 myd, t1d ; 8tap_v, my, 4tap_v
2956    lea                  r7, [prep_avx2]
2957    movifnidn            hd, hm
2958    test                mxd, 0xf00
2959    jnz .h
2960    test                myd, 0xf00
    ; both fractions zero: plain copy path of the 6-tap version
2961    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
2962.v:
    ; prep_8tap 16bpc, vertical-only path: full 8-tap my filter
    ; (four tap pairs m8-m11). h <= 4 uses the secondary filter index.
2963    movzx               mxd, myb
2964    shr                 myd, 16
2965    cmp                  hd, 4
2966    cmove               myd, mxd
2967    vpbroadcastq         m0, [base+subpel_filters+myq*8]
2968    WIN64_SPILL_XMM      12, 15
2969    vpbroadcastd         m7, [prep_8tap_1d_rnd]
    ; r6 = 3*stride; srcq backs up 3 rows for the filter's upper reach
2970    lea                  r6, [strideq*3]
2971    punpcklbw            m0, m0
2972    sub                srcq, r6
2973    psraw                m0, 8 ; sign-extend
    ; 10bpc: coefficients pre-scaled by 4 (matches the psrad-by-4 output)
2974    test          dword r7m, 0x800
2975    jnz .v_12bpc
2976    psllw                m0, 2
2977.v_12bpc:
2978    pshufd               m8, m0, q0000
2979    pshufd               m9, m0, q1111
2980    pshufd              m10, m0, q2222
2981    pshufd              m11, m0, q3333
2982    cmp                  wd, 4
2983    jg .v_w8
2984.v_w4:
    ; prime the sliding row pairs 01|12, 23|34, 45|56 (two output rows
    ; per iteration, one per 128-bit lane)
2985    movq                xm1, [srcq+strideq*0]
2986    vpbroadcastq         m0, [srcq+strideq*1]
2987    vpbroadcastq         m2, [srcq+strideq*2]
2988    vpbroadcastq         m4, [srcq+r6       ]
2989    lea                srcq, [srcq+strideq*4]
2990    vpbroadcastq         m3, [srcq+strideq*0]
2991    vpbroadcastq         m5, [srcq+strideq*1]
2992    vpblendd             m1, m0, 0x30
2993    vpblendd             m0, m2, 0x30
2994    punpcklwd            m1, m0      ; 01 12
2995    vpbroadcastq         m0, [srcq+strideq*2]
2996    add                srcq, r6
2997    vpblendd             m2, m4, 0x30
2998    vpblendd             m4, m3, 0x30
2999    punpcklwd            m2, m4      ; 23 34
3000    vpblendd             m3, m5, 0x30
3001    vpblendd             m5, m0, 0x30
3002    punpcklwd            m3, m5      ; 45 56
3003.v_w4_loop:
3004    vpbroadcastq         m4, [srcq+strideq*0]
3005    pmaddwd              m5, m8, m1  ; a0 b0
3006    mova                 m1, m2
3007    pmaddwd              m2, m9      ; a1 b1
3008    paddd                m5, m7
3009    paddd                m5, m2
3010    mova                 m2, m3
3011    pmaddwd              m3, m10     ; a2 b2
3012    paddd                m5, m3
3013    vpblendd             m3, m0, m4, 0x30
3014    vpbroadcastq         m0, [srcq+strideq*1]
3015    lea                srcq, [srcq+strideq*2]
3016    vpblendd             m4, m0, 0x30
3017    punpcklwd            m3, m4      ; 67 78
3018    pmaddwd              m4, m11, m3 ; a3 b3
3019    paddd                m5, m4
3020    psrad                m5, 4
3021    vextracti128        xm4, m5, 1
3022    packssdw            xm5, xm4
3023    mova             [tmpq], xm5
3024    add                tmpq, 16
3025    sub                  hd, 2
3026    jg .v_w4_loop
3027    RET
3028.v_w8:
%if WIN64
3030    WIN64_PUSH_XMM       15
3031    push                 r8
%endif
    ; column-strip bookkeeping (as in the 6-tap version): r8d = w (row
    ; pitch in words, stored via r8*2 bytes), wd = 32*w + h - 256;
    ; "movzx hd, wb" restores h, "sub wd, 1<<8" counts the w/8 strips
3033    mov                 r8d, wd
3034    shl                  wd, 5
3035    lea                  wd, [hq+wq-256]
3036.v_w8_loop0:
    ; prime rows 0..6 of the strip as interleaved word pairs
    ; 01/34 (m1/m4), 12/45 (m2/m5), 23/56 (m3/m6)
3037    vbroadcasti128       m4, [srcq+strideq*0]
3038    vbroadcasti128       m5, [srcq+strideq*1]
3039    lea                  r5, [srcq+strideq*4]
3040    vbroadcasti128       m0, [srcq+r6       ]
3041    vbroadcasti128       m6, [srcq+strideq*2]
3042    mov                  r7, tmpq
3043    vbroadcasti128       m1, [r5+strideq*0]
3044    vbroadcasti128       m2, [r5+strideq*1]
3045    vbroadcasti128       m3, [r5+strideq*2]
3046    add                  r5, r6
3047    shufpd               m4, m0, 0x0c
3048    shufpd               m5, m1, 0x0c
3049    punpcklwd            m1, m4, m5 ; 01
3050    punpckhwd            m4, m5     ; 34
3051    shufpd               m6, m2, 0x0c
3052    punpcklwd            m2, m5, m6 ; 12
3053    punpckhwd            m5, m6     ; 45
3054    shufpd               m0, m3, 0x0c
3055    punpcklwd            m3, m6, m0 ; 23
3056    punpckhwd            m6, m0     ; 56
3057.v_w8_loop:
    ; 8-tap vertical: a = even output row, b = odd output row
3058    vbroadcasti128      m14, [r5+strideq*0]
3059    pmaddwd             m12, m8, m1  ; a0
3060    pmaddwd             m13, m8, m2  ; b0
3061    mova                 m1, m3
3062    mova                 m2, m4
3063    pmaddwd              m3, m9      ; a1
3064    pmaddwd              m4, m9      ; b1
3065    paddd               m12, m7
3066    paddd               m13, m7
3067    paddd               m12, m3
3068    paddd               m13, m4
3069    mova                 m3, m5
3070    mova                 m4, m6
3071    pmaddwd              m5, m10     ; a2
3072    pmaddwd              m6, m10     ; b2
3073    paddd               m12, m5
3074    vbroadcasti128       m5, [r5+strideq*1]
3075    lea                  r5, [r5+strideq*2]
3076    paddd               m13, m6
3077    shufpd               m6, m0, m14, 0x0d
3078    shufpd               m0, m14, m5, 0x0c
3079    punpcklwd            m5, m6, m0  ; 67
3080    punpckhwd            m6, m0      ; 78
3081    pmaddwd             m14, m11, m5 ; a3
3082    paddd               m12, m14
3083    pmaddwd             m14, m11, m6 ; b3
3084    paddd               m13, m14
3085    psrad               m12, 4
3086    psrad               m13, 4
3087    packssdw            m12, m13
    ; un-interleave the two rows and store them r8*2 bytes apart
3088    vpermq              m12, m12, q3120
3089    mova          [r7+r8*0], xm12
3090    vextracti128  [r7+r8*2], m12, 1
3091    lea                  r7, [r7+r8*4]
3092    sub                  hd, 2
3093    jg .v_w8_loop
3094    add                srcq, 16
3095    add                tmpq, 16
3096    movzx                hd, wb
3097    sub                  wd, 1<<8
3098    jg .v_w8_loop0
%if WIN64
3100    pop                  r8
%endif
3102    RET
3103.h:
    ; prep_8tap 16bpc, horizontal-only path. w==4 is shared with the
    ; 6-tap function (a 4-tap filter suffices there).
3104    test                myd, 0xf00
3105    jnz .hv
3106    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
3107    cmp                  wd, 4
3108    je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4
    ; back srcq up 3 pixels (6 bytes at 16bpc) for the 8-tap left reach
3109    shr                 mxd, 16
3110    sub                srcq, 6
3111    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
3112    WIN64_SPILL_XMM      12
3113    vbroadcasti128       m6, [subpel_h_shufA]
3114    vbroadcasti128       m7, [subpel_h_shufB]
    ; sign-extend byte coefficients; 10bpc pre-scales them by 4
3115    punpcklbw            m0, m0
3116    psraw                m0, 8 ; sign-extend
3117    test          dword r7m, 0x800
3118    jnz .h_12bpc
3119    psllw                m0, 2
3120.h_12bpc:
    ; m8-m11 = the four tap pairs of the 8-tap horizontal filter
3121    pshufd               m8, m0, q0000
3122    pshufd               m9, m0, q1111
3123    pshufd              m10, m0, q2222
3124    pshufd              m11, m0, q3333
3125    cmp                  wd, 8
3126    jg .h_w16
3127.h_w8:
; 8-tap horizontal filter of 8 output pixels (abcd = 0-3, efgh = 4-7)
; from three overlapping source fetches; rounds with m5, shifts right by
; 4 and packs to signed words.
3128%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
3129    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
3130    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
3131    pmaddwd             m%5, m9, m%4   ; abcd1
3132    pmaddwd             m%1, m8        ; abcd0
3133    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
3134    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
3135    paddd               m%5, m5
3136    paddd               m%1, m%5
3137    pmaddwd             m%5, m11, m%2  ; abcd3
3138    paddd               m%1, m%5
3139    pmaddwd             m%5, m10, m%4  ; abcd2
3140    pshufb              m%3, m7        ; a b b c c d d e
3141    pmaddwd             m%4, m8        ; efgh0
3142    paddd               m%1, m%5
3143    pmaddwd             m%5, m9, m%2   ; efgh1
3144    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
3145    pmaddwd             m%3, m11       ; efgh3
3146    pmaddwd             m%2, m10       ; efgh2
3147    paddd               m%4, m5
3148    paddd               m%4, m%5
3149    paddd               m%3, m%4
3150    paddd               m%2, m%3
3151    psrad               m%1, 4
3152    psrad               m%2, 4
3153    packssdw            m%1, m%2
%endmacro
    ; two rows per iteration, one per 128-bit lane
3155    movu                xm0, [srcq+strideq*0+ 0]
3156    vinserti128          m0, [srcq+strideq*1+ 0], 1
3157    movu                xm2, [srcq+strideq*0+16]
3158    vinserti128          m2, [srcq+strideq*1+16], 1
3159    lea                srcq, [srcq+strideq*2]
3160    shufpd               m1, m0, m2, 0x05
3161    PREP_8TAP_H           0, 1, 2, 3, 4
3162    mova             [tmpq], m0
3163    add                tmpq, 32
3164    sub                  hd, 2
3165    jg .h_w8
3166    RET
3167.h_w16:
    ; w > 8: wd becomes a byte count (w*2); r6 walks each row from the
    ; right edge in 32-byte (16-pixel) chunks, one row per outer pass
3168    add                  wd, wd
3169.h_w16_loop0:
3170    mov                 r6d, wd
3171.h_w16_loop:
3172    movu                 m0, [srcq+r6-32]
3173    movu                 m1, [srcq+r6-24]
3174    movu                 m2, [srcq+r6-16]
3175    PREP_8TAP_H           0, 1, 2, 3, 4
3176    mova       [tmpq+r6-32], m0
3177    sub                 r6d, 32
3178    jg .h_w16_loop
3179    add                srcq, strideq
3180    add                tmpq, wq
3181    dec                  hd
3182    jg .h_w16_loop0
3183    RET
3184.hv:
    ; prep_8tap 16bpc, horizontal+vertical path. m15 = 2-D rounding
    ; constant; intermediate and final stages shift right by 6.
3185    WIN64_SPILL_XMM      16
3186    vpbroadcastd        m15, [prep_8tap_2d_rnd]
3187    cmp                  wd, 4
3188    jg .hv_w8
    ; w==4: 4-tap horizontal (table row offset +2), 8-tap vertical
3189    movzx               mxd, mxb
3190    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
3191    movzx               mxd, myb
3192    shr                 myd, 16
3193    cmp                  hd, 4
3194    cmove               myd, mxd
3195    vpbroadcastq         m1, [base+subpel_filters+myq*8]
    ; back up 1 pixel horizontally and 3 rows vertically
3196    lea                  r6, [strideq*3]
3197    sub                srcq, 2
3198    pxor                 m7, m7
3199    sub                srcq, r6
    ; horizontal coefs to words via the high byte (<<8) then >>4 (x16);
    ; 12bpc shifts down 2 more (10bpc keeps the extra x4 scale)
3200    punpcklbw            m7, m0
3201    punpcklbw            m1, m1
3202    psraw                m7, 4
3203    psraw                m1, 8
3204    test          dword r7m, 0x800
3205    jz .hv_w4_10bit
3206    psraw                m7, 2
3207.hv_w4_10bit:
    ; m11-m14 = the four tap pairs of the 8-tap vertical filter
3208    pshufd              m11, m1, q0000
3209    pshufd              m12, m1, q1111
3210    pshufd              m13, m1, q2222
3211    pshufd              m14, m1, q3333
3212.hv_w4:
    ; horizontally filter rows 0-6 (rows paired per lane: 0 1 / 2 3 /
    ; 3 4 / 5 6), then interleave into vertical pairs 01|12, 23|34, 45|56
3213    vbroadcasti128       m9, [subpel_h_shufA]
3214    vbroadcasti128      m10, [subpel_h_shufB]
3215    pshufd               m8, m7, q1111
3216    pshufd               m7, m7, q0000
3217    movu                xm1, [srcq+strideq*0]
3218    vinserti128          m1, [srcq+strideq*1], 1     ; 0 1
3219    vbroadcasti128       m0, [srcq+r6       ]
3220    vinserti128          m2, m0, [srcq+strideq*2], 0 ; 2 3
3221    lea                srcq, [srcq+strideq*4]
3222    vinserti128          m0, [srcq+strideq*0], 1     ; 3 4
3223    movu                xm3, [srcq+strideq*1]
3224    vinserti128          m3, [srcq+strideq*2], 1     ; 5 6
3225    add                srcq, r6
3226    pshufb               m4, m1, m9
3227    pshufb               m1, m10
3228    pmaddwd              m4, m7
3229    pmaddwd              m1, m8
3230    pshufb               m5, m2, m9
3231    pshufb               m2, m10
3232    pmaddwd              m5, m7
3233    pmaddwd              m2, m8
3234    paddd                m4, m15
3235    paddd                m1, m4
3236    pshufb               m4, m0, m9
3237    pshufb               m0, m10
3238    pmaddwd              m4, m7
3239    pmaddwd              m0, m8
3240    paddd                m5, m15
3241    paddd                m2, m5
3242    pshufb               m5, m3, m9
3243    pshufb               m3, m10
3244    pmaddwd              m5, m7
3245    pmaddwd              m3, m8
3246    paddd                m4, m15
3247    paddd                m4, m0
3248    paddd                m5, m15
3249    paddd                m5, m3
    ; merge even/odd words of adjacent rows (psrld/pslld + pblendw) to
    ; form the interleaved row pairs without extra packs
3250    vperm2i128           m0, m1, m2, 0x21
3251    psrld                m1, 6
3252    psrld                m2, 6
3253    vperm2i128           m3, m4, m5, 0x21
3254    pslld                m4, 10
3255    pslld                m5, 10
3256    pblendw              m2, m4, 0xaa ; 23 34
3257    pslld                m0, 10
3258    pblendw              m1, m0, 0xaa ; 01 12
3259    psrld                m3, 6
3260    pblendw              m3, m5, 0xaa ; 45 56
3261    psrad                m0, m5, 16
3262.hv_w4_loop:
    ; two new rows per iteration: horizontal filter them, build the 67|78
    ; pair from the previous tail (m0), apply vertical taps 0-3
3263    movu                xm4, [srcq+strideq*0]
3264    vinserti128          m4, [srcq+strideq*1], 1
3265    lea                srcq, [srcq+strideq*2]
3266    pmaddwd              m5, m11, m1   ; a0 b0
3267    mova                 m1, m2
3268    pmaddwd              m2, m12       ; a1 b1
3269    paddd                m5, m15
3270    paddd                m5, m2
3271    mova                 m2, m3
3272    pmaddwd              m3, m13       ; a2 b2
3273    paddd                m5, m3
3274    pshufb               m3, m4, m9
3275    pshufb               m4, m10
3276    pmaddwd              m3, m7
3277    pmaddwd              m4, m8
3278    paddd                m3, m15
3279    paddd                m4, m3
3280    psrad                m4, 6
3281    packssdw             m0, m4        ; _ 7 6 8
3282    vpermq               m3, m0, q1122 ; _ 6 _ 7
3283    punpckhwd            m3, m0        ; 67 78
3284    mova                 m0, m4
3285    pmaddwd              m4, m14, m3   ; a3 b3
3286    paddd                m4, m5
3287    psrad                m4, 6
3288    vextracti128        xm5, m4, 1
3289    packssdw            xm4, xm5
3290    mova             [tmpq], xm4
3291    add                tmpq, 16
3292    sub                  hd, 2
3293    jg .hv_w4_loop
3294    RET
3295.hv_w8:
    ; w >= 8: 8-tap in both directions, processed in 8-pixel column
    ; strips (the loop body continues beyond this excerpt).
3296    shr                 mxd, 16
3297    vpbroadcastq         m2, [base+subpel_filters+mxq*8]
3298    movzx               mxd, myb
3299    shr                 myd, 16
3300    cmp                  hd, 4
3301    cmove               myd, mxd
3302    pmovsxbw            xm1, [base+subpel_filters+myq*8]
%if WIN64
3304    PUSH                 r8
%endif
    ; strip-count packing (see .v_w8): r8d = w, wd = 32*w + h - 256;
    ; back srcq up 3 pixels horizontally and 3 rows vertically
3306    mov                 r8d, wd
3307    shl                  wd, 5
3308    lea                  r6, [strideq*3]
3309    sub                srcq, 6
3310    sub                srcq, r6
3311    lea                  wd, [hq+wq-256]
    ; horizontal coefs: same x16 (10bpc) / x4 (12bpc) word scaling as w4
3312    pxor                 m0, m0
3313    punpcklbw            m0, m2
3314    psraw                m0, 4
3315    test          dword r7m, 0x800
3316    jz .hv_w8_10bit
3317    psraw                m0, 2
3318.hv_w8_10bit:
3319    pshufd              m11, m0, q0000
3320    pshufd              m12, m0, q1111
    ; v_mul: stack slot (declared in the cglobal prologue earlier in the
    ; file -- presumably) holding the vertical filter words for the loop
3321    mova            [v_mul], xm1
3322    pshufd              m13, m0, q2222
3323    pshufd              m14, m0, q3333
3324.hv_w8_loop0:
; 8-tap horizontal filter for the 2-D path: PREP_8TAP_H structure with
; the hv coefficient registers (m11-m14), the 2-D rounding constant
; (m15), fixed shuffle/temp registers (m8/m9, m2/m3) and a >> 6 shift.
3325%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
3326    pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
3327    pshufb              m%1, m8        ; 0 1 1 2 2 3 3 4
3328    pmaddwd              m3, m12, m2
3329    pmaddwd             m%1, m11
3330    pshufb              m%2, m9        ; 6 7 7 8 8 9 9 a
3331    shufpd               m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
3332    paddd                m3, m15
3333    paddd               m%1, m3
3334    pmaddwd              m3, m14, m%2
3335    paddd               m%1, m3
3336    pmaddwd              m3, m13, m2
3337    pshufb              m%3, m9        ; a b b c c d d e
3338    pmaddwd              m2, m11
3339    paddd               m%1, m3
3340    pmaddwd              m3, m12, m%2
3341    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
3342    pmaddwd             m%3, m14
3343    pmaddwd             m%2, m13
3344    paddd                m2, m15
3345    paddd                m2, m3
3346    paddd                m2, m%3
3347    paddd                m2, m%2
3348    psrad               m%1, 6
3349    psrad                m2, 6
3350    packssdw            m%1, m2
%endmacro
    ; horizontally filter rows 0-6 of the strip; row 3 is done as a
    ; 128-bit (xmm) pass via the INIT_XMM/INIT_YMM toggle
3352    movu                xm4, [srcq+r6       + 0]
3353    vbroadcasti128       m8, [subpel_h_shufA]
3354    lea                  r5, [srcq+strideq*4]
3355    movu                xm6, [srcq+r6       + 8]
3356    vbroadcasti128       m9, [subpel_h_shufB]
3357    mov                  r7, tmpq
3358    movu                xm0, [srcq+r6       +16]
3359    movu                xm5, [srcq+strideq*0+ 0]
3360    vinserti128          m5, [r5  +strideq*0+ 0], 1
3361    movu                xm1, [srcq+strideq*0+16]
3362    vinserti128          m1, [r5  +strideq*0+16], 1
3363    shufpd               m7, m5, m1, 0x05
3364    INIT_XMM avx2
3365    PREP_8TAP_HV_H        4, 6, 0    ; 3
3366    INIT_YMM avx2
3367    PREP_8TAP_HV_H        5, 7, 1    ; 0 4
3368    movu                xm0, [srcq+strideq*2+ 0]
3369    vinserti128          m0, [srcq+r6     *2+ 0], 1
3370    movu                xm1, [srcq+strideq*2+16]
3371    vinserti128          m1, [srcq+r6     *2+16], 1
3372    shufpd               m7, m0, m1, 0x05
3373    PREP_8TAP_HV_H        0, 7, 1    ; 2 6
3374    movu                xm6, [srcq+strideq*1+ 0]
3375    movu                xm1, [srcq+strideq*1+16]
3376    vinserti128          m6, [r5  +strideq*1+ 0], 1
3377    vinserti128          m1, [r5  +strideq*1+16], 1
3378    add                  r5, r6
3379    shufpd               m7, m6, m1, 0x05
3380    PREP_8TAP_HV_H        6, 7, 1    ; 1 5
    ; de-interleave rows and build the vertical word pairs 01..56
3381    vpermq               m4, m4, q1100
3382    vpermq               m5, m5, q3120
3383    vpermq               m6, m6, q3120
3384    vpermq               m7, m0, q3120
3385    punpcklwd            m3, m7, m4  ; 23
3386    punpckhwd            m4, m5      ; 34
3387    punpcklwd            m1, m5, m6  ; 01
3388    punpckhwd            m5, m6      ; 45
3389    punpcklwd            m2, m6, m7  ; 12
3390    punpckhwd            m6, m7      ; 56
3391.hv_w8_loop:
    ; vertical 8-tap: taps 0-2 from v_mul here; the remainder of this
    ; loop (taps 3, pack and store) follows beyond this excerpt
3392    vpbroadcastd         m9, [v_mul+4*0]
3393    vpbroadcastd         m7, [v_mul+4*1]
3394    vpbroadcastd        m10, [v_mul+4*2]
3395    pmaddwd              m8, m9, m1  ; a0
3396    pmaddwd              m9, m2      ; b0
3397    mova                 m1, m3
3398    mova                 m2, m4
3399    pmaddwd              m3, m7      ; a1
3400    pmaddwd              m4, m7      ; b1
3401    paddd                m8, m15
3402    paddd                m9, m15
3403    paddd                m8, m3
3404    paddd                m9, m4
3405    mova                 m3, m5
3406    mova                 m4, m6
3407    pmaddwd              m5, m10     ; a2
3408    pmaddwd              m6, m10     ; b2
3409    paddd                m8, m5
3410    paddd                m9, m6
3411    movu                xm5, [r5+strideq*0]
3412    vinserti128          m5, [r5+strideq*1], 1
3413    vbroadcasti128       m7, [subpel_h_shufA]
3414    vbroadcasti128      m10, [subpel_h_shufB]
3415    movu                xm6, [r5+strideq*0+16]
3416    vinserti128          m6, [r5+strideq*1+16], 1
3417    vextracti128       [r7], m0, 1
3418    pshufb               m0, m5, m7  ; 01
3419    pshufb               m5, m10     ; 23
3420    pmaddwd              m0, m11
3421    pmaddwd              m5, m12
3422    paddd                m0, m15
3423    paddd                m0, m5
3424    pshufb               m5, m6, m7  ; 89
3425    pshufb               m6, m10     ; ab
3426    pmaddwd              m5, m13
3427    pmaddwd              m6, m14
3428    paddd                m5, m15
3429    paddd                m6, m5
3430    movu                xm5, [r5+strideq*0+8]
3431    vinserti128          m5, [r5+strideq*1+8], 1
3432    lea                  r5, [r5+strideq*2]
3433    pshufb               m7, m5, m7
3434    pshufb               m5, m10
3435    pmaddwd             m10, m13, m7
3436    pmaddwd              m7, m11
3437    paddd                m0, m10
3438    paddd                m6, m7
3439    pmaddwd              m7, m14, m5
3440    pmaddwd              m5, m12
3441    paddd                m0, m7
3442    paddd                m5, m6
3443    vbroadcasti128       m6, [r7]
3444    vpbroadcastd        m10, [v_mul+4*3]
3445    psrad                m0, 6
3446    psrad                m5, 6
3447    packssdw             m0, m5
3448    vpermq               m7, m0, q3120 ; 7 8
3449    shufpd               m6, m7, 0x04  ; 6 7
3450    punpcklwd            m5, m6, m7    ; 67
3451    punpckhwd            m6, m7        ; 78
3452    pmaddwd              m7, m10, m5   ; a3
3453    pmaddwd             m10, m6        ; b3
3454    paddd                m7, m8
3455    paddd                m9, m10
3456    psrad                m7, 6
3457    psrad                m9, 6
3458    packssdw             m7, m9
3459    vpermq               m7, m7, q3120
3460    mova          [r7+r8*0], xm7
3461    vextracti128  [r7+r8*2], m7, 1
3462    lea                  r7, [r7+r8*4]
3463    sub                  hd, 2
3464    jg .hv_w8_loop
3465    add                srcq, 16
3466    add                tmpq, 16
3467    movzx                hd, wb
3468    sub                  wd, 1<<8
3469    jg .hv_w8_loop0
3470%if WIN64
3471    POP                  r8
3472%endif
3473    RET
3474
; Emit `mov %1, %2` only when assembling the prep variant (isprep != 0);
; expands to nothing for put, so code shared between the two variants can
; update prep-only state (e.g. tmp_stridem) without costing put anything.
%macro movifprep 2 ; dst, src
 %if isprep
    mov %1, %2
 %endif
%endmacro
3480
; Redefine the register alias r%1 (and its q/d width variants) to refer to
; whatever r%2 currently names. %xdefine expands its value immediately, so
; a chain of these shifts the alias set one step at a time.
%macro REMAP_REG 2 ; new_idx, old_idx
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
%endmacro
3486
; For prep, shift every register alias down by one (r14 -> r13, ...,
; r1 -> r0). prep_8tap_scaled takes one pointer argument fewer than
; put_8tap_scaled (tmp,src,ss vs dst,ds,src,ss), so this lets code written
; against put's register numbering be shared by prep. The real r14 alias is
; saved in r14_save so MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT can restore it.
; No-op when assembling put.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %xdefine r14_save r14
  %assign %%i 14
  %rep 14
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %endif
%endmacro
3498
; Undo MCT_8TAP_SCALED_REMAP_REGS_TO_PREV: walk the aliases back up
; (r1..r13 each take over the next-higher alias, in ascending order so each
; step sees the not-yet-restored value) and restore r14 from r14_save.
; No-op when assembling put.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %rep 13
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  %xdefine r14 r14_save
  %undef r14_save
 %endif
%endmacro
3511
; Return from a scaled-MC code path. Register aliases must be back at the
; default mapping before RET so the epilogue references the correct
; registers. With %1 nonzero (the default), the prep mapping is re-applied
; after RET, leaving the mapping unchanged for the code that is assembled
; following this return point.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro
3519
; Gather and horizontally filter two consecutive source rows for the
; scaled-MC paths. Each output column has its own source x offset
; (r4/r6/r7/r9 feed the low 128-bit lanes, r10/r11/r13/rX the high lanes;
; offsets are in pixels, *2 for 2 bytes/pixel at 16bpc) and its own 8-tap
; filter held pairwise in m12-m15.
; In:  m10 = horizontal rounding constant, xm11 = horizontal shift,
;      srcq/ssq = source pointer/stride.
; Out: m%1 = both rows filtered, rounded, shifted and packed to int16;
;      srcq advanced by two rows. %2-%8 are scratch.
; %9 != 0: reload m10 from [rsp+0x00] mid-sequence (caller clobbered it).
%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
    movu               xm%1, [srcq+ r4*2]    ; row 0: 8 taps at column offset r4
    movu               xm%2, [srcq+ r6*2]
    movu               xm%3, [srcq+ r7*2]
    movu               xm%4, [srcq+ r9*2]
    vinserti128         m%1, [srcq+r10*2], 1 ; high lanes: offsets r10/r11/r13/rX
    vinserti128         m%2, [srcq+r11*2], 1
    vinserti128         m%3, [srcq+r13*2], 1
    vinserti128         m%4, [srcq+ rX*2], 1
    add                srcq, ssq
    movu               xm%5, [srcq+ r4*2]    ; row 1: same column offsets
    movu               xm%6, [srcq+ r6*2]
    movu               xm%7, [srcq+ r7*2]
    movu               xm%8, [srcq+ r9*2]
    vinserti128         m%5, [srcq+r10*2], 1
    vinserti128         m%6, [srcq+r11*2], 1
    vinserti128         m%7, [srcq+r13*2], 1
    vinserti128         m%8, [srcq+ rX*2], 1
    add                srcq, ssq
    pmaddwd             m%1, m12              ; per-column 8-tap multiply-accumulate
    pmaddwd             m%2, m13
    pmaddwd             m%3, m14
    pmaddwd             m%4, m15
    pmaddwd             m%5, m12
    pmaddwd             m%6, m13
    pmaddwd             m%7, m14
    pmaddwd             m%8, m15
    phaddd              m%1, m%2              ; horizontally reduce tap pairs
 %if %9
    mova                m10, [rsp+0x00]       ; restore h rounding constant
 %endif
    phaddd              m%3, m%4
    phaddd              m%5, m%6
    phaddd              m%7, m%8
    phaddd              m%1, m%3              ; row 0: one dword per column
    phaddd              m%5, m%7              ; row 1: one dword per column
    paddd               m%1, m10              ; + rounding
    paddd               m%5, m10
    psrad               m%1, xm11             ; >> horizontal shift
    psrad               m%5, xm11
    packssdw            m%1, m%5              ; rows 0/1 packed to int16
%endmacro
3562
3563%macro MC_8TAP_SCALED 1
3564%ifidn %1, put
3565 %assign isput  1
3566 %assign isprep 0
3567cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
3568 %xdefine base_reg r12
3569    mov                 r7d, pxmaxm
3570%else
3571 %assign isput  0
3572 %assign isprep 1
3573cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
3574  %define tmp_stridem qword [rsp+0xd0]
3575 %xdefine base_reg r11
3576%endif
3577    lea            base_reg, [%1_8tap_scaled_16bpc_avx2]
3578%define base base_reg-%1_8tap_scaled_16bpc_avx2
3579    tzcnt                wd, wm
3580    vpbroadcastd         m8, dxm
3581%if isprep && UNIX64
3582    movd               xm10, mxd
3583    vpbroadcastd        m10, xm10
3584    mov                 r5d, t0d
3585 DECLARE_REG_TMP 5, 7
3586    mov                 r6d, pxmaxm
3587%else
3588    vpbroadcastd        m10, mxm
3589 %if isput
3590    vpbroadcastw        m11, pxmaxm
3591 %else
3592    mov                 r6d, pxmaxm
3593 %endif
3594%endif
3595    mov                 dyd, dym
3596%if isput
3597 %if WIN64
3598    mov                 r8d, hm
3599  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
3600  %define hm r5m
3601  %define dxm r8m
3602 %else
3603  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
3604  %define hm r6m
3605 %endif
3606 %define dsm [rsp+0x98]
3607 %define rX r1
3608 %define rXd r1d
3609%else ; prep
3610 %if WIN64
3611    mov                 r7d, hm
3612  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
3613  %define hm r4m
3614  %define dxm r7m
3615 %else
3616  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
3617  %define hm [rsp+0x98]
3618 %endif
3619 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3620 %define rX r14
3621 %define rXd r14d
3622%endif
3623    shr                 r7d, 11
3624    vpbroadcastd         m6, [base+pd_0x3ff]
3625    vpbroadcastd        m12, [base+s_8tap_h_rnd+r7*4]
3626    movd                xm7, [base+s_8tap_h_sh+r7*4]
3627%if isput
3628    vpbroadcastd        m13, [base+put_s_8tap_v_rnd+r7*4]
3629    pinsrd              xm7, [base+put_s_8tap_v_sh+r7*4], 2
3630%else
3631    vpbroadcastd        m13, [base+pd_m524256]
3632%endif
3633    pxor                 m9, m9
3634    lea                ss3q, [ssq*3]
3635    movzx               r7d, t1b
3636    shr                 t1d, 16
3637    cmp                  hd, 6
3638    cmovs               t1d, r7d
3639    sub                srcq, ss3q
3640    cmp                 dyd, 1024
3641    je .dy1
3642    cmp                 dyd, 2048
3643    je .dy2
3644    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
3645    add                  wq, base_reg
3646    jmp                  wq
3647%if isput
3648.w2:
3649    mov                 myd, mym
3650    movzx               t0d, t0b
3651    sub                srcq, 2
3652    movd               xm15, t0d
3653    punpckldq            m8, m9, m8
3654    paddd               m10, m8 ; mx+dx*[0,1]
3655    vpbroadcastd       xm14, [base+pq_0x40000000+2]
3656    vpbroadcastd       xm15, xm15
3657    pand                xm8, xm10, xm6
3658    psrld               xm8, 6
3659    paddd              xm15, xm8
3660    movd                r4d, xm15
3661    pextrd              r6d, xm15, 1
3662    vbroadcasti128       m5, [base+bdct_lb_q]
3663    vbroadcasti128       m6, [base+subpel_s_shuf2]
3664    vpbroadcastd       xm15, [base+subpel_filters+r4*8+2]
3665    vpbroadcastd        xm4, [base+subpel_filters+r6*8+2]
3666    pcmpeqd             xm8, xm9
3667    psrld               m10, 10
3668    paddd               m10, m10
3669    movu                xm0, [srcq+ssq*0]
3670    movu                xm1, [srcq+ssq*1]
3671    movu                xm2, [srcq+ssq*2]
3672    movu                xm3, [srcq+ss3q ]
3673    lea                srcq, [srcq+ssq*4]
3674    pshufb              m10, m5
3675    paddb               m10, m6
3676    vpblendd           xm15, xm4, 0xa
3677    pblendvb           xm15, xm14, xm8
3678    pmovsxbw            m15, xm15
3679    vinserti128          m0, [srcq+ssq*0], 1 ; 0 4
3680    vinserti128          m1, [srcq+ssq*1], 1 ; 1 5
3681    vinserti128          m2, [srcq+ssq*2], 1 ; 2 6
3682    vinserti128          m3, [srcq+ss3q ], 1 ; 3 7
3683    lea                srcq, [srcq+ssq*4]
3684    REPX    {pshufb x, m10}, m0, m1, m2, m3
3685    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
3686    phaddd               m0, m1
3687    phaddd               m2, m3
3688    paddd                m0, m12
3689    paddd                m2, m12
3690    psrad                m0, xm7
3691    psrad                m2, xm7
3692    packssdw             m0, m2             ; 0 1 2 3  4 5 6 7
3693    vextracti128        xm1, m0, 1
3694    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
3695    punpcklwd           xm3, xm0, xm2       ; 01 12
3696    punpckhwd           xm0, xm2            ; 23 34
3697    pshufd              xm4, xm1, q0321     ; 5 6 7 _
3698    punpcklwd           xm2, xm1, xm4       ; 45 56
3699    punpckhwd           xm4, xm1, xm4       ; 67 __
3700.w2_loop:
3701    and                 myd, 0x3ff
3702    mov                 r6d, 64 << 24
3703    mov                 r4d, myd
3704    shr                 r4d, 6
3705    lea                 r4d, [t1+r4]
3706    cmovnz              r6q, [base+subpel_filters+r4*8]
3707    movq               xm14, r6q
3708    pmovsxbw           xm14, xm14
3709    pshufd              xm8, xm14, q0000
3710    pshufd              xm9, xm14, q1111
3711    pmaddwd             xm5, xm3, xm8
3712    pmaddwd             xm6, xm0, xm9
3713    pshufd              xm8, xm14, q2222
3714    pshufd             xm14, xm14, q3333
3715    paddd               xm5, xm6
3716    pmaddwd             xm6, xm2, xm8
3717    pmaddwd             xm8, xm4, xm14
3718    psrldq              xm9, xm7, 8
3719    paddd               xm5, xm6
3720    paddd               xm5, xm13
3721    paddd               xm5, xm8
3722    psrad               xm5, xm9
3723    packusdw            xm5, xm5
3724    pminsw              xm5, xm11
3725    movd             [dstq], xm5
3726    add                dstq, dsq
3727    dec                  hd
3728    jz .ret
3729    add                 myd, dyd
3730    test                myd, ~0x3ff
3731    jz .w2_loop
3732    movu                xm5, [srcq]
3733    test                myd, 0x400
3734    jz .w2_skip_line
3735    add                srcq, ssq
3736    shufps              xm3, xm0, q1032     ; 01 12
3737    shufps              xm0, xm2, q1032     ; 23 34
3738    shufps              xm2, xm4, q1032     ; 45 56
3739    pshufb              xm5, xm10
3740    pmaddwd             xm5, xm15
3741    phaddd              xm5, xm5
3742    paddd               xm5, xm12
3743    psrad               xm5, xm7
3744    packssdw            xm5, xm5
3745    palignr             xm1, xm5, xm1, 12
3746    punpcklqdq          xm1, xm1            ; 6 7 6 7
3747    punpcklwd           xm4, xm1, xm5       ; 67 __
3748    jmp .w2_loop
3749.w2_skip_line:
3750    movu                xm6, [srcq+ssq*1]
3751    lea                srcq, [srcq+ssq*2]
3752    mova                xm3, xm0            ; 01 12
3753    mova                xm0, xm2            ; 23 34
3754    pshufb              xm5, xm10
3755    pshufb              xm6, xm10
3756    pmaddwd             xm5, xm15
3757    pmaddwd             xm6, xm15
3758    phaddd              xm5, xm6
3759    paddd               xm5, xm12
3760    psrad               xm5, xm7
3761    packssdw            xm5, xm5            ; 6 7 6 7
3762    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
3763    pshufd              xm5, xm1, q0321     ; 5 6 7 _
3764    punpcklwd           xm2, xm1, xm5       ; 45 56
3765    punpckhwd           xm4, xm1, xm5       ; 67 __
3766    jmp .w2_loop
3767%endif
3768.w4:
3769    mov                 myd, mym
3770    mova         [rsp+0x00], m12
3771%if isput
3772    mova         [rsp+0x20], xm13
3773%else
3774    SWAP                m11, m13
3775%endif
3776    mova         [rsp+0x30], xm7
3777    vbroadcasti128       m7, [base+rescale_mul]
3778    movzx               t0d, t0b
3779    sub                srcq, 2
3780    movd               xm15, t0d
3781    pmaddwd              m8, m7
3782    vpbroadcastq         m2, [base+pq_0x40000000+1]
3783    vpbroadcastd       xm15, xm15
3784    SWAP                m13, m10
3785    paddd               m13, m8 ; mx+dx*[0-3]
3786    pand                 m6, m13
3787    psrld                m6, 6
3788    paddd              xm15, xm6
3789    movd                r4d, xm15
3790    pextrd              r6d, xm15, 1
3791    pextrd             r11d, xm15, 2
3792    pextrd             r13d, xm15, 3
3793    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
3794    vbroadcasti128       m1, [base+bdct_lb_q+16]
3795    vbroadcasti128       m0, [base+subpel_s_shuf2]
3796    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
3797    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
3798    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
3799    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
3800    pcmpeqd              m6, m9
3801    punpckldq           m10, m6, m6
3802    punpckhdq            m6, m6
3803    psrld               m13, 10
3804    paddd               m13, m13
3805    vpblendd           xm14, xm7, 0xa
3806    vpblendd           xm15, xm8, 0xa
3807    pmovsxbw            m14, xm14
3808    pmovsxbw            m15, xm15
3809    pblendvb            m14, m2, m10
3810    pblendvb            m15, m2, m6
3811    pextrd               r4, xm13, 2
3812    pshufb              m12, m13, m5
3813    pshufb              m13, m1
3814    lea                  r6, [r4+ssq*1]
3815    lea                 r11, [r4+ssq*2]
3816    lea                 r13, [r4+ss3q ]
3817    movu                xm7, [srcq+ssq*0]
3818    movu                xm9, [srcq+ssq*1]
3819    movu                xm8, [srcq+ssq*2]
3820    movu               xm10, [srcq+ss3q ]
3821    movu                xm1, [srcq+r4   ]
3822    movu                xm3, [srcq+r6   ]
3823    movu                xm2, [srcq+r11  ]
3824    movu                xm4, [srcq+r13  ]
3825    lea                srcq, [srcq+ssq*4]
3826    vinserti128          m7, [srcq+ssq*0], 1
3827    vinserti128          m9, [srcq+ssq*1], 1
3828    vinserti128          m8, [srcq+ssq*2], 1
3829    vinserti128         m10, [srcq+ss3q ], 1
3830    vinserti128          m1, [srcq+r4   ], 1
3831    vinserti128          m3, [srcq+r6   ], 1
3832    vinserti128          m2, [srcq+r11  ], 1
3833    vinserti128          m4, [srcq+r13  ], 1
3834    lea                srcq, [srcq+ssq*4]
3835    vpbroadcastb         m5, xm13
3836    psubb               m13, m5
3837    paddb               m12, m0
3838    paddb               m13, m0
3839    REPX    {pshufb x, m12}, m7, m9, m8, m10
3840    REPX   {pmaddwd x, m14}, m7, m9, m8, m10
3841    REPX    {pshufb x, m13}, m1, m2, m3, m4
3842    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
3843    mova                 m5, [rsp+0x00]
3844    movd                xm6, [rsp+0x30]
3845    phaddd               m7, m1
3846    phaddd               m9, m3
3847    phaddd               m8, m2
3848    phaddd              m10, m4
3849    REPX      {paddd x, m5}, m7, m9, m8, m10
3850    REPX     {psrad x, xm6}, m7, m9, m8, m10
3851    packssdw             m7, m9                 ; 0 1  4 5
3852    packssdw             m8, m10                ; 2 3  6 7
3853    vextracti128        xm9, m7, 1              ; 4 5
3854    vextracti128        xm3, m8, 1              ; 6 7
3855    shufps              xm4, xm7, xm8, q1032    ; 1 2
3856    shufps              xm5, xm8, xm9, q1032    ; 3 4
3857    shufps              xm6, xm9, xm3, q1032    ; 5 6
3858    psrldq             xm10, xm3, 8             ; 7 _
3859    punpcklwd           xm0, xm7, xm4   ; 01
3860    punpckhwd           xm7, xm4        ; 12
3861    punpcklwd           xm1, xm8, xm5   ; 23
3862    punpckhwd           xm8, xm5        ; 34
3863    punpcklwd           xm2, xm9, xm6   ; 45
3864    punpckhwd           xm9, xm6        ; 56
3865    punpcklwd           xm3, xm10       ; 67
3866    mova         [rsp+0x40], xm7
3867    mova         [rsp+0x50], xm8
3868    mova         [rsp+0x60], xm9
3869.w4_loop:
3870    and                 myd, 0x3ff
3871    mov                r11d, 64 << 24
3872    mov                r13d, myd
3873    shr                r13d, 6
3874    lea                r13d, [t1+r13]
3875    cmovnz             r11q, [base+subpel_filters+r13*8]
3876    movq                xm9, r11q
3877    pmovsxbw            xm9, xm9
3878    pshufd              xm7, xm9, q0000
3879    pshufd              xm8, xm9, q1111
3880    pmaddwd             xm4, xm0, xm7
3881    pmaddwd             xm5, xm1, xm8
3882    pshufd              xm7, xm9, q2222
3883    pshufd              xm9, xm9, q3333
3884    pmaddwd             xm6, xm2, xm7
3885    pmaddwd             xm8, xm3, xm9
3886%if isput
3887    mova                xm7, [rsp+0x20]
3888    movd                xm9, [rsp+0x38]
3889%else
3890    SWAP                 m7, m11
3891%endif
3892    paddd               xm4, xm5
3893    paddd               xm6, xm8
3894    paddd               xm4, xm6
3895    paddd               xm4, xm7
3896%if isput
3897    psrad               xm4, xm9
3898    packusdw            xm4, xm4
3899    pminuw              xm4, xm11
3900    movq             [dstq], xm4
3901    add                dstq, dsq
3902%else
3903    SWAP                m11, m7
3904    psrad               xm4, 6
3905    packssdw            xm4, xm4
3906    movq             [tmpq], xm4
3907    add                tmpq, 8
3908%endif
3909    dec                  hd
3910    jz .ret
3911    add                 myd, dyd
3912    test                myd, ~0x3ff
3913    jz .w4_loop
3914    mova                xm8, [rsp+0x00]
3915    movd                xm9, [rsp+0x30]
3916    movu                xm4, [srcq]
3917    movu                xm5, [srcq+r4]
3918    test                myd, 0x400
3919    jz .w4_skip_line
3920    mova                xm0, [rsp+0x40]
3921    mova         [rsp+0x40], xm1
3922    mova                xm1, [rsp+0x50]
3923    mova         [rsp+0x50], xm2
3924    mova                xm2, [rsp+0x60]
3925    mova         [rsp+0x60], xm3
3926    pshufb              xm4, xm12
3927    pshufb              xm5, xm13
3928    pmaddwd             xm4, xm14
3929    pmaddwd             xm5, xm15
3930    phaddd              xm4, xm5
3931    paddd               xm4, xm8
3932    psrad               xm4, xm9
3933    packssdw            xm4, xm4
3934    punpcklwd           xm3, xm10, xm4
3935    mova               xm10, xm4
3936    add                srcq, ssq
3937    jmp .w4_loop
3938.w4_skip_line:
3939    movu                xm6, [srcq+ssq*1]
3940    movu                xm7, [srcq+r6]
3941    movu                 m0, [rsp+0x50]
3942    pshufb              xm4, xm12
3943    pshufb              xm6, xm12
3944    pshufb              xm5, xm13
3945    pshufb              xm7, xm13
3946    pmaddwd             xm4, xm14
3947    pmaddwd             xm6, xm14
3948    pmaddwd             xm5, xm15
3949    pmaddwd             xm7, xm15
3950    mova         [rsp+0x40], m0
3951    phaddd              xm4, xm5
3952    phaddd              xm6, xm7
3953    paddd               xm4, xm8
3954    paddd               xm6, xm8
3955    psrad               xm4, xm9
3956    psrad               xm6, xm9
3957    packssdw            xm4, xm6
3958    punpcklwd           xm9, xm10, xm4
3959    mova         [rsp+0x60], xm9
3960    psrldq             xm10, xm4, 8
3961    mova                xm0, xm1
3962    mova                xm1, xm2
3963    mova                xm2, xm3
3964    punpcklwd           xm3, xm4, xm10
3965    lea                srcq, [srcq+ssq*2]
3966    jmp .w4_loop
3967    SWAP                m10, m13
3968%if isprep
3969    SWAP                m13, m11
3970%endif
3971.w8:
3972    mov    dword [rsp+0x80], 1
3973    movifprep   tmp_stridem, 16
3974    jmp .w_start
3975.w16:
3976    mov    dword [rsp+0x80], 2
3977    movifprep   tmp_stridem, 32
3978    jmp .w_start
3979.w32:
3980    mov    dword [rsp+0x80], 4
3981    movifprep   tmp_stridem, 64
3982    jmp .w_start
3983.w64:
3984    mov    dword [rsp+0x80], 8
3985    movifprep   tmp_stridem, 128
3986    jmp .w_start
3987.w128:
3988    mov    dword [rsp+0x80], 16
3989    movifprep   tmp_stridem, 256
3990.w_start:
3991    SWAP                m10, m12, m1
3992    SWAP                m11, m7
3993    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
3994%if isput
3995    movifnidn           dsm, dsq
3996    mova         [rsp+0xb0], xm7
3997%endif
3998    mova         [rsp+0x00], m10
3999    mova         [rsp+0x20], m13
4000    shr                 t0d, 16
4001    sub                srcq, 6
4002    pmaddwd              m8, [base+rescale_mul2]
4003    movd               xm15, t0d
4004    mov          [rsp+0x84], t0d
4005    mov          [rsp+0x88], srcq
4006    mov          [rsp+0x90], r0q ; dstq / tmpq
4007%if UNIX64
4008    mov                  hm, hd
4009%endif
4010    shl           dword dxm, 3 ; dx*8
4011    vpbroadcastd        m15, xm15
4012    paddd                m1, m8 ; mx+dx*[0-7]
4013    jmp .hloop
4014.hloop_prep:
4015    dec    dword [rsp+0x80]
4016    jz .ret
4017    add    qword [rsp+0x90], 16
4018    mov                  hd, hm
4019    vpbroadcastd         m8, dxm
4020    vpbroadcastd         m6, [base+pd_0x3ff]
4021    paddd                m1, m8, [rsp+0x40]
4022    vpbroadcastd        m15, [rsp+0x84]
4023    pxor                 m9, m9
4024    mov                srcq, [rsp+0x88]
4025    mov                 r0q, [rsp+0x90] ; dstq / tmpq
4026.hloop:
4027    vpbroadcastq        xm2, [base+pq_0x40000000]
4028    pand                 m5, m1, m6
4029    psrld                m5, 6
4030    paddd               m15, m5
4031    pcmpeqd              m5, m9
4032    vextracti128        xm7, m15, 1
4033    movq                 r6, xm15
4034    pextrq               r9, xm15, 1
4035    movq                r11, xm7
4036    pextrq               rX, xm7, 1
4037    mov                 r4d, r6d
4038    shr                  r6, 32
4039    mov                 r7d, r9d
4040    shr                  r9, 32
4041    mov                r10d, r11d
4042    shr                 r11, 32
4043    mov                r13d, rXd
4044    shr                  rX, 32
4045    mova         [rsp+0x40], m1
4046    movq               xm12, [base+subpel_filters+ r4*8]
4047    movq               xm13, [base+subpel_filters+ r6*8]
4048    movhps             xm12, [base+subpel_filters+ r7*8]
4049    movhps             xm13, [base+subpel_filters+ r9*8]
4050    movq               xm14, [base+subpel_filters+r10*8]
4051    movq               xm15, [base+subpel_filters+r11*8]
4052    movhps             xm14, [base+subpel_filters+r13*8]
4053    movhps             xm15, [base+subpel_filters+ rX*8]
4054    psrld                m1, 10
4055    vextracti128        xm7, m1, 1
4056    vextracti128        xm6, m5, 1
4057    movq         [rsp+0xa0], xm1
4058    movq         [rsp+0xa8], xm7
4059    movq                 r6, xm1
4060    pextrq              r11, xm1, 1
4061    movq                 r9, xm7
4062    pextrq               rX, xm7, 1
4063    mov                 r4d, r6d
4064    shr                  r6, 32
4065    mov                r10d, r11d
4066    shr                 r11, 32
4067    mov                 r7d, r9d
4068    shr                  r9, 32
4069    mov                r13d, rXd
4070    shr                  rX, 32
4071    pshufd              xm4, xm5, q2200
4072    pshufd              xm5, xm5, q3311
4073    pshufd              xm7, xm6, q2200
4074    pshufd              xm6, xm6, q3311
4075    pblendvb           xm12, xm2, xm4
4076    pblendvb           xm13, xm2, xm5
4077    pblendvb           xm14, xm2, xm7
4078    pblendvb           xm15, xm2, xm6
4079    pmovsxbw            m12, xm12
4080    pmovsxbw            m13, xm13
4081    pmovsxbw            m14, xm14
4082    pmovsxbw            m15, xm15
4083    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
4084    mova        [rsp+0x60], m0
4085    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
4086    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
4087    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
4088    mova                 m0, [rsp+0x60]
4089    vbroadcasti128       m9, [base+subpel_s_shuf8]
4090    mov                 myd, mym
4091    mov                 dyd, dym
4092    pshufb               m0, m9     ; 01a 01b
4093    pshufb               m1, m9     ; 23a 23b
4094    pshufb               m2, m9     ; 45a 45b
4095    pshufb               m3, m9     ; 67a 67b
4096.vloop:
4097    and                 myd, 0x3ff
4098    mov                 r6d, 64 << 24
4099    mov                 r4d, myd
4100    shr                 r4d, 6
4101    lea                 r4d, [t1+r4]
4102    cmovnz              r6q, [base+subpel_filters+r4*8]
4103    movq                xm9, r6q
4104    punpcklqdq          xm9, xm9
4105    pmovsxbw             m9, xm9
4106    pshufd               m8, m9, q0000
4107    pshufd               m7, m9, q1111
4108    pmaddwd              m4, m0, m8
4109    pmaddwd              m5, m1, m7
4110    pshufd               m8, m9, q2222
4111    pshufd               m9, m9, q3333
4112    pmaddwd              m6, m2, m8
4113    pmaddwd              m7, m3, m9
4114%if isput
4115    psrldq              xm8, xm11, 8
4116%endif
4117    paddd                m4, [rsp+0x20]
4118    paddd                m6, m7
4119    paddd                m4, m5
4120    paddd                m4, m6
4121%if isput
4122    psrad                m4, xm8
4123    vextracti128        xm5, m4, 1
4124    packusdw            xm4, xm5
4125    pminsw              xm4, [rsp+0xb0]
4126    mova             [dstq], xm4
4127    add                dstq, dsm
4128%else
4129    psrad                m4, 6
4130    vextracti128        xm5, m4, 1
4131    packssdw            xm4, xm5
4132    mova             [tmpq], xm4
4133    add                tmpq, tmp_stridem
4134%endif
4135    dec                  hd
4136    jz .hloop_prep
4137    add                 myd, dyd
4138    test                myd, ~0x3ff
4139    jz .vloop
4140    test                myd, 0x400
4141    mov          [rsp+0x60], myd
4142    mov                 r4d, [rsp+0xa0]
4143    mov                 r6d, [rsp+0xa4]
4144    mov                 r7d, [rsp+0xa8]
4145    mov                 r9d, [rsp+0xac]
4146    jz .skip_line
4147    vbroadcasti128       m9, [base+wswap]
4148    movu                xm4, [srcq+ r4*2]
4149    movu                xm5, [srcq+ r6*2]
4150    movu                xm6, [srcq+ r7*2]
4151    movu                xm7, [srcq+ r9*2]
4152    vinserti128          m4, [srcq+r10*2], 1
4153    vinserti128          m5, [srcq+r11*2], 1
4154    vinserti128          m6, [srcq+r13*2], 1
4155    vinserti128          m7, [srcq+ rX*2], 1
4156    add                srcq, ssq
4157    mov                 myd, [rsp+0x60]
4158    mov                 dyd, dym
4159    pshufb               m0, m9
4160    pshufb               m1, m9
4161    pshufb               m2, m9
4162    pshufb               m3, m9
4163    pmaddwd              m4, m12
4164    pmaddwd              m5, m13
4165    pmaddwd              m6, m14
4166    pmaddwd              m7, m15
4167    phaddd               m4, m5
4168    phaddd               m6, m7
4169    phaddd               m4, m6
4170    paddd                m4, m10
4171    psrad                m4, xm11
4172    pslld                m4, 16
4173    pblendw              m0, m1, 0xaa
4174    pblendw              m1, m2, 0xaa
4175    pblendw              m2, m3, 0xaa
4176    pblendw              m3, m4, 0xaa
4177    jmp .vloop
4178.skip_line:
4179    mova                 m0, m1
4180    mova                 m1, m2
4181    mova                 m2, m3
4182    MC_8TAP_SCALED_H      3, 10, 4, 5, 6, 7, 8, 9, 1
4183    vbroadcasti128       m9, [base+subpel_s_shuf8]
4184    mov                 myd, [rsp+0x60]
4185    mov                 dyd, dym
4186    pshufb               m3, m9
4187    jmp .vloop
4188    SWAP                 m1, m12, m10
4189    SWAP                 m7, m11
4190.dy1:
4191    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
4192    add                  wq, base_reg
4193    jmp                  wq
; .dy1_w2: 2-pixel-wide column, vertical step 1 (put only -- prep never
; uses w=2). Two per-pixel horizontal phases, one shared vertical filter.
4194%if isput
4195.dy1_w2:
4196    mov                 myd, mym
4197    movzx               t0d, t0b
4198    sub                srcq, 2
4199    movd               xm15, t0d
4200    punpckldq            m8, m9, m8
4201    paddd               m10, m8 ; mx+dx*[0-1]
4202    vpbroadcastd       xm14, [base+pq_0x40000000+2]
4203    vpbroadcastd       xm15, xm15
; per-column horizontal filter index = (mx >> 6) & 0x3ff (xm6 holds the
; mask, set up before this label -- confirm in the surrounding macro)
4204    pand                xm8, xm10, xm6
4205    psrld               xm8, 6
4206    paddd              xm15, xm8
4207    movd                r4d, xm15
4208    pextrd              r6d, xm15, 1
4209    vbroadcasti128       m5, [base+bdct_lb_q]
4210    vbroadcasti128       m6, [base+subpel_s_shuf2]
; load the two 4-tap middle portions of the 8-tap filters (+2 skips the
; outer taps), one per column
4211    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
4212    vpbroadcastd         m4, [base+subpel_filters+r6*8+2]
4213    pcmpeqd             xm8, xm9
4214    psrld               m10, 10
4215    paddd               m10, m10
; rows 0..3 of the 7-row vertical history
4216    movu                xm0, [srcq+ssq*0]
4217    movu                xm1, [srcq+ssq*1]
4218    movu                xm2, [srcq+ssq*2]
4219    movu                xm3, [srcq+ss3q ]
4220    lea                srcq, [srcq+ssq*4]
; select the vertical filter: 64<<24 is the identity filter used when
; (my >> 6) is zero, otherwise look up the 8-tap coefficients
4221    shr                 myd, 6
4222    mov                 r4d, 64 << 24
4223    lea                 myd, [t1+myq]
4224    cmovnz              r4q, [base+subpel_filters+myq*8]
4225    pshufb              m10, m5
4226    paddb               m10, m6
; merge the two per-column filters, substituting the edge filter (xm14)
; where the pcmpeqd mask says the position hit the identity case
4227    vpblendd           xm15, xm4, 0xa
4228    pblendvb           xm15, xm14, xm8
4229    pmovsxbw            m15, xm15
4230    vinserti128          m0, [srcq+ssq*0], 1
4231    vinserti128          m1, [srcq+ssq*1], 1
4232    vinserti128          m2, [srcq+ssq*2], 1
4233    add                srcq, ss3q
; expand the packed int8 vertical coefficients to 4 int16 pair-registers
4234    movq                xm6, r4q
4235    pmovsxbw            xm6, xm6
4236    pshufd              xm8, xm6, q0000
4237    pshufd              xm9, xm6, q1111
4238    pshufd             xm14, xm6, q2222
4239    pshufd              xm6, xm6, q3333
; horizontal filter of the 7 history rows
4240    REPX    {pshufb x, m10}, m0, m1, m2
4241    pshufb              xm3, xm10
4242    REPX   {pmaddwd x, m15}, m0, m1, m2
4243    pmaddwd             xm3, xm15
4244    phaddd               m0, m1
4245    phaddd               m2, m3
4246    paddd                m0, m12
4247    paddd                m2, m12
4248    psrad                m0, xm7
4249    psrad                m2, xm7
4250    packssdw             m0, m2
; interleave rows into the sliding pairs consumed by pmaddwd below
4251    vextracti128        xm1, m0, 1
4252    palignr             xm2, xm1, xm0, 4
4253    pshufd              xm4, xm1, q2121
4254    punpcklwd           xm3, xm0, xm2       ; 01 12
4255    punpckhwd           xm0, xm2            ; 23 34
4256    punpcklwd           xm2, xm1, xm4       ; 45 56
; main loop: 2 output rows per iteration (h-filter 2 new rows, then
; 8-tap vertical across the row-pair registers)
4257.dy1_w2_loop:
4258    movu                xm1, [srcq+ssq*0]
4259    movu                xm5, [srcq+ssq*1]
4260    lea                srcq, [srcq+ssq*2]
4261    pshufb              xm1, xm10
4262    pshufb              xm5, xm10
4263    pmaddwd             xm1, xm15
4264    pmaddwd             xm5, xm15
4265    phaddd              xm1, xm5
4266    pmaddwd             xm5, xm3, xm8
4267    mova                xm3, xm0
4268    pmaddwd             xm0, xm9
4269    paddd               xm1, xm12
4270    psrad               xm1, xm7
4271    packssdw            xm1, xm1
4272    paddd               xm5, xm0
4273    mova                xm0, xm2
4274    pmaddwd             xm2, xm14
4275    paddd               xm5, xm2
4276    palignr             xm2, xm1, xm4, 12
4277    punpcklwd           xm2, xm1            ; 67 78
4278    pmaddwd             xm4, xm2, xm6
4279    paddd               xm5, xm13
4280    paddd               xm5, xm4
4281    mova                xm4, xm1
; vertical shift amount lives in the high half of xm7
4282    psrldq              xm1, xm7, 8
4283    psrad               xm5, xm1
; round, clip to pixel max (xm11) and store 2 rows of 2 pixels
4284    packusdw            xm5, xm5
4285    pminsw              xm5, xm11
4286    movd       [dstq+dsq*0], xm5
4287    pextrd     [dstq+dsq*1], xm5, 1
4288    lea                dstq, [dstq+dsq*2]
4289    sub                  hd, 2
4290    jg .dy1_w2_loop
4291    RET
4292%endif
; .dy1_w4: 4-pixel-wide column, vertical step 1. Four horizontal phases;
; the h-filter state (rounder m12, per-phase coeffs, shift) is spilled to
; the stack because all 16 ymm registers are needed.
4293.dy1_w4:
4294    mov                 myd, mym
4295%if isput
4296    mova         [rsp+0x50], xm11
4297%endif
; spill h-rounder, v-rounder and h-shift for reuse inside the loop
4298    mova         [rsp+0x00], m12
4299    mova         [rsp+0x20], m13
4300    mova         [rsp+0x40], xm7
4301    vbroadcasti128       m7, [base+rescale_mul]
4302    movzx               t0d, t0b
4303    sub                srcq, 2
4304    movd               xm15, t0d
4305    pmaddwd              m8, m7
4306    vpbroadcastq         m2, [base+pq_0x40000000+1]
4307    vpbroadcastd       xm15, xm15
4308    SWAP                m13, m10
4309    paddd               m13, m8 ; mx+dx*[0-3]
; per-column filter indices from the subpel fraction of mx
4310    pand                 m6, m13
4311    psrld                m6, 6
4312    paddd              xm15, xm6
4313    movd                r4d, xm15
4314    pextrd              r6d, xm15, 1
4315    pextrd             r11d, xm15, 2
4316    pextrd             r13d, xm15, 3
4317    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
4318    vbroadcasti128       m1, [base+bdct_lb_q+16]
4319    vbroadcasti128       m4, [base+subpel_s_shuf2]
; four 4-tap filter fragments, one per output column
4320    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
4321    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
4322    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
4323    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
4324    pcmpeqd              m6, m9
4325    punpckldq           m10, m6, m6
4326    punpckhdq            m6, m6
4327    psrld               m13, 10
4328    paddd               m13, m13
4329    vpblendd           xm14, xm7, 0xa
4330    vpblendd           xm15, xm8, 0xa
4331    pmovsxbw            m14, xm14
4332    pmovsxbw            m15, xm15
4333    pblendvb            m14, m2, m10
4334    pblendvb            m15, m2, m6
; r4/r6/r11/r13 become per-column byte offsets into the source row
4335    pextrd               r4, xm13, 2
4336    pshufb              m12, m13, m5
4337    pshufb              m13, m1
4338    lea                  r6, [r4+ssq*2]
4339    lea                 r11, [r4+ssq*1]
4340    lea                 r13, [r4+ss3q ]
; load rows 0..6 of the vertical history, two rows per ymm
4341    movu                xm0, [srcq+ssq*0]
4342    movu                xm7, [srcq+r4   ]
4343    movu                xm1, [srcq+ssq*2]
4344    movu                xm8, [srcq+r6   ]
4345    vinserti128          m0, [srcq+ssq*1], 1 ; 0 1
4346    vinserti128          m7, [srcq+r11  ], 1
4347    vinserti128          m1, [srcq+ss3q ], 1 ; 2 3
4348    vinserti128          m8, [srcq+r13  ], 1
4349    lea                srcq, [srcq+ssq*4]
4350    movu                xm2, [srcq+ssq*0]
4351    movu                xm9, [srcq+r4   ]
4352    movu                xm3, [srcq+ssq*2]    ; 6 _
4353    movu               xm10, [srcq+r6   ]
4354    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
4355    vinserti128          m9, [srcq+r11  ], 1
4356    lea                srcq, [srcq+ss3q ]
; rebase the second shuffle table relative to its first entry
4357    vpbroadcastb         m5, xm13
4358    psubb               m13, m5
4359    paddb               m12, m4
4360    paddb               m13, m4
4361    mova                 m5, [rsp+0x00]
4362    movd                xm6, [rsp+0x40]
; horizontal filter of all 7 history rows
4363    pshufb               m0, m12
4364    pshufb               m1, m12
4365    pmaddwd              m0, m14
4366    pmaddwd              m1, m14
4367    pshufb               m7, m13
4368    pshufb               m8, m13
4369    pmaddwd              m7, m15
4370    pmaddwd              m8, m15
4371    pshufb               m2, m12
4372    pshufb              xm3, xm12
4373    pmaddwd              m2, m14
4374    pmaddwd             xm3, xm14
4375    pshufb               m9, m13
4376    pshufb             xm10, xm13
4377    pmaddwd              m9, m15
4378    pmaddwd            xm10, xm15
4379    phaddd               m0, m7
4380    phaddd               m1, m8
4381    phaddd               m2, m9
4382    phaddd              xm3, xm10
4383    paddd                m0, m5
4384    paddd                m1, m5
4385    paddd                m2, m5
4386    paddd               xm3, xm5
4387    psrad                m0, xm6
4388    psrad                m1, xm6
4389    psrad                m2, xm6
4390    psrad               xm3, xm6
; build the odd-row lanes for the row-pair interleave below
4391    vperm2i128           m4, m0, m1, 0x21 ; 1 2
4392    vperm2i128           m5, m1, m2, 0x21 ; 3 4
4393    vperm2i128           m6, m2, m3, 0x21 ; 5 6
; vertical filter selection (identity when my fraction is 0)
4394    shr                 myd, 6
4395    mov                r13d, 64 << 24
4396    lea                 myd, [t1+myq]
4397    cmovnz             r13q, [base+subpel_filters+myq*8]
4398    pslld                m4, 16
4399    pslld                m5, 16
4400    pslld                m6, 16
4401    pblendw              m0, m4, 0xaa ; 01 12
4402    pblendw              m1, m5, 0xaa ; 23 34
4403    pblendw              m2, m6, 0xaa ; 45 56
4404    movq               xm10, r13q
4405    punpcklqdq         xm10, xm10
4406    pmovsxbw            m10, xm10
; split 8-tap vertical coefficients into 4 pair-wise registers
4407    pshufd               m7, m10, q0000
4408    pshufd               m8, m10, q1111
4409    pshufd               m9, m10, q2222
4410    pshufd              m10, m10, q3333
; main loop: 2 output rows per iteration
4411.dy1_w4_loop:
4412    movu               xm11, [srcq+ssq*0]
4413    movu                xm6, [srcq+r4   ]
4414    vinserti128         m11, [srcq+ssq*1], 1
4415    vinserti128          m6, [srcq+r11  ], 1
4416    lea                srcq, [srcq+ssq*2]
4417    pmaddwd              m4, m0, m7
4418    pmaddwd              m5, m1, m8
4419    pshufb              m11, m12
4420    pshufb               m6, m13
4421    pmaddwd             m11, m14
4422    pmaddwd              m6, m15
4423    paddd                m4, [rsp+0x20]
4424    phaddd              m11, m6
4425    pmaddwd              m6, m2, m9
4426    paddd               m11, [rsp+0x00]
4427    psrad               m11, [rsp+0x40]
; rotate the row-pair history
4428    mova                 m0, m1
4429    mova                 m1, m2
4430    paddd                m5, m6
4431    paddd                m4, m5
4432    vinserti128          m2, m3, xm11, 1
4433    pslld                m3, m11, 16
4434    pblendw              m2, m3, 0xaa   ; 67 78
4435    pmaddwd              m5, m2, m10
4436    vextracti128        xm3, m11, 1
4437    paddd                m4, m5
4438%if isput
; put: round by the bitdepth-dependent shift, clip to pixel max, store
4439    psrad                m4, [rsp+0x48]
4440    vextracti128        xm5, m4, 1
4441    packusdw            xm4, xm5
4442    pminsw              xm4, [rsp+0x50]
4443    movq       [dstq+dsq*0], xm4
4444    movhps     [dstq+dsq*1], xm4
4445    lea                dstq, [dstq+dsq*2]
%else
; prep: fixed >>6 to intermediate precision, signed pack
4447    psrad                m4, 6
4448    vextracti128        xm5, m4, 1
4449    packssdw            xm4, xm5
4450    mova             [tmpq], xm4
4451    add                tmpq, 16
4452%endif
4453    sub                  hd, 2
4454    jg .dy1_w4_loop
4455    MC_8TAP_SCALED_RET
; assemble-time rename restoring the mapping changed by SWAP m13,m10 above
4456    SWAP                 m10, m13
; dy1, w >= 8: all wide sizes share .dy1_w_start and iterate over 8-pixel
; column slices; [rsp+0xa0] = number of slices (w/8), tmp_stridem = prep
; output stride in bytes (w*2).
4457.dy1_w8:
4458    mov    dword [rsp+0xa0], 1
4459    movifprep   tmp_stridem, 16
4460    jmp .dy1_w_start
4461.dy1_w16:
4462    mov    dword [rsp+0xa0], 2
4463    movifprep   tmp_stridem, 32
4464    jmp .dy1_w_start
4465.dy1_w32:
4466    mov    dword [rsp+0xa0], 4
4467    movifprep   tmp_stridem, 64
4468    jmp .dy1_w_start
4469.dy1_w64:
4470    mov    dword [rsp+0xa0], 8
4471    movifprep   tmp_stridem, 128
4472    jmp .dy1_w_start
4473.dy1_w128:
4474    mov    dword [rsp+0xa0], 16
4475    movifprep   tmp_stridem, 256
; fall through to .dy1_w_start
; .dy1_w_start: shared wide path, vertical step 1. Outer loop (.dy1_hloop)
; walks 8-column slices; inner loop (.dy1_vloop) produces one output row
; per iteration, fetching one new source row and rotating the interleaved
; row-pair registers in place via the wswap shuffle.
4476.dy1_w_start:
4477    SWAP                m10, m12, m1
4478    SWAP                m11, m7
4479    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
4480    mov                 myd, mym
4481%if isput
4482 %define dsm [rsp+0xb8]
4483    movifnidn           dsm, dsq
4484    mova         [rsp+0xc0], xm7
4485%else
4486 %if UNIX64
4487  %define hm [rsp+0xb8]
4488 %endif
4489%endif
; spill h-rounder / v-rounder / h-shift; reloaded every slice
4490    mova         [rsp+0x00], m10
4491    mova         [rsp+0x20], m13
4492    mova         [rsp+0x40], xm11
4493    shr                 t0d, 16
4494    sub                srcq, 6
; vertical filter: identity (64<<24) when the my fraction is 0
4495    shr                 myd, 6
4496    mov                 r4d, 64 << 24
4497    lea                 myd, [t1+myq]
4498    cmovnz              r4q, [base+subpel_filters+myq*8]
4499    pmaddwd              m8, [base+rescale_mul2]
4500    movd               xm15, t0d
; save loop-invariant state for .dy1_hloop_prep restarts
4501    mov          [rsp+0xa4], t0d
4502    mov          [rsp+0xa8], srcq
4503    mov          [rsp+0xb0], r0q ; dstq / tmpq
4504%if UNIX64
4505    mov                  hm, hd
4506%endif
4507    shl           dword dxm, 3 ; dx*8
4508    vpbroadcastd        m15, xm15
4509    paddd                m1, m8 ; mx+dx*[0-7]
4510    movq                xm0, r4q
4511    pmovsxbw            xm0, xm0
; packed int16 vertical coefficients, broadcast per-tap inside the loop
4512    mova         [rsp+0x50], xm0
4513    jmp .dy1_hloop
; per-slice restart: advance dst/tmp by 16 bytes, recompute mx for the
; next 8 columns from the saved state
4514.dy1_hloop_prep:
4515    dec    dword [rsp+0xa0]
4516    jz .ret
4517    add    qword [rsp+0xb0], 16
4518    mov                  hd, hm
4519    vpbroadcastd         m8, dxm
4520    vpbroadcastd         m6, [base+pd_0x3ff]
4521    paddd                m1, m8, [rsp+0x60]
4522    vpbroadcastd        m15, [rsp+0xa4]
4523    pxor                 m9, m9
4524    mov                srcq, [rsp+0xa8]
4525    mov                 r0q, [rsp+0xb0] ; dstq / tmpq
4526    mova                m10, [rsp+0x00]
4527    mova               xm11, [rsp+0x40]
4528.dy1_hloop:
4529    vpbroadcastq        xm2, [base+pq_0x40000000]
; 8 per-column horizontal filter indices from the mx fractions
4530    pand                 m5, m1, m6
4531    psrld                m5, 6
4532    paddd               m15, m5
4533    pcmpeqd              m5, m9
; scatter the 8 indices into scalar regs (r4..r13, rX)
4534    vextracti128        xm7, m15, 1
4535    movq                 r6, xm15
4536    pextrq               r9, xm15, 1
4537    movq                r11, xm7
4538    pextrq               rX, xm7, 1
4539    mov                 r4d, r6d
4540    shr                  r6, 32
4541    mov                 r7d, r9d
4542    shr                  r9, 32
4543    mov                r10d, r11d
4544    shr                 r11, 32
4545    mov                r13d, rXd
4546    shr                  rX, 32
4547    mova         [rsp+0x60], m1
; gather 8 distinct 8-tap horizontal filters, two per xmm
4548    movq               xm12, [base+subpel_filters+ r4*8]
4549    movq               xm13, [base+subpel_filters+ r6*8]
4550    movhps             xm12, [base+subpel_filters+ r7*8]
4551    movhps             xm13, [base+subpel_filters+ r9*8]
4552    movq               xm14, [base+subpel_filters+r10*8]
4553    movq               xm15, [base+subpel_filters+r11*8]
4554    movhps             xm14, [base+subpel_filters+r13*8]
4555    movhps             xm15, [base+subpel_filters+ rX*8]
; per-column integer source offsets (mx >> 10)
4556    psrld                m1, 10
4557    vextracti128        xm7, m1, 1
4558    vextracti128        xm6, m5, 1
4559    movq                 r6, xm1
4560    pextrq              r11, xm1, 1
4561    movq                 r9, xm7
4562    pextrq               rX, xm7, 1
4563    mov                 r4d, r6d
4564    shr                  r6, 32
4565    mov                r10d, r11d
4566    shr                 r11, 32
4567    mov                 r7d, r9d
4568    shr                  r9, 32
4569    mov                r13d, rXd
4570    shr                  rX, 32
; substitute the identity-position filter (pq_0x40000000) where needed
4571    pshufd              xm4, xm5, q2200
4572    pshufd              xm5, xm5, q3311
4573    pshufd              xm7, xm6, q2200
4574    pshufd              xm6, xm6, q3311
4575    pblendvb           xm12, xm2, xm4
4576    pblendvb           xm13, xm2, xm5
4577    pblendvb           xm14, xm2, xm7
4578    pblendvb           xm15, xm2, xm6
4579    pmovsxbw            m12, xm12
4580    pmovsxbw            m13, xm13
4581    pmovsxbw            m14, xm14
4582    pmovsxbw            m15, xm15
; horizontal filter of the initial 8-row vertical history
4583    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
4584    mova         [rsp+0x80], m0
4585    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
4586    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
4587    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
4588    mova                 m0, [rsp+0x80]
4589    vbroadcasti128       m7, [base+subpel_s_shuf8]
; broadcast the 4 vertical coefficient pairs saved at [rsp+0x50]
4590    vpbroadcastd         m8, [rsp+0x50]
4591    vpbroadcastd         m9, [rsp+0x54]
4592    vpbroadcastd        m10, [rsp+0x58]
4593    vpbroadcastd        m11, [rsp+0x5c]
4594    pshufb               m0, m7     ; 01a 01b
4595    pshufb               m1, m7     ; 23a 23b
4596    pshufb               m2, m7     ; 45a 45b
4597    pshufb               m3, m7     ; 67a 67b
; inner loop: one output row per iteration
4598.dy1_vloop:
4599    pmaddwd              m4, m0, m8
4600    pmaddwd              m5, m1, m9
4601    pmaddwd              m6, m2, m10
4602    pmaddwd              m7, m3, m11
4603    paddd                m4, [rsp+0x20]
4604    paddd                m6, m7
4605    paddd                m4, m5
4606    paddd                m4, m6
4607%if isput
4608    psrad                m4, [rsp+0x48]
4609    vextracti128        xm5, m4, 1
4610    packusdw            xm4, xm5
4611    pminsw              xm4, [rsp+0xc0]
4612    mova             [dstq], xm4
4613    add                dstq, dsm
4614%else
4615    psrad                m4, 6
4616    vextracti128        xm5, m4, 1
4617    packssdw            xm4, xm5
4618    mova             [tmpq], xm4
4619    add                tmpq, tmp_stridem
4620%endif
4621    dec                  hd
4622    jz .dy1_hloop_prep
; step 1 row: swap word lanes so the oldest row can be overwritten by the
; newly filtered one, then h-filter the next source row into m4
4623    vbroadcasti128       m7, [base+wswap]
4624    pshufb               m0, m7
4625    pshufb               m1, m7
4626    pshufb               m2, m7
4627    pshufb               m3, m7
4628    movu                xm4, [srcq+ r4*2]
4629    movu                xm5, [srcq+ r6*2]
4630    movu                xm6, [srcq+ r7*2]
4631    movu                xm7, [srcq+ r9*2]
4632    vinserti128          m4, [srcq+r10*2], 1
4633    vinserti128          m5, [srcq+r11*2], 1
4634    vinserti128          m6, [srcq+r13*2], 1
4635    vinserti128          m7, [srcq+ rX*2], 1
4636    add                srcq, ssq
4637    pmaddwd              m4, m12
4638    pmaddwd              m5, m13
4639    pmaddwd              m6, m14
4640    pmaddwd              m7, m15
4641    phaddd               m4, m5
4642    phaddd               m6, m7
4643    phaddd               m4, m6
4644    paddd                m4, [rsp+0x00]
4645    psrad                m4, [rsp+0x40]
4646    pslld                m4, 16
4647    pblendw              m0, m1, 0xaa
4648    pblendw              m1, m2, 0xaa
4649    pblendw              m2, m3, 0xaa
4650    pblendw              m3, m4, 0xaa
4651    jmp .dy1_vloop
; assemble-time renames restoring the mapping for the .dy2 section below
4652    SWAP                 m1, m12, m10
4653    SWAP                 m7, m11
; .dy2: vertical step == 2 (every other source row) -> width dispatch
4654.dy2:
4655    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
4656    add                  wq, base_reg
4657    jmp                  wq
; .dy2_w2: 2-pixel-wide column, vertical step 2 (put only). Mirrors
; .dy1_w2 but loads rows in stride-2 pairs so each loop iteration
; consumes 4 source rows for 2 output rows.
4658%if isput
4659.dy2_w2:
4660    mov                 myd, mym
4661    movzx               t0d, t0b
4662    sub                srcq, 2
4663    movd               xm15, t0d
4664    punpckldq            m8, m9, m8
4665    paddd               m10, m8 ; mx+dx*[0-1]
4666    vpbroadcastd       xm14, [base+pq_0x40000000+2]
4667    vpbroadcastd       xm15, xm15
; per-column horizontal filter indices from the mx fractions
4668    pand                xm8, xm10, xm6
4669    psrld               xm8, 6
4670    paddd              xm15, xm8
4671    movd                r4d, xm15
4672    pextrd              r6d, xm15, 1
4673    vbroadcasti128       m5, [base+bdct_lb_q]
4674    vbroadcasti128       m6, [base+subpel_s_shuf2]
4675    vpbroadcastd       xm15, [base+subpel_filters+r4*8+2]
4676    vpbroadcastd        xm4, [base+subpel_filters+r6*8+2]
4677    pcmpeqd             xm8, xm9
4678    psrld               m10, 10
4679    paddd               m10, m10
; history rows 0,2,4 in the low lanes; 1,3,5 inserted in the high lanes
4680    movu                xm0, [srcq+ssq*0]
4681    movu                xm1, [srcq+ssq*2]
4682    movu                xm2, [srcq+ssq*4]
4683    pshufb              m10, m5
4684    paddb               m10, m6
4685    vpblendd           xm15, xm4, 0xa
4686    pblendvb           xm15, xm14, xm8
4687    pmovsxbw            m15, xm15
4688    vinserti128          m0, [srcq+ssq*1], 1 ; 0 1
4689    vinserti128          m1, [srcq+ss3q ], 1 ; 2 3
4690    lea                srcq, [srcq+ssq*4]
4691    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
4692    lea                srcq, [srcq+ssq*2]
; vertical filter selection (identity when the my fraction is 0)
4693    shr                 myd, 6
4694    mov                 r4d, 64 << 24
4695    lea                 myd, [t1+myq]
4696    cmovnz              r4q, [base+subpel_filters+myq*8]
; horizontal filter of the 6 history rows
4697    pshufb               m0, m10
4698    pshufb               m1, m10
4699    pshufb               m2, m10
4700    pmaddwd              m0, m15
4701    pmaddwd              m1, m15
4702    pmaddwd              m2, m15
4703    movq                xm6, r4q
4704    pmovsxbw            xm6, xm6
4705    phaddd               m0, m1
4706    phaddd               m1, m2
4707    paddd                m0, m12
4708    paddd                m1, m12
4709    psrad                m0, xm7
4710    psrad                m1, xm7
4711    packssdw             m0, m1             ; 0 2 2 4  1 3 3 5
4712    vextracti128        xm1, m0, 1
; split 8-tap vertical coefficients into 4 pair-wise registers
4713    pshufd              xm8, xm6, q0000
4714    pshufd              xm9, xm6, q1111
4715    pshufd             xm14, xm6, q2222
4716    pshufd              xm6, xm6, q3333
4717    punpcklwd           xm2, xm0, xm1       ; 01 23
4718    punpckhwd           xm1, xm0, xm1       ; 23 45
; main loop: 2 output rows (4 source rows) per iteration
4719.dy2_w2_loop:
4720    movu                xm3, [srcq+ssq*0]
4721    movu                xm5, [srcq+ssq*2]
4722    vinserti128          m3, [srcq+ssq*1], 1 ; 6 7
4723    vinserti128          m5, [srcq+ss3q ], 1 ; 8 9
4724    lea                srcq, [srcq+ssq*4]
4725    pmaddwd             xm4, xm2, xm8
4726    pmaddwd             xm1, xm9
4727    pshufb               m3, m10
4728    pshufb               m5, m10
4729    pmaddwd              m3, m15
4730    pmaddwd              m5, m15
4731    phaddd               m3, m5
4732    paddd               xm4, xm1
4733    paddd                m3, m12
4734    psrad                m3, xm7
4735    packssdw             m3, m3
4736    pshufd               m3, m3, q2100
4737    palignr              m0, m3, m0, 12     ; 4 6 6 8  5 7 7 9
4738    vextracti128        xm1, m0, 1
4739    punpcklwd           xm2, xm0, xm1       ; 45 67
4740    punpckhwd           xm1, xm0, xm1       ; 67 89
4741    pmaddwd             xm3, xm2, xm14
4742    pmaddwd             xm5, xm1, xm6
4743    paddd               xm4, xm13
4744    paddd               xm4, xm3
; vertical shift amount lives in the high half of xm7
4745    psrldq              xm3, xm7, 8
4746    paddd               xm4, xm5
4747    psrad               xm4, xm3
; round, clip to pixel max (xm11) and store 2 rows of 2 pixels
4748    packusdw            xm4, xm4
4749    pminsw              xm4, xm11
4750    movd       [dstq+dsq*0], xm4
4751    pextrd     [dstq+dsq*1], xm4, 1
4752    lea                dstq, [dstq+dsq*2]
4753    sub                  hd, 2
4754    jg .dy2_w2_loop
4755    RET
4756%endif
; .dy2_w4: 4-pixel-wide column, vertical step 2. Mirrors .dy1_w4 with the
; row ring advanced by two source rows per output row.
4757.dy2_w4:
4758    mov                 myd, mym
4759%if isput
4760    mova         [rsp+0x50], xm11
4761%endif
; spill h-rounder, v-rounder and h-shift for reuse inside the loop
4762    mova         [rsp+0x00], m12
4763    mova         [rsp+0x20], m13
4764    mova         [rsp+0x40], xm7
4765    vbroadcasti128       m7, [base+rescale_mul]
4766    movzx               t0d, t0b
4767    sub                srcq, 2
4768    movd               xm15, t0d
4769    pmaddwd              m8, m7
4770    vpbroadcastq         m2, [base+pq_0x40000000+1]
4771    vpbroadcastd       xm15, xm15
4772    SWAP                m13, m10
4773    paddd               m13, m8 ; mx+dx*[0-3]
; four per-column horizontal filter indices
4774    pand                 m6, m13
4775    psrld                m6, 6
4776    paddd              xm15, xm6
4777    movd                r4d, xm15
4778    pextrd              r6d, xm15, 1
4779    pextrd             r11d, xm15, 2
4780    pextrd             r13d, xm15, 3
4781    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
4782    vbroadcasti128       m1, [base+bdct_lb_q+16]
4783    vbroadcasti128       m4, [base+subpel_s_shuf2]
4784    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
4785    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
4786    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
4787    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
; vertical filter selection (identity when the my fraction is 0)
4788    shr                 myd, 6
4789    mov                r13d, 64 << 24
4790    lea                 myd, [t1+myq]
4791    cmovnz             r13q, [base+subpel_filters+myq*8]
4792    pcmpeqd              m6, m9
4793    punpckldq           m11, m6, m6
4794    punpckhdq            m6, m6
4795    psrld               m13, 10
4796    paddd               m13, m13
4797    vpblendd           xm14, xm7, 0xa
4798    vpblendd           xm15, xm8, 0xa
4799    pmovsxbw            m14, xm14
4800    pmovsxbw            m15, xm15
4801    movq               xm10, r13q
4802    pblendvb            m14, m2, m11
4803    pblendvb            m15, m2, m6
; r4/r6/r11/r13 become per-column byte offsets into the source row
4804    pextrd               r4, xm13, 2
4805    pshufb              m12, m13, m5
4806    pshufb              m13, m1
4807    lea                  r6, [r4+ssq*1]
4808    lea                 r11, [r4+ssq*2]
4809    lea                 r13, [r4+ss3q ]
; load history rows 0..5, even rows in low lanes, odd rows inserted high
4810    movu                xm0, [srcq+ssq*0]
4811    movu                xm7, [srcq+r4   ]
4812    movu                xm1, [srcq+ssq*1]
4813    movu                xm8, [srcq+r6   ]
4814    vinserti128          m0, [srcq+ssq*2], 1 ; 0 2
4815    vinserti128          m7, [srcq+r11  ], 1
4816    vinserti128          m1, [srcq+ss3q ], 1 ; 1 3
4817    vinserti128          m8, [srcq+r13  ], 1
4818    lea                srcq, [srcq+ssq*4]
4819    movu                xm2, [srcq+ssq*0]
4820    movu                xm9, [srcq+r4   ]
4821    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
4822    vinserti128          m9, [srcq+r6   ], 1
4823    lea                srcq, [srcq+ssq*2]
; rebase the second shuffle table relative to its first entry
4824    vpbroadcastb         m5, xm13
4825    psubb               m13, m5
4826    paddb               m12, m4
4827    paddb               m13, m4
4828    mova                 m5, [rsp+0x00]
4829    movd                xm6, [rsp+0x40]
; horizontal filter of the 6 history rows
4830    pshufb               m0, m12
4831    pshufb               m1, m12
4832    pshufb               m2, m12
4833    pmaddwd              m0, m14
4834    pmaddwd              m1, m14
4835    pmaddwd              m2, m14
4836    pshufb               m7, m13
4837    pshufb               m8, m13
4838    pshufb               m9, m13
4839    pmaddwd              m7, m15
4840    pmaddwd              m8, m15
4841    pmaddwd              m9, m15
4842    punpcklqdq         xm10, xm10
4843    pmovsxbw            m10, xm10
4844    phaddd               m0, m7
4845    phaddd               m1, m8
4846    phaddd               m2, m9
4847    paddd                m0, m5
4848    paddd                m1, m5
4849    paddd                m2, m5
4850    psrad                m0, xm6
4851    psrad                m1, xm6
4852    psrad                m2, xm6
; interleave even/odd rows into the pair layout used by pmaddwd
4853    vperm2i128           m3, m0, m2, 0x21 ; 2 4
4854    vperm2i128           m2, m1, 0x13     ; 3 5
; split 8-tap vertical coefficients into 4 pair-wise registers
4855    pshufd               m7, m10, q0000
4856    pshufd               m8, m10, q1111
4857    pshufd               m9, m10, q2222
4858    pshufd              m10, m10, q3333
4859    packssdw             m0, m3 ; 0 2  2 4
4860    packssdw             m1, m2 ; 1 3  3 5
4861    punpckhwd            m2, m0, m1 ; 23 45
4862    punpcklwd            m0, m1     ; 01 23
; main loop: 2 output rows (4 source rows) per iteration
4863.dy2_w4_loop:
4864    movu                xm1, [srcq+ssq*0]
4865    movu                xm6, [srcq+r4   ]
4866    movu                xm3, [srcq+ssq*1]
4867    movu               xm11, [srcq+r6   ]
4868    vinserti128          m1, [srcq+ssq*2], 1 ; 6 8
4869    vinserti128          m6, [srcq+r11  ], 1
4870    vinserti128          m3, [srcq+ss3q ], 1 ; 7 9
4871    vinserti128         m11, [srcq+r13  ], 1
4872    lea                srcq, [srcq+ssq*4]
4873    pmaddwd              m4, m0, m7
4874    pmaddwd              m5, m2, m8
4875    pshufb               m1, m12
4876    pshufb               m3, m12
4877    pmaddwd              m1, m14
4878    pmaddwd              m3, m14
4879    mova                 m0, [rsp+0x00]
4880    pshufb               m6, m13
4881    pshufb              m11, m13
4882    pmaddwd              m6, m15
4883    pmaddwd             m11, m15
4884    paddd                m4, m5
4885    movd                xm5, [rsp+0x40]
4886    phaddd               m1, m6
4887    phaddd               m3, m11
4888    paddd                m1, m0
4889    paddd                m3, m0
4890    psrad                m1, xm5
4891    psrad                m3, xm5
4892    pslld                m3, 16
4893    pblendw              m1, m3, 0xaa     ; 67 89
4894    vperm2i128           m0, m2, m1, 0x21 ; 45 67
4895    paddd                m4, [rsp+0x20]
4896    mova                 m2, m1
4897    pmaddwd              m5, m0, m9
4898    pmaddwd              m6, m2, m10
4899    paddd                m4, m5
4900    paddd                m4, m6
4901%if isput
; put: round by the bitdepth-dependent shift, clip to pixel max, store
4902    psrad                m4, [rsp+0x48]
4903    vextracti128        xm5, m4, 1
4904    packusdw            xm4, xm5
4905    pminsw              xm4, [rsp+0x50]
4906    movq       [dstq+dsq*0], xm4
4907    movhps     [dstq+dsq*1], xm4
4908    lea                dstq, [dstq+dsq*2]
4909%else
; prep: fixed >>6 to intermediate precision, signed pack
4910    psrad                m4, 6
4911    vextracti128        xm5, m4, 1
4912    packssdw            xm4, xm5
4913    mova             [tmpq], xm4
4914    add                tmpq, 16
4915%endif
4916    sub                  hd, 2
4917    jg .dy2_w4_loop
4918    MC_8TAP_SCALED_RET
; assemble-time rename restoring the mapping changed by SWAP m13,m10 above
4919    SWAP                m10, m13
; dy2, w >= 8: all wide sizes share .dy2_w_start; [rsp+0xa0] = number of
; 8-column slices (w/8), tmp_stridem = prep output stride in bytes (w*2).
4920.dy2_w8:
4921    mov    dword [rsp+0xa0], 1
4922    movifprep   tmp_stridem, 16
4923    jmp .dy2_w_start
4924.dy2_w16:
4925    mov    dword [rsp+0xa0], 2
4926    movifprep   tmp_stridem, 32
4927    jmp .dy2_w_start
4928.dy2_w32:
4929    mov    dword [rsp+0xa0], 4
4930    movifprep   tmp_stridem, 64
4931    jmp .dy2_w_start
4932.dy2_w64:
4933    mov    dword [rsp+0xa0], 8
4934    movifprep   tmp_stridem, 128
4935    jmp .dy2_w_start
4936.dy2_w128:
4937    mov    dword [rsp+0xa0], 16
4938    movifprep   tmp_stridem, 256
; fall through to .dy2_w_start
; .dy2_w_start: shared wide path, vertical step 2. Structure parallels
; .dy1_w_start: outer per-slice loop (.dy2_hloop) plus a vertical loop
; (.dy2_vloop, below this window) consuming two source rows per output.
4939.dy2_w_start:
4940    SWAP                m10, m12, m1
4941    SWAP                m11, m7
4942    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
4943    mov                 myd, mym
4944%if isput
4945    movifnidn           dsm, dsq
4946    mova         [rsp+0xc0], xm7
4947%endif
; spill h-rounder / v-rounder / h-shift; reloaded every slice
4948    mova         [rsp+0x00], m10
4949    mova         [rsp+0x20], m13
4950    mova         [rsp+0x40], xm11
4951    shr                 t0d, 16
4952    sub                srcq, 6
; vertical filter: identity (64<<24) when the my fraction is 0
4953    shr                 myd, 6
4954    mov                 r4d, 64 << 24
4955    lea                 myd, [t1+myq]
4956    cmovnz              r4q, [base+subpel_filters+myq*8]
4957    pmaddwd              m8, [base+rescale_mul2]
4958    movd               xm15, t0d
; save loop-invariant state for .dy2_hloop_prep restarts
4959    mov          [rsp+0xa4], t0d
4960    mov          [rsp+0xa8], srcq
4961    mov          [rsp+0xb0], r0q ; dstq / tmpq
4962%if UNIX64
4963    mov                  hm, hd
4964%endif
4965    shl           dword dxm, 3 ; dx*8
4966    vpbroadcastd        m15, xm15
4967    paddd                m1, m8 ; mx+dx*[0-7]
4968    movq                xm0, r4q
4969    pmovsxbw            xm0, xm0
; packed int16 vertical coefficients, broadcast per-tap inside the loop
4970    mova         [rsp+0x50], xm0
4971    jmp .dy2_hloop
; per-slice restart: advance dst/tmp by 16 bytes, recompute mx for the
; next 8 columns from the saved state
4972.dy2_hloop_prep:
4973    dec    dword [rsp+0xa0]
4974    jz .ret
4975    add    qword [rsp+0xb0], 16
4976    mov                  hd, hm
4977    vpbroadcastd         m8, dxm
4978    vpbroadcastd         m6, [base+pd_0x3ff]
4979    paddd                m1, m8, [rsp+0x60]
4980    vpbroadcastd        m15, [rsp+0xa4]
4981    pxor                 m9, m9
4982    mov                srcq, [rsp+0xa8]
4983    mov                 r0q, [rsp+0xb0] ; dstq / tmpq
4984    mova                m10, [rsp+0x00]
4985    mova               xm11, [rsp+0x40]
4986.dy2_hloop:
4987    vpbroadcastq        xm2, [base+pq_0x40000000]
; 8 per-column horizontal filter indices from the mx fractions
4988    pand                 m5, m1, m6
4989    psrld                m5, 6
4990    paddd               m15, m5
4991    pcmpeqd              m5, m9
; scatter the 8 indices into scalar regs (r4..r13, rX)
4992    vextracti128        xm7, m15, 1
4993    movq                 r6, xm15
4994    pextrq               r9, xm15, 1
4995    movq                r11, xm7
4996    pextrq               rX, xm7, 1
4997    mov                 r4d, r6d
4998    shr                  r6, 32
4999    mov                 r7d, r9d
5000    shr                  r9, 32
5001    mov                r10d, r11d
5002    shr                 r11, 32
5003    mov                r13d, rXd
5004    shr                  rX, 32
5005    mova         [rsp+0x60], m1
; gather 8 distinct 8-tap horizontal filters, two per xmm
5006    movq               xm12, [base+subpel_filters+ r4*8]
5007    movq               xm13, [base+subpel_filters+ r6*8]
5008    movhps             xm12, [base+subpel_filters+ r7*8]
5009    movhps             xm13, [base+subpel_filters+ r9*8]
5010    movq               xm14, [base+subpel_filters+r10*8]
5011    movq               xm15, [base+subpel_filters+r11*8]
5012    movhps             xm14, [base+subpel_filters+r13*8]
5013    movhps             xm15, [base+subpel_filters+ rX*8]
; per-column integer source offsets (mx >> 10)
5014    psrld                m1, 10
5015    vextracti128        xm7, m1, 1
5016    vextracti128        xm6, m5, 1
5017    movq                 r6, xm1
5018    pextrq              r11, xm1, 1
5019    movq                 r9, xm7
5020    pextrq               rX, xm7, 1
5021    mov                 r4d, r6d
5022    shr                  r6, 32
5023    mov                r10d, r11d
5024    shr                 r11, 32
5025    mov                 r7d, r9d
5026    shr                  r9, 32
5027    mov                r13d, rXd
5028    shr                  rX, 32
; substitute the identity-position filter (pq_0x40000000) where needed
5029    pshufd              xm4, xm5, q2200
5030    pshufd              xm5, xm5, q3311
5031    pshufd              xm7, xm6, q2200
5032    pshufd              xm6, xm6, q3311
5033    pblendvb           xm12, xm2, xm4
5034    pblendvb           xm13, xm2, xm5
5035    pblendvb           xm14, xm2, xm7
5036    pblendvb           xm15, xm2, xm6
5037    pmovsxbw            m12, xm12
5038    pmovsxbw            m13, xm13
5039    pmovsxbw            m14, xm14
5040    pmovsxbw            m15, xm15
; horizontal filter of the initial 8-row vertical history
5041    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
5042    mova         [rsp+0x80], m0
5043    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
5044    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
5045    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
5046    mova                 m0, [rsp+0x80]
5047    vbroadcasti128       m7, [base+subpel_s_shuf8]
; broadcast the 4 vertical coefficient pairs saved at [rsp+0x50]
5048    vpbroadcastd         m8, [rsp+0x50]
5049    vpbroadcastd         m9, [rsp+0x54]
5050    vpbroadcastd        m10, [rsp+0x58]
5051    vpbroadcastd        m11, [rsp+0x5c]
5052    pshufb               m0, m7     ; 01a 01b
5053    pshufb               m1, m7     ; 23a 23b
5054    pshufb               m2, m7     ; 45a 45b
5055    pshufb               m3, m7     ; 67a 67b
5056.dy2_vloop:
5057    pmaddwd              m4, m0, m8
5058    pmaddwd              m5, m1, m9
5059    pmaddwd              m6, m2, m10
5060    pmaddwd              m7, m3, m11
5061    paddd                m4, [rsp+0x20]
5062    paddd                m6, m7
5063    paddd                m4, m5
5064    paddd                m4, m6
5065%if isput
5066    psrad                m4, [rsp+0x48]
5067    vextracti128        xm5, m4, 1
5068    packusdw            xm4, xm5
5069    pminsw              xm4, [rsp+0xc0]
5070    mova             [dstq], xm4
5071    add                dstq, dsm
5072%else
5073    psrad                m4, 6
5074    vextracti128        xm5, m4, 1
5075    packssdw            xm4, xm5
5076    mova             [tmpq], xm4
5077    add                tmpq, tmp_stridem
5078%endif
5079    dec                  hd
5080    jz .dy2_hloop_prep
5081    mova                 m0, m1
5082    mova                 m1, m2
5083    mova                 m2, m3
5084    movu                xm3, [srcq+ r4*2]
5085    movu                xm4, [srcq+ r6*2]
5086    movu                xm5, [srcq+ r7*2]
5087    movu                xm6, [srcq+ r9*2]
5088    vinserti128          m3, [srcq+r10*2], 1
5089    vinserti128          m4, [srcq+r11*2], 1
5090    vinserti128          m5, [srcq+r13*2], 1
5091    vinserti128          m6, [srcq+ rX*2], 1
5092    add                srcq, ssq
5093    pmaddwd              m3, m12
5094    pmaddwd              m4, m13
5095    pmaddwd              m5, m14
5096    pmaddwd              m6, m15
5097    phaddd               m3, m4
5098    phaddd               m5, m6
5099    phaddd               m3, m5
5100    movu                xm4, [srcq+ r4*2]
5101    movu                xm5, [srcq+ r6*2]
5102    movu                xm6, [srcq+ r7*2]
5103    movu                xm7, [srcq+ r9*2]
5104    vinserti128          m4, [srcq+r10*2], 1
5105    vinserti128          m5, [srcq+r11*2], 1
5106    vinserti128          m6, [srcq+r13*2], 1
5107    vinserti128          m7, [srcq+ rX*2], 1
5108    add                srcq, ssq
5109    pmaddwd              m4, m12
5110    pmaddwd              m5, m13
5111    pmaddwd              m6, m14
5112    pmaddwd              m7, m15
5113    phaddd               m4, m5
5114    phaddd               m6, m7
5115    mova                 m5, [rsp+0x00]
5116    movd                xm7, [rsp+0x40]
5117    phaddd               m4, m6
5118    paddd                m3, m5
5119    paddd                m4, m5
5120    psrad                m3, xm7
5121    psrad                m4, xm7
5122    pslld                m4, 16
5123    pblendw              m3, m4, 0xaa
5124    jmp .dy2_vloop
5125.ret:
5126    MC_8TAP_SCALED_RET 0
5127%undef isput
5128%undef isprep
5129%endmacro
5130
; Emit a bilin_scaled entry point for put/prep (%1) that reuses the shared
; 8tap_scaled implementation: t0d/t1d carry the packed (vertical << 16) |
; horizontal filter-type selectors, and 5*15 in both halves presumably
; selects the bilinear coefficients in the subpel filter table --
; confirm against the 8tap_scaled dispatch code.
%macro BILIN_SCALED_FN 1 ; put/prep
cglobal %1_bilin_scaled_16bpc
    mov                 t0d, (5*15 << 16) | 5*15
    mov                 t1d, t0d
    ; tail-call the shared scaled MC implementation
    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro
%endmacro
5137
; Pick scratch registers for the filter-type selectors that are safe
; under the respective calling convention (Win64 vs SysV).
%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

; Instantiate every put filter combination. The variants with a 4th
; argument share code by jumping into the named base symbol; the plain
; regular/regular variant (no 4th argument) falls through into the
; implementation emitted by MC_8TAP_SCALED just below.
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put
5156
; Scratch-register choice for the prep variants (different from the put
; side because prep has a different argument layout).
%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Instantiate every prep filter combination; same sharing scheme as the
; put side above, with regular/regular falling through into the body
; emitted by MC_8TAP_SCALED below.
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep
5175
; Vertical 8-tap warp filter for one output row.
; %1 = accumulator register number receiving the 32-bit sums;
; %2-%5 = register numbers holding interleaved row pairs (01 23 45 67).
; Per column, the filter index is my >> 10, with my advanced by delta
; between columns and by gamma once at the end of the row. Eight
; filters are fetched as four xmm pairs (a/e, b/f, c/g, d/h) and
; interleaved to match the row-pair layout. On exit the row-pair
; registers are shifted down (%2 <- %3 <- %4 <- %5) so the caller only
; needs to supply the newest pair next time. Uses m8-m9 as scratch and
; m11 as an all-zero register (set up in warp_affine_8x8 .main).
%macro WARP_V 5 ; dst, 01, 23, 45, 67
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                xm8, [filterq+myq  *8]
    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+deltaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                xm9, [filterq+myq  *8]
    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+gammaq]       ; my += gamma
    punpcklwd            m8, m0
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd            m0, m9, m0
    punpckldq            m9, m8, m0
    punpckhdq            m0, m8, m0
    ; interleaving with zeros in the low byte widens the signed 8-bit
    ; coefficients to words scaled by 256 ("<< 8")
    punpcklbw            m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    punpckhbw            m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    pmaddwd             m%2, m8
    pmaddwd              m9, m%3
    punpcklbw            m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    punpckhbw            m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd              m8, m%4
    pmaddwd              m0, m%5
    paddd                m9, m%2
    mova                m%2, m%3    ; shift row pairs down for the next call
    paddd                m0, m8
    mova                m%3, m%4
    mova                m%4, m%5
    paddd               m%1, m0, m9
%endmacro
5220
; Temp-output (prep) variant of the 8x8 warp: reuses the filtering core
; of warp_affine_8x8_16bpc via its .main/.main2 entry points and stores
; two rows of 16-bit intermediates per iteration instead of clamped
; pixels. tsq is applied *2/*4, i.e. it is in units of int16_t here.
cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
    mov                 r6d, r7m ; pixel_max (stack arg shared with the put variant)
    lea                  r9, [$$]
    shr                 r6d, 11  ; 0 for 10bpc (1023>>11), 1 for 12bpc (4095>>11)
    vpbroadcastd        m13, [r9-$$+warp8x8_shift+r6*4] ; per-bitdepth shift
    vpbroadcastd        m14, [warp8x8t_rnd]             ; rounding bias
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
    jmp .start
.loop:
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
    lea                tmpq, [tmpq+tsq*4]
.start:
    ; round the two accumulated rows down to 16-bit intermediates
    paddd                m7, m14
    paddd                m0, m14
    psrad                m7, 15
    psrad                m0, 15
    packssdw             m7, m0
    vpermq               m7, m7, q3120 ; restore row order after in-lane pack
    mova         [tmpq+tsq*0], xm7
    vextracti128 [tmpq+tsq*2], m7, 1
    dec                 r4d             ; r4d = remaining row pairs (set in .main)
    jg .loop
.end:
    RET
5245
; 8x8 affine warp, put variant: horizontal 8-tap filtering per source
; row (.h), vertical 8-tap filtering across 8 buffered rows (WARP_V),
; then rounding/clamping to [0, pixel_max]. abcd points to the packed
; int16 affine deltas {alpha, beta, delta, gamma}; mx/my are the 16.16?
; filter positions advanced by alpha/delta per column and beta/gamma
; per row (exact fixed-point format not visible here -- the >>10 in
; .h/WARP_V extracts the filter index).
cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
                                          alpha, beta, filter, tmp1, delta, \
                                          my, gamma
    mov                 r6d, r7m ; pixel_max
    lea             filterq, [$$]
    shr                 r6d, 11  ; 0 for 10bpc, 1 for 12bpc
    vpbroadcastd        m13, [filterq-$$+warp8x8_shift+r6*4]
    vpbroadcastd        m14, [filterq-$$+warp8x8_rnd  +r6*4]
    vpbroadcastw        m15, r7m ; pixel_max
    call .main
    jmp .start
.loop:
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
    ; final rounding: >>16 then pmulhrsw with the per-bitdepth constant,
    ; clamped to pixel_max
    psrad                m7, 16
    psrad                m0, 16
    packusdw             m7, m0
    pmulhrsw             m7, m14
    pminsw               m7, m15
    vpermq               m7, m7, q3120 ; restore row order after in-lane pack
    mova         [dstq+dsq*0], xm7
    vextracti128 [dstq+dsq*1], m7, 1
    dec                 r4d
    jg .loop
.end:
    RET
ALIGN function_align
.main:
    ; Setup + prime the first 7 source rows (pairs 01..56) so the first
    ; WARP_V call has a full 8-row window.
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov               abcdq, r5m
    mov                 mxd, r6m
%endif
    movsx            alphad, word [abcdq+2*0]
    movsx             betad, word [abcdq+2*1]
    vpbroadcastd        m12, [pd_32768] ; horizontal rounding bias
    pxor                m11, m11        ; m11 = 0 (for byte->word widening)
    add             filterq, mc_warp_filter-$$
    lea               tmp1q, [ssq*3]
    add                 mxd, 512+(64<<10)
    lea               tmp2d, [alphaq*3]
    sub                srcq, tmp1q    ; src -= src_stride*3
    sub               betad, tmp2d    ; beta -= alpha*3
    mov                 myd, r7m
    call .h
    psrld                m1, m0, 16
    call .h
    pblendw              m1, m0, 0xaa ; 01
    psrld                m2, m0, 16
    call .h
    pblendw              m2, m0, 0xaa ; 12
    psrld                m3, m0, 16
    call .h
    pblendw              m3, m0, 0xaa ; 23
    psrld                m4, m0, 16
    call .h
    pblendw              m4, m0, 0xaa ; 34
    psrld                m5, m0, 16
    call .h
    pblendw              m5, m0, 0xaa ; 45
    psrld                m6, m0, 16
    call .h
    pblendw              m6, m0, 0xaa ; 56
    movsx            deltad, word [abcdq+2*2]
    movsx            gammad, word [abcdq+2*3]
    add                 myd, 512+(64<<10)
    mov                 r4d, 4       ; 4 iterations x 2 rows = 8 output rows
    lea               tmp1d, [deltaq*3]
    sub              gammad, tmp1d    ; gamma -= delta*3
.main2:
    ; Produce two output rows (m7 and m0) per call.
    call .h
    psrld                m7, m6, 16
    pblendw              m7, m0, 0xaa ; 67
    WARP_V                7, 1, 3, 5, 7
    call .h
    psrld               m10, m5, 16
    pblendw             m10, m0, 0xaa ; 78
    WARP_V                0, 2, 4, 6, 10
    ret
ALIGN function_align
.h:
    ; Horizontal 8-tap filter of one source row: 8 output columns, each
    ; with its own filter (index mx >> 10, mx advanced by alpha per
    ; column, by beta at the end of the row). Returns dword results in
    ; m0 with the rounded 14-bit value in the upper 16 bits.
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    movu               xm10, [srcq-6]
    vinserti128         m10, [srcq+2], 1
    shr                 mxd, 10 ; 0
    shr               tmp1d, 10 ; 4
    movq                xm0, [filterq+mxq  *8]
    vinserti128          m0, [filterq+tmp1q*8], 1
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+alphaq*1]
    movu                xm8, [srcq-4]
    vinserti128          m8, [srcq+4], 1
    shr               tmp2d, 10 ; 1
    shr               tmp1d, 10 ; 5
    movq                xm9, [filterq+tmp2q*8]
    vinserti128          m9, [filterq+tmp1q*8], 1
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    shr                 mxd, 10 ; 2
    shr               tmp1d, 10 ; 6
    punpcklbw            m0, m11, m0 ; widen filter bytes to words << 8
    pmaddwd              m0, m10
    movu               xm10, [srcq-2]
    vinserti128         m10, [srcq+6], 1
    punpcklbw            m9, m11, m9
    pmaddwd              m9, m8
    movq                xm8, [filterq+mxq  *8]
    vinserti128          m8, [filterq+tmp1q*8], 1
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+betaq] ; mx += beta
    phaddd               m0, m9 ; 0 1   4 5
    movu                xm9, [srcq+0]
    vinserti128          m9, [srcq+8], 1
    shr               tmp2d, 10 ; 3
    shr               tmp1d, 10 ; 7
    punpcklbw            m8, m11, m8
    pmaddwd              m8, m10
    movq               xm10, [filterq+tmp2q*8]
    vinserti128         m10, [filterq+tmp1q*8], 1
    punpcklbw           m10, m11, m10
    pmaddwd              m9, m10
    add                srcq, ssq
    phaddd               m8, m9 ; 2 3   6 7
    phaddd               m0, m8 ; 0 1 2 3   4 5 6 7
    vpsllvd              m0, m13 ; per-bitdepth scale
    paddd                m0, m12 ; rounded 14-bit result in upper 16 bits of dword
    ret
5375
; Shared store loop for the bidirectional compound functions (avg,
; w_avg, mask). Expects on entry: wq = jump-table target for the block
; width, hd = height, strideq = dst stride, and a local .main that
; produces 128 output pixels in m0-m3 (i.e. 4 rows for w<=16, 2 rows
; for w32, 1 row for w64, half a row for w128) and advances the
; source pointers.
%macro BIDIR_FN 0
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    ; each ymm register holds 4 rows of 4 pixels
    movq   [dstq          ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti128        xm0, m0, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm1
    movhps [dstq+strideq*1], xm1
    vextracti128        xm1, m1, 1
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
    cmp                  hd, 8
    je .ret
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm2
    movhps [dstq+strideq*1], xm2
    vextracti128        xm2, m2, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm3
    movhps [dstq+strideq*1], xm3
    vextracti128        xm3, m3, 1
    movq   [dstq+strideq*2], xm3
    movhps [dstq+stride3q ], xm3
.ret:
    RET
.w8:
    ; each ymm register holds 2 rows of 8 pixels
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    cmp                  hd, 4
    jne .w8_loop_start
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
.w8_loop_start:
    lea                dstq, [dstq+strideq*4]
    mova         [dstq+strideq*0], xm2
    vextracti128 [dstq+strideq*1], m2, 1
    mova         [dstq+strideq*2], xm3
    vextracti128 [dstq+stride3q ], m3, 1
    sub                  hd, 8
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w16:
    ; one row per register
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w32:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    ; two .main calls per row of 128 pixels
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    call .main
    mova        [dstq+32*4], m0
    mova        [dstq+32*5], m1
    mova        [dstq+32*6], m2
    mova        [dstq+32*7], m3
    dec                  hd
    jg .w128_loop
    RET
%endmacro
5484
; Scratch register for the pixel_max stack argument in avg/w_avg,
; chosen per calling convention.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
5490
; Plain bidirectional average: dst = round((tmp1 + tmp2) >> n) clamped
; to the pixel range, implemented with a saturating add followed by a
; max/sub against bidir_rnd and a high multiply by bidir_mul (both
; selected per bitdepth via pixel_max >> 11).
cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx2_table
    lea                  r6, [avg_avx2_table]
    tzcnt                wd, wm
    mov                 t0d, r6m ; pixel_max
    movsxd               wq, [r6+wq*4]
    shr                 t0d, 11  ; 0 for 10bpc, 1 for 12bpc
    vpbroadcastd         m4, [base+bidir_rnd+t0*4]
    vpbroadcastd         m5, [base+bidir_mul+t0*4]
    movifnidn            hd, hm
    add                  wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 128 averaged pixels in m0-m3 and advance tmp1/tmp2.
    mova                 m0, [tmp1q+32*0]
    paddsw               m0, [tmp2q+32*0]
    mova                 m1, [tmp1q+32*1]
    paddsw               m1, [tmp2q+32*1]
    mova                 m2, [tmp1q+32*2]
    paddsw               m2, [tmp2q+32*2]
    mova                 m3, [tmp1q+32*3]
    paddsw               m3, [tmp2q+32*3]
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    ; clamp-below + bias removal + scale: max(x, rnd) - rnd, then
    ; pmulhw by the per-bitdepth multiplier
    pmaxsw               m0, m4
    pmaxsw               m1, m4
    pmaxsw               m2, m4
    pmaxsw               m3, m4
    psubsw               m0, m4
    psubsw               m1, m4
    psubsw               m2, m4
    psubsw               m3, m4
    pmulhw               m0, m5
    pmulhw               m1, m5
    pmulhw               m2, m5
    pmulhw               m3, m5
    ret
5528
; Weighted bidirectional average:
;   dst = clip((tmp2*weight + tmp1*(16-weight) + rnd) >> sh)
; The two weights are packed as words in m6 for pmaddwd; for 12bpc
; (pixel_max bit 11 set) the weights are pre-shifted left by 2 so the
; fixed >>8 below yields the right precision. m7 holds the rounding
; offset derived from pd_65538 and pixel_max; m8 = pixel_max clamp.
cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
    lea                  r6, [w_avg_avx2_table]
    tzcnt                wd, wm
    mov                 t0d, r6m ; weight
    vpbroadcastw         m8, r7m ; pixel_max
    vpbroadcastd         m7, [r6-w_avg_avx2_table+pd_65538]
    movsxd               wq, [r6+wq*4]
    paddw                m7, m8
    add                  wq, r6
    lea                 r6d, [t0-16]
    shl                 t0d, 16
    sub                 t0d, r6d ; 16-weight, weight
    pslld                m7, 7
    rorx                r6d, t0d, 30 ; << 2
    test          dword r7m, 0x800   ; pixel_max bit 11 => 12bpc
    cmovz               r6d, t0d     ; 10bpc: keep unshifted weights
    movifnidn            hd, hm
    movd                xm6, r6d
    vpbroadcastd         m6, xm6
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 128 weighted pixels in m0-m3; m4/m5 are scratch.
    mova                 m4, [tmp1q+32*0]
    mova                 m0, [tmp2q+32*0]
    punpckhwd            m5, m0, m4   ; interleave tmp2/tmp1 for pmaddwd
    punpcklwd            m0, m4
    mova                 m4, [tmp1q+32*1]
    mova                 m1, [tmp2q+32*1]
    pmaddwd              m5, m6       ; tmp2*w + tmp1*(16-w)
    pmaddwd              m0, m6
    paddd                m5, m7
    paddd                m0, m7
    psrad                m5, 8
    psrad                m0, 8
    packusdw             m0, m5
    punpckhwd            m5, m1, m4
    punpcklwd            m1, m4
    mova                 m4, [tmp1q+32*2]
    mova                 m2, [tmp2q+32*2]
    pmaddwd              m5, m6
    pmaddwd              m1, m6
    paddd                m5, m7
    paddd                m1, m7
    psrad                m5, 8
    psrad                m1, 8
    packusdw             m1, m5
    punpckhwd            m5, m2, m4
    punpcklwd            m2, m4
    mova                 m4, [tmp1q+32*3]
    mova                 m3, [tmp2q+32*3]
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    pmaddwd              m5, m6
    pmaddwd              m2, m6
    paddd                m5, m7
    paddd                m2, m7
    psrad                m5, 8
    psrad                m2, 8
    packusdw             m2, m5
    punpckhwd            m5, m3, m4
    punpcklwd            m3, m4
    pmaddwd              m5, m6
    pmaddwd              m3, m6
    paddd                m5, m7
    paddd                m3, m7
    psrad                m5, 8
    psrad                m3, 8
    packusdw             m3, m5
    ; clamp to pixel_max (packusdw already clamped below at 0)
    pminsw               m0, m8
    pminsw               m1, m8
    pminsw               m2, m8
    pminsw               m3, m8
    ret
5602
; Mask-blended compound: dst = clip((tmp1*m + tmp2*(64-m)) >> sh) with
; an 8-bit per-pixel mask from maskq. Uses the same bidir_rnd/bidir_mul
; clamp-and-scale trick as avg_16bpc for the final range reduction.
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx2_table
    lea                  r7, [mask_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11  ; 0 for 10bpc, 1 for 12bpc
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m8, [base+pw_64]
    vpbroadcastd         m9, [base+bidir_rnd+r6*4]
    vpbroadcastd        m10, [base+bidir_mul+r6*4]
    mov               maskq, maskmp
    add                  wq, r7
    BIDIR_FN
ALIGN function_align
.main:
; Blend one 32-pixel group: m%1 gets the blended output; m4-m7 scratch.
%macro MASK 1
    pmovzxbw             m5, [maskq+16*%1]
    mova                m%1, [tmp1q+32*%1]
    mova                 m6, [tmp2q+32*%1]
    punpckhwd            m4, m%1, m6
    punpcklwd           m%1, m6
    psubw                m7, m8, m5
    punpckhwd            m6, m5, m7 ; m, 64-m
    punpcklwd            m5, m7
    pmaddwd              m4, m6     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd             m%1, m5
    psrad                m4, 5
    psrad               m%1, 5
    packssdw            m%1, m4
    ; clamp-below + bias removal + per-bitdepth scale
    pmaxsw              m%1, m9
    psubsw              m%1, m9
    pmulhw              m%1, m10
%endmacro
    MASK                  0
    MASK                  1
    MASK                  2
    MASK                  3
    add               maskq, 16*4
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    ret
5645
; w_mask 4:2:0: blends tmp1/tmp2 with a mask derived from their
; per-pixel absolute difference (W_MASK below) and additionally writes
; that mask out downsampled 2x2: four mask values are summed, rounded
; with (2 - sign) and shifted right by 2. Width-specific paths differ
; only in how the per-row masks produced by .main are paired up.
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea                  r7, [w_mask_420_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movd                xm0, r7m ; sign
    movifnidn            hd, hm
    shr                 r6d, 11  ; 0 for 10bpc, 1 for 12bpc
    movsxd               wq, [r7+wq*4]
    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd        m11, [base+pw_64]
    vpbroadcastd        m12, [base+bidir_rnd+r6*4]
    vpbroadcastd        m13, [base+bidir_mul+r6*4]
    movd               xm14, [base+pw_2]
    mov               maskq, maskmp
    psubw              xm14, xm0
    vpbroadcastw        m14, xm14 ; m14 = 2 - sign (2x2 rounding bias)
    add                  wq, r7
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    ; sum mask pairs horizontally, round, and pack 16 mask bytes
    phaddd               m4, m5
    paddw                m4, m14
    psrlw                m4, 2
    packuswb             m4, m4
    vextracti128        xm5, m4, 1
    punpcklwd           xm4, xm5
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti128        xm0, m0, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    mova            [maskq], xm4
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti128        xm1, m1, 1
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti128        xm2, m2, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm3
    movhps [dstq+strideq*1], xm3
    vextracti128        xm3, m3, 1
    movq   [dstq+strideq*2], xm3
    movhps [dstq+stride3q ], xm3
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w8:
    ; add vertically-adjacent mask rows (cross-lane pair) then round
    vperm2i128           m6, m4, m5, 0x21
    vpblendd             m4, m5, 0xf0
    paddw                m4, m14
    paddw                m4, m6
    psrlw                m4, 2
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    mova            [maskq], xm4
    sub                  hd, 8
    jl .w8_end
    lea                dstq, [dstq+strideq*4]
    mova         [dstq+strideq*0], xm2
    vextracti128 [dstq+strideq*1], m2, 1
    mova         [dstq+strideq*2], xm3
    vextracti128 [dstq+stride3q ], m3, 1
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w16:
    ; pair up the row halves with qword shuffles before rounding
    punpcklqdq           m6, m4, m5
    punpckhqdq           m4, m5
    paddw                m6, m14
    paddw                m4, m6
    psrlw                m4, 2
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    pshufd              xm4, xm4, q3120
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    mova            [maskq], xm4
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w32:
    ; first two rows: keep the partial sums in m15 across the .main call
    paddw                m4, m14
    paddw                m4, m5
    psrlw               m15, m4, 2
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    call .main
    mova                 m6, [deint_shuf]
    paddw                m4, m14
    paddw                m4, m5
    psrlw                m4, 2
    packuswb            m15, m4
    vpermd               m4, m6, m15 ; deinterleave packed mask bytes
    mova [dstq+strideq*2+32*0], m0
    mova [dstq+strideq*2+32*1], m1
    mova [dstq+stride3q +32*0], m2
    mova [dstq+stride3q +32*1], m3
    mova            [maskq], m4
    sub                  hd, 4
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w64:
    paddw                m4, m14
    paddw               m15, m14, m5
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*0+32*2], m2
    mova [dstq+strideq*0+32*3], m3
    ; park the partial mask sums in the output buffer until the second
    ; half of the row is done
    mova            [maskq], m4 ; no available registers
    call .main
    paddw                m4, [maskq]
    mova                 m6, [deint_shuf]
    paddw                m5, m15
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5 ; 0 2 4 6   1 3 5 7
    vpermd               m4, m6, m4
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m2
    mova [dstq+strideq*1+32*3], m3
    mova            [maskq], m4
    sub                  hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 64
.w128:
    ; four .main calls per 2 output rows; partial mask sums are parked
    ; in [maskq] and in the not-yet-written second dst row
    paddw                m4, m14
    paddw                m5, m14
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*0+32*2], m2
    mova [dstq+strideq*0+32*3], m3
    mova       [maskq+32*0], m4
    mova     [dstq+strideq], m5 ; borrow next dst row as scratch
    call .main
    paddw                m4, m14
    paddw               m15, m14, m5
    mova [dstq+strideq*0+32*4], m0
    mova [dstq+strideq*0+32*5], m1
    mova [dstq+strideq*0+32*6], m2
    mova [dstq+strideq*0+32*7], m3
    mova       [maskq+32*1], m4
    call .main
    paddw                m4, [maskq+32*0]
    paddw                m5, [dstq+strideq] ; reclaim scratch before overwrite
    mova                 m6, [deint_shuf]
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m6, m4
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m2
    mova [dstq+strideq*1+32*3], m3
    mova       [maskq+32*0], m4
    call .main
    paddw                m4, [maskq+32*1]
    mova                 m6, [deint_shuf]
    paddw                m5, m15
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m6, m4
    mova [dstq+strideq*1+32*4], m0
    mova [dstq+strideq*1+32*5], m1
    mova [dstq+strideq*1+32*6], m2
    mova [dstq+strideq*1+32*7], m3
    mova       [maskq+32*1], m4
    sub                  hd, 2
    jg .w128_loop
    RET
ALIGN function_align
.main:
; Blend one 32-pixel group from |tmp1 - tmp2|:
;   64-m = (pw_27615 - |diff|) >> 10 (saturating), m = 64 - that.
; On return m%1 = clamped blended pixels, m%2 = per-pixel mask m as
; words (summed up by the callers for the 2x2-downsampled output).
%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
    mova                m%1, [tmp1q+32*%1]
    mova                m%2, [tmp2q+32*%1]
    punpcklwd            m8, m%2, m%1
    punpckhwd            m9, m%2, m%1
    psubsw              m%1, m%2
    pabsw               m%1, m%1
    psubusw              m7, m10, m%1
    psrlw                m7, 10       ; 64-m
    psubw               m%2, m%3, m7  ; m
    punpcklwd           m%1, m7, m%2
    punpckhwd            m7, m%2
    pmaddwd             m%1, m8       ; tmp2*(64-m) + tmp1*m
    pmaddwd              m7, m9
    psrad               m%1, 5
    psrad                m7, 5
    packssdw            m%1, m7
    ; clamp-below + bias removal + per-bitdepth scale
    pmaxsw              m%1, m%4
    psubsw              m%1, m%4
    pmulhw              m%1, m%5
%endmacro
    W_MASK                0, 4
    W_MASK                1, 5
    phaddw               m4, m5 ; horizontally sum adjacent mask pairs
    W_MASK                2, 5
    W_MASK                3, 6
    phaddw               m5, m6
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    ret
5888
;-----------------------------------------------------------------------
; w_mask for 4:2:2 chroma subsampling, 16 bpc, AVX2.
; Blends the two bidir intermediate buffers (tmp1/tmp2) into dst and
; emits one mask byte per horizontal pixel pair:
;   mask = (m0 + m1 + 1 - sign) >> 1
; where m is the per-pixel weight computed by W_MASK (defined above).
; Each .main call produces 64 blended pixels in m0-m3 plus 32 mask
; bytes, and advances tmp1q/tmp2q/maskq.
;-----------------------------------------------------------------------
cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
    lea                  r7, [w_mask_422_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    vpbroadcastb        m14, r7m ; sign
    movifnidn            hd, hm
    shr                 r6d, 11  ; 0 for 10-bit, 1 for 12-bit (pixel_max >> 11)
    movsxd               wq, [r7+wq*4]
    vpbroadcastd        m10, [base+pw_27615]       ; W_MASK difference clamp
    vpbroadcastd        m11, [base+pw_64]
    vpbroadcastd        m12, [base+bidir_rnd+r6*4] ; bitdepth-dependent rounding
    vpbroadcastd        m13, [base+bidir_mul+r6*4] ; bitdepth-dependent scaling
    mova                m15, [base+deint_shuf]
    mov               maskq, maskmp
    add                  wq, r7
    call .main                   ; first batch of blended pixels in m0-m3
    lea            stride3q, [strideq*3]
    jmp                  wq      ; dispatch on log2(width)
.w4:
    ; one .main call covers up to 16 rows of 4 pixels (m0-m3)
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti128        xm0, m0, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti128        xm1, m1, 1
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti128        xm2, m2, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm3
    movhps [dstq+strideq*1], xm3
    vextracti128        xm3, m3, 1
    movq   [dstq+strideq*2], xm3
    movhps [dstq+stride3q ], xm3
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w8:
    ; 8 rows of 8 pixels per iteration (two ymm halves per pair of rows)
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    sub                  hd, 8
    jl .w8_end
    lea                dstq, [dstq+strideq*4]
    mova         [dstq+strideq*0], xm2
    vextracti128 [dstq+strideq*1], m2, 1
    mova         [dstq+strideq*2], xm3
    vextracti128 [dstq+stride3q ], m3, 1
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w16:
    ; 4 rows of 16 pixels per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w32:
    ; 2 rows of 32 pixels per iteration
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    ; 1 row of 64 pixels per iteration
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    ; 1 row of 128 pixels per iteration (two .main calls)
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    call .main
    mova        [dstq+32*4], m0
    mova        [dstq+32*5], m1
    mova        [dstq+32*6], m2
    mova        [dstq+32*7], m3
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Blend 4 ymm rows of pixels (m0-m3); W_MASK leaves the per-pixel
    ; weights m in m4/m5/m6, and phaddw sums each horizontal pair.
    W_MASK                0, 4
    W_MASK                1, 5
    phaddw               m4, m5
    W_MASK                2, 5
    W_MASK                3, 6
    phaddw               m5, m6
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    packuswb             m4, m5      ; pair sums -> bytes
    pxor                 m5, m5
    psubb                m4, m14     ; subtract the sign bias
    pavgb                m4, m5      ; (sum - sign + 1) >> 1
    vpermd               m4, m15, m4 ; undo phaddw/packuswb lane interleave
    mova            [maskq], m4
    add               maskq, 32
    ret
6022
;-----------------------------------------------------------------------
; w_mask for 4:4:4, 16 bpc, AVX2.
; Blends tmp1/tmp2 into dst and stores the per-pixel W_MASK weights
; directly as mask bytes (no subsampling, no sign argument is read).
; Only 11 vector regs are reserved here, so the W_MASK constants live
; in m4-m6 and are passed to the macro explicitly.  Each .main call
; produces 32 blended pixels in m0/m1 plus 32 mask bytes.
;-----------------------------------------------------------------------
cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
    lea                  r7, [w_mask_444_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11  ; 0 for 10-bit, 1 for 12-bit
    movsxd               wq, [r7+wq*4]
    vpbroadcastd        m10, [base+pw_27615]       ; W_MASK difference clamp
    vpbroadcastd         m4, [base+pw_64]
    vpbroadcastd         m5, [base+bidir_rnd+r6*4] ; bitdepth-dependent rounding
    vpbroadcastd         m6, [base+bidir_mul+r6*4] ; bitdepth-dependent scaling
    mov               maskq, maskmp
    add                  wq, r7
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq  ; dispatch on log2(width)
.w4:
    ; up to 8 rows of 4 pixels per .main call; h == 16 needs a second call
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti128        xm0, m0, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti128        xm1, m1, 1
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
    je .w4_end
    call .main
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti128        xm0, m0, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti128        xm1, m1, 1
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w8:
    ; 4 rows of 8 pixels per iteration
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    sub                  hd, 4
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16:
    ; 2 rows of 16 pixels per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    ; 1 row of 32 pixels per iteration
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    ; 1 row of 64 pixels per iteration (two .main calls)
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    call .main
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    ; 1 row of 128 pixels per iteration (four .main calls)
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    call .main
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    call .main
    mova        [dstq+32*4], m0
    mova        [dstq+32*5], m1
    call .main
    mova        [dstq+32*6], m0
    mova        [dstq+32*7], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Blend two ymm rows (m0/m1); weights land in m2/m3 and are packed
    ; to bytes for the full-resolution mask.
    W_MASK                0, 2, 4, 5, 6
    W_MASK                1, 3, 4, 5, 6
    packuswb             m2, m3
    vpermq               m2, m2, q3120 ; restore row order after packuswb
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    mova            [maskq], m2
    add               maskq, 32
    ret
6141
6142; (a * (64 - m) + b * m + 32) >> 6
6143; = (((b - a) * m + 32) >> 6) + a
6144; = (((b - a) * (m << 9) + 16384) >> 15) + a
6145;   except m << 9 overflows int16_t when m == 64 (which is possible),
6146;   but if we negate m it works out (-64 << 9 == -32768).
6147; = (((a - b) * (m * -512) + 16384) >> 15) + a
;-----------------------------------------------------------------------
; blend_16bpc: dst = blend(dst, tmp, mask) using the identity derived
; in the comment above: the 8-bit mask is widened with pmovzxbw,
; premultiplied by pw_m512, and applied with pmulhrsw to (dst - tmp),
; i.e. dst += ((dst - tmp) * (m * -512) + 16384) >> 15.
; r6 doubles as stride*3 after the jump table base is consumed.
;-----------------------------------------------------------------------
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
    lea                  r6, [blend_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]
    movifnidn         maskq, maskmp
    vpbroadcastd         m6, [base+pw_m512]
    add                  wq, r6
    lea                  r6, [dsq*3]  ; r6 = stride*3 from here on
    jmp                  wq           ; dispatch on log2(width)
.w4:
    ; 4 rows of 4 pixels per iteration, gathered into one ymm
    pmovzxbw             m3, [maskq]
    movq                xm0, [dstq+dsq*0]
    movhps              xm0, [dstq+dsq*1]
    vpbroadcastq         m1, [dstq+dsq*2]
    vpbroadcastq         m2, [dstq+r6   ]
    vpblendd             m0, m1, 0x30
    vpblendd             m0, m2, 0xc0
    psubw                m1, m0, [tmpq] ; dst - tmp
    add               maskq, 16
    add                tmpq, 32
    pmullw               m3, m6         ; mask * -512
    pmulhrsw             m1, m3
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    movq       [dstq+dsq*2], xm1
    movhps     [dstq+r6   ], xm1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    ; 4 rows of 8 pixels per iteration
    pmovzxbw             m4, [maskq+16*0]
    pmovzxbw             m5, [maskq+16*1]
    mova                xm0, [dstq+dsq*0]
    vinserti128          m0, [dstq+dsq*1], 1
    mova                xm1, [dstq+dsq*2]
    vinserti128          m1, [dstq+r6   ], 1
    psubw                m2, m0, [tmpq+32*0] ; dst - tmp
    psubw                m3, m1, [tmpq+32*1]
    add               maskq, 16*2
    add                tmpq, 32*2
    pmullw               m4, m6              ; mask * -512
    pmullw               m5, m6
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    mova         [dstq+dsq*2], xm1
    vextracti128 [dstq+r6   ], m1, 1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    ; 2 rows of 16 pixels per iteration
    pmovzxbw             m4, [maskq+16*0]
    pmovzxbw             m5, [maskq+16*1]
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 32*0] ; dst - tmp
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 32*1]
    add               maskq, 16*2
    add                tmpq, 32*2
    pmullw               m4, m6               ; mask * -512
    pmullw               m5, m6
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:
    ; 1 row of 32 pixels per iteration
    pmovzxbw             m4, [maskq+16*0]
    pmovzxbw             m5, [maskq+16*1]
    mova                 m0,     [dstq+32*0]
    psubw                m2, m0, [tmpq+32*0] ; dst - tmp
    mova                 m1,     [dstq+32*1]
    psubw                m3, m1, [tmpq+32*1]
    add               maskq, 16*2
    add                tmpq, 32*2
    pmullw               m4, m6              ; mask * -512
    pmullw               m5, m6
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    add                dstq, dsq
    dec                  hd
    jg .w32
    RET
6249
;-----------------------------------------------------------------------
; blend_v_16bpc: vertical-edge OBMC blend.  The weight depends only on
; the column and comes from obmc_masks_avx2 (weights premultiplied by
; -512 -- see the table comment at the top of the file), applied with
; pmulhrsw to (dst - tmp) as in blend_16bpc above.
; XMM registers suffice for w2/w4; w8 and up switch to YMM.
;-----------------------------------------------------------------------
INIT_XMM avx2
cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
%define base r5-blend_v_avx2_table
    lea                  r5, [blend_v_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  wq  ; dispatch on log2(width)
.w2:
    vpbroadcastd         m2, [base+obmc_masks_avx2+2*2] ; w=2 column weights
.w2_loop:
    ; 2 rows of 2 pixels per iteration
    movd                 m0, [dstq+dsq*0]
    pinsrd               m0, [dstq+dsq*1], 1
    movq                 m1, [tmpq]
    add                tmpq, 4*2
    psubw                m1, m0, m1 ; dst - tmp
    pmulhrsw             m1, m2
    paddw                m0, m1
    movd       [dstq+dsq*0], m0
    pextrd     [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    vpbroadcastq         m2, [base+obmc_masks_avx2+4*2] ; w=4 column weights
.w4_loop:
    ; 2 rows of 4 pixels per iteration
    movq                 m0, [dstq+dsq*0]
    movhps               m0, [dstq+dsq*1]
    psubw                m1, m0, [tmpq] ; dst - tmp
    add                tmpq, 8*2
    pmulhrsw             m1, m2
    paddw                m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
INIT_YMM avx2
.w8:
    vbroadcasti128       m2, [base+obmc_masks_avx2+8*2] ; w=8 column weights
.w8_loop:
    ; 2 rows of 8 pixels per iteration
    mova                xm0, [dstq+dsq*0]
    vinserti128          m0, [dstq+dsq*1], 1
    psubw                m1, m0, [tmpq] ; dst - tmp
    add                tmpq, 16*2
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    mova                 m4, [base+obmc_masks_avx2+16*2] ; w=16 column weights
.w16_loop:
    ; 2 rows of 16 pixels per iteration
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 32*0] ; dst - tmp
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 32*1]
    add                tmpq, 32*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
.w32:
%if WIN64
    ; xmm6/xmm7 are callee-saved on Win64 and only 6 regs were declared
    movaps         [rsp+ 8], xmm6
    movaps         [rsp+24], xmm7
%endif
    mova                 m6, [base+obmc_masks_avx2+32*2] ; columns 0-15
    vbroadcasti128       m7, [base+obmc_masks_avx2+32*3] ; columns 16-23
.w32_loop:
    ; 2 rows per iteration; only 24 of the 32 columns are blended --
    ; the remaining columns are never loaded here (presumably their
    ; mask weights are zero; confirm against the obmc_masks table)
    mova                 m0,     [dstq+dsq*0+32*0]
    psubw                m3, m0, [tmpq      +32*0]
    mova                xm2,     [dstq+dsq*0+32*1]
    mova                xm5,     [tmpq      +32*1]
    mova                 m1,     [dstq+dsq*1+32*0]
    psubw                m4, m1, [tmpq      +32*2]
    vinserti128          m2,     [dstq+dsq*1+32*1], 1
    vinserti128          m5,     [tmpq      +32*3], 1
    add                tmpq, 32*4
    psubw                m5, m2, m5
    pmulhrsw             m3, m6
    pmulhrsw             m4, m6
    pmulhrsw             m5, m7
    paddw                m0, m3
    paddw                m1, m4
    paddw                m2, m5
    mova         [dstq+dsq*0+32*0], m0
    mova         [dstq+dsq*1+32*0], m1
    mova         [dstq+dsq*0+32*1], xm2
    vextracti128 [dstq+dsq*1+32*1], m2, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w32_loop
%if WIN64
    movaps             xmm6, [rsp+ 8]
    movaps             xmm7, [rsp+24]
%endif
    RET
6360
; BLEND_H_ROW dst_off, tmp_off[, inc_tmp]
; Blends two consecutive 32-byte chunks (32 pixels) of one row:
; dst += ((dst - tmp) * m4) via pmulhrsw, where m4 holds the per-row
; weight (premultiplied by -512, as in blend_16bpc).  Advances tmpq
; by 32*%3 when %3 is nonzero.  Clobbers m0-m3.
%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
    mova                 m0,     [dstq+32*(%1+0)]
    psubw                m2, m0, [tmpq+32*(%2+0)] ; dst - tmp
    mova                 m1,     [dstq+32*(%1+1)]
    psubw                m3, m1, [tmpq+32*(%2+1)]
%if %3
    add                tmpq, 32*%3
%endif
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+32*(%1+0)], m0
    mova   [dstq+32*(%1+1)], m1
%endmacro
6376
;-----------------------------------------------------------------------
; blend_h_16bpc: horizontal-edge OBMC blend.  The weight depends only
; on the row, taken from obmc_masks_avx2 at [maskq+hq*2].  hq counts
; up from -(h*3/4) to 0, so only the top 3/4 of the rows are processed
; (the remaining rows are untouched; presumably their mask weights
; would be zero).  Weights are premultiplied by -512 for pmulhrsw,
; as in blend_16bpc above.
;-----------------------------------------------------------------------
INIT_XMM avx2
cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea                  r5, [blend_h_avx2_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea               maskq, [base+obmc_masks_avx2+hq*2]
    lea                  hd, [hq*3]
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2] ; point past the last weight used
    neg                  hq  ; negative row counter; [maskq+hq*2] = first weight
    jmp                  wq  ; dispatch on log2(width)
.w2:
    ; 2 rows of 2 pixels per iteration
    movd                 m0, [dstq+dsq*0]
    pinsrd               m0, [dstq+dsq*1], 1
    movd                 m2, [maskq+hq*2] ; weights for both rows
    movq                 m1, [tmpq]
    add                tmpq, 4*2
    punpcklwd            m2, m2           ; duplicate each weight per pixel pair
    psubw                m1, m0, m1       ; dst - tmp
    pmulhrsw             m1, m2
    paddw                m0, m1
    movd       [dstq+dsq*0], m0
    pextrd     [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
.w4:
    mova                 m3, [blend_shuf] ; broadcast pattern for row weights
.w4_loop:
    ; 2 rows of 4 pixels per iteration
    movq                 m0, [dstq+dsq*0]
    movhps               m0, [dstq+dsq*1]
    movd                 m2, [maskq+hq*2]
    psubw                m1, m0, [tmpq] ; dst - tmp
    add                tmpq, 8*2
    pshufb               m2, m3         ; spread each row's weight across its half
    pmulhrsw             m1, m2
    paddw                m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
INIT_YMM avx2
.w8:
    vbroadcasti128       m3, [blend_shuf]
    shufpd               m3, m3, 0x0c ; row 0 weight in low lane, row 1 in high
.w8_loop:
    ; 2 rows of 8 pixels per iteration
    mova                xm0, [dstq+dsq*0]
    vinserti128          m0, [dstq+dsq*1], 1
    vpbroadcastd         m2, [maskq+hq*2]
    psubw                m1, m0, [tmpq] ; dst - tmp
    add                tmpq, 16*2
    pshufb               m2, m3
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
    RET
.w16:
    ; 2 rows of 16 pixels per iteration, one broadcast weight per row
    vpbroadcastw         m4, [maskq+hq*2]
    vpbroadcastw         m5, [maskq+hq*2+2]
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 32*0] ; dst - tmp
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 32*1]
    add                tmpq, 32*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w16
    RET
.w32:
    ; 1 row of 32 pixels per iteration
    vpbroadcastw         m4, [maskq+hq*2]
    BLEND_H_ROW           0, 0, 2
    add                dstq, dsq
    inc                  hq
    jl .w32
    RET
.w64:
    ; 1 row of 64 pixels per iteration
    vpbroadcastw         m4, [maskq+hq*2]
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2, 4
    add                dstq, dsq
    inc                  hq
    jl .w64
    RET
.w128:
    ; 1 row of 128 pixels per iteration; tmpq is advanced mid-row, so
    ; the last two chunks use negative tmp offsets
    vpbroadcastw         m4, [maskq+hq*2]
    BLEND_H_ROW           0,  0
    BLEND_H_ROW           2,  2, 8
    BLEND_H_ROW           4, -4
    BLEND_H_ROW           6, -2
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET
6486
;-----------------------------------------------------------------------
; emu_edge_16bpc: builds a bw x bh pixel block at dst whose requested
; origin (x, y) may lie partly or wholly outside the iw x ih source.
; In-bounds pixels are copied; out-of-bounds regions are filled by
; replicating the nearest edge pixels (left/right within each row,
; then whole rows for the top/bottom extensions).
;-----------------------------------------------------------------------
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                                   bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor                r12d, r12d          ; r12 = 0, reused for all clamps
    lea                 r10, [ihq-1]
    cmp                  yq, ihq
    cmovs               r10, yq
    test                 yq, yq
    cmovs               r10, r12
    imul                r10, sstrideq
    add                srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea                 r10, [iwq-1]
    cmp                  xq, iwq
    cmovs               r10, xq
    test                 xq, xq
    cmovs               r10, r12
    lea                srcq, [srcq+r10*2]  ; *2: 16-bit pixels

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea          bottomextq, [yq+bhq]
    sub          bottomextq, ihq
    lea                  r3, [bhq-1]
    cmovs        bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, r12
    cmp          bottomextq, bhq
    cmovns       bottomextq, r3
    cmp             topextq, bhq
    cmovg           topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea           rightextq, [xq+bwq]
    sub           rightextq, iwq
    lea                  r2, [bwq-1]
    cmovs         rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg            leftextq
    cmovs          leftextq, r12
    cmp           rightextq, bwq
    cmovns        rightextq, r2
    cmp            leftextq, bwq
    cmovns         leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea                  r3, [bottomextq+topextq]
    sub            centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov                  r2, topextq
    imul                 r2, dstrideq
    add                dstq, r2
    mov                 r9m, dstq           ; remember first center row for .top

    ; center_w = bw - left_ext - right_ext
    mov            centerwq, bwq
    lea                  r3, [rightextq+leftextq]
    sub            centerwq, r3

; Copy the centerh middle rows.  %1: emit left-extension code,
; %2: emit right-extension code, %3: unique label suffix.
; Row stores advance 16 pixels at a time and may overshoot
; (see the overwrite note at the top of the function).
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension: replicate the leftmost source pixel
    xor                  r3, r3
    vpbroadcastw         m0, [srcq]
.left_loop_%3:
    mova        [dstq+r3*2], m0
    add                  r3, 16
    cmp                  r3, leftextq
    jl .left_loop_%3

    ; body
    lea                 r12, [dstq+leftextq*2]
%endif
    xor                  r3, r3
.body_loop_%3:
    movu                 m0, [srcq+r3*2]
%if %1
    movu         [r12+r3*2], m0
%else
    movu        [dstq+r3*2], m0
%endif
    add                  r3, 16
    cmp                  r3, centerwq
    jl .body_loop_%3

%if %2
    ; right extension: replicate the rightmost source pixel
%if %1
    lea                 r12, [r12+centerwq*2]
%else
    lea                 r12, [dstq+centerwq*2]
%endif
    xor                  r3, r3
    vpbroadcastw         m0, [srcq+centerwq*2-2]
.right_loop_%3:
    movu         [r12+r3*2], m0
    add                  r3, 16
    cmp                  r3, rightextq
    jl .right_loop_%3

%endif
    add                dstq, dstrideq
    add                srcq, sstrideq
    dec            centerhq
    jg .v_loop_%3
%endmacro

    ; pick the v_loop variant with only the extensions actually needed
    test           leftextq, leftextq
    jnz .need_left_ext
    test          rightextq, rightextq
    jnz .need_right_ext
    v_loop                0, 0, 0
    jmp .body_done

.need_left_ext:
    test          rightextq, rightextq
    jnz .need_left_right_ext
    v_loop                1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop                1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop                0, 1, 3

.body_done:
    ; bottom edge extension: replicate the last written row downwards
    test         bottomextq, bottomextq
    jz .top
    mov                srcq, dstq
    sub                srcq, dstrideq
    xor                  r1, r1
.bottom_x_loop:
    mova                 m0, [srcq+r1*2]
    lea                  r3, [dstq+r1*2]
    mov                  r4, bottomextq
.bottom_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .bottom_y_loop
    add                  r1, 16
    cmp                  r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension: replicate the first center row upwards
    test            topextq, topextq
    jz .end
    mov                srcq, r9m  ; first center row (saved above)
    mov                dstq, dstm ; original block start
    xor                  r1, r1
.top_x_loop:
    mova                 m0, [srcq+r1*2]
    lea                  r3, [dstq+r1*2]
    mov                  r4, topextq
.top_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .top_y_loop
    add                  r1, 16
    cmp                  r1, bwq
    jl .top_x_loop

.end:
    RET
6673
;-----------------------------------------------------------------------
; resize_16bpc: horizontal 8-tap resampling of each row.
; mx0/dx are fixed-point source positions with a 14-bit fraction
; (see the psrad/pslld by 14 below); mx0 is pre-biased by -4<<14 to
; center the 8-tap window.  Per output pixel: src_x = mx >> 14 clipped
; to [0, src_w-8], filter phase = (mx >> 8) & 63 indexing the 64-phase
; resize_filter table, with out-of-range taps handled by a pshufb
; through resize_shuf.  Output is clamped to pixel_max.
;-----------------------------------------------------------------------
cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub          dword mx0m, 4<<14   ; center the 8-tap window
    sub        dword src_wm, 8
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm
 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA                  r7, $$
%define base r7-$$
    vpbroadcastd         m3, [base+pd_64]
    vpbroadcastw        xm7, pxmaxm  ; output clamp
    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld                m5, 3                      ; dx*8
    pslld                m6, 14                     ; (src_w-8) << 14
    paddd                m8, m2                     ; mx+[0..7]*dx
.loop_y:
    xor                  xd, xd
    mova                 m4, m8             ; per-line working version of mx
.loop_x:
    vpbroadcastd        m10, [base+pd_63]
    pxor                 m2, m2
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8          ; filter offset (unmasked)
    pminsd               m0, m6             ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0         ; pshufb offset
    psrad                m0, 14             ; clipped src_x offset
    psrad                m1, 14             ; pshufb edge_emu offset
    pand                 m9, m10            ; filter offset (masked)
    ; load source pixels: 8 lanes, 8 pixels each, gathered via GPRs
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vextracti128        xm0, m0, 1
    movu               xm10, [srcq+r8*2]
    movu               xm11, [srcq+r9*2]
    movu               xm12, [srcq+r10*2]
    movu               xm13, [srcq+r11*2]
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vinserti128         m10, [srcq+r8*2], 1
    vinserti128         m11, [srcq+r9*2], 1
    vinserti128         m12, [srcq+r10*2], 1
    vinserti128         m13, [srcq+r11*2], 1
    ptest                m1, m1
    jz .filter                              ; fast path: no lane needed clamping
    ; edge case: shuffle in replicated edge pixels via resize_shuf,
    ; indexed by how far each lane's mx was clamped
    movq                 r9, xm1
    pextrq              r11, xm1, 1
    movsxd               r8, r9d
    sar                  r9, 32
    movsxd              r10, r11d
    sar                 r11, 32
    vextracti128        xm1, m1, 1
    movu               xm14, [base+resize_shuf+8+r8*2]
    movu               xm15, [base+resize_shuf+8+r9*2]
    movu                xm0, [base+resize_shuf+8+r10*2]
    movu                xm2, [base+resize_shuf+8+r11*2]
    movq                 r9, xm1
    pextrq              r11, xm1, 1
    movsxd               r8, r9d
    sar                  r9, 32
    movsxd              r10, r11d
    sar                 r11, 32
    vinserti128         m14, [base+resize_shuf+8+r8*2], 1
    vinserti128         m15, [base+resize_shuf+8+r9*2], 1
    vinserti128          m0, [base+resize_shuf+8+r10*2], 1
    vinserti128          m2, [base+resize_shuf+8+r11*2], 1
    pshufb              m10, m14
    pshufb              m11, m15
    pshufb              m12, m0
    pshufb              m13, m2
.filter:
    ; fetch one 8-tap int8 filter per lane and widen to int16
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vextracti128        xm9, m9, 1
    movq               xm14, [base+resize_filter+r8*8]
    movq               xm15, [base+resize_filter+r9*8]
    movq                xm0, [base+resize_filter+r10*8]
    movq                xm2, [base+resize_filter+r11*8]
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    movhps             xm14, [base+resize_filter+r8*8]
    movhps             xm15, [base+resize_filter+r9*8]
    movhps              xm0, [base+resize_filter+r10*8]
    movhps              xm2, [base+resize_filter+r11*8]
    pmovsxbw            m14, xm14
    pmovsxbw            m15, xm15
    pmovsxbw             m0, xm0
    pmovsxbw             m2, xm2
    pmaddwd             m10, m14
    pmaddwd             m11, m15
    pmaddwd             m12, m0
    pmaddwd             m13, m2
    phaddd              m10, m11
    phaddd              m12, m13
    phaddd              m10, m12           ; per-lane 8-tap dot products
    psubd               m10, m3, m10       ; (64 - sum) >> 7; NOTE(review):
    psrad               m10, 7             ; taps appear stored negated --
                                           ; confirm against resize_filter
    vextracti128        xm0, m10, 1
    packusdw           xm10, xm0
    pminsw             xm10, xm7           ; clamp to pixel_max
    mova        [dstq+xq*2], xm10
    paddd                m4, m5            ; mx += dx*8
    add                  xd, 8
    cmp                  xd, dst_wd
    jl .loop_x
    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET
6792
6793%endif ; ARCH_X86_64
6794