; xref: /aosp_15_r20/external/libdav1d/src/x86/mc16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; dav1d_obmc_masks[] << 9
obmc_masks:     dw     0,     0,  9728,     0, 12800,  7168,  2560,     0
                dw 14336, 11264,  8192,  5632,  3584,  1536,     0,     0
                dw 15360, 13824, 12288, 10752,  9216,  7680,  6144,  5120
                dw  4096,  3072,  2048,  1536,     0,     0,     0,     0
                dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
                dw  9728,  8704,  8192,  7168,  6656,  6144,  5632,  4608
                dw  4096,  3584,  3072,  2560,  2048,  2048,  1536,  1024

; pshufb control vectors for the subpel/blend/resize paths below
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
spel_h_shufA:   db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
spel_h_shufB:   db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
spel_h_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
spel_s_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
spel_s_shuf8:   db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd 0,  1,  2,  3
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

; Broadcast constants (pw_* = packed words, pd_* = dwords, pq_* = qwords).
; Several half-width (times 4 dw / times 2 dd) entries form pairs that are
; loaded with movddup using a 0/1 bitdepth index, e.g. [x+r5*8].
pw_2:             times 8 dw 2
pw_16:            times 4 dw 16
prep_mul:         times 4 dw 16
                  times 8 dw 4
pw_64:            times 8 dw 64
pw_256:           times 8 dw 256
pw_2048:          times 4 dw 2048
bidir_mul:        times 4 dw 2048
pw_8192:          times 8 dw 8192
pw_27615:         times 8 dw 27615
pw_32766:         times 8 dw 32766
pw_m512:          times 8 dw -512
pd_63:            times 4 dd 63
pd_64:            times 4 dd 64
pd_512:           times 4 dd 512
pd_2560:          times 2 dd 2560
pd_8704:          times 2 dd 8704
pd_m524256:       times 4 dd -524256 ; -8192 << 6 + 32
pd_0x3ff:         times 4 dd 0x3ff
pd_0x4000:        times 4 dd 0x4000
pq_0x400000:      times 2 dq 0x400000
pq_0x40000000:    times 2 dq 0x40000000
pd_65538:         times 2 dd 65538

; Rounding constants/shift amounts selected by bitdepth (first half = 10bpc,
; second half = 12bpc; see the shr-by-11 bitdepth_max checks in the code).
put_bilin_h_rnd:  times 4 dw 8
                  times 4 dw 10
s_8tap_h_rnd:     times 2 dd 2
                  times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
                  times 2 dd 128
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_sh:  dd 10, 8
bidir_rnd:        times 4 dw -16400
                  times 4 dw -16388
put_8tap_h_rnd:   dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: times 4 dd    32 - (8192 <<  5)

warp8x8_shift:    dd 11, 13
warp8x8_rnd1:     dd 1024, 1024, 4096, 4096
warp8x8_rnd2:     times 4 dw 4096
                  times 4 dw 16384
warp8x8t_rnd:     times 2 dd 16384 - (8192 << 15)

; Emit a jump table of 32-bit offsets (dd), one entry per block width, for the
; bidirectional MC entry points. The exported table symbol is pre-biased by
; 2*<first width> to match the callers' indexing convention elsewhere in the
; file. %1 = function name, %2 = isa suffix, remaining args = widths.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 32, 64, 128

; Emit a jump table of 16-bit offsets (dw) relative to a shared base entry
; point (the .put/.prep label inside the bilin functions), pre-biased by the
; smallest width. %1 = function name, %2 = isa suffix, rest = widths.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Shared base labels the tables above are relative to.
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

; Emit three jump tables (dw offsets) for the scaled MC paths: the generic
; .w%3 entries, plus the specialized dy==1024 (.dy1_w%3) and dy==2048
; (.dy2_w%3) variants. Each table symbol is pre-biased by the first width.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128

cextern mc_subpel_filters
; -8 byte bias on the exported table; presumably matches the callers' subpel
; index arithmetic (filter rows start at index 1) — verify against uses.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

; Scratch register for table-base addressing: on UNIX64 r7 is free after the
; 7 declared GPRs; elsewhere (x86-32 / Win64) use r5.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

INIT_XMM ssse3
;-----------------------------------------------------------------------------
; put_bilin_16bpc(dst, dst_stride, src, src_stride, w, h, mx, my, bitdepth_max)
; (argument layout presumed from the r6m/r7m/r8m accesses below — mx in r6m,
;  my in r7m, bitdepth_max in r8m)
;
; Bilinear MC for high-bitdepth pixels. Dispatches on the subpel fractions:
;   mx == 0 && my == 0  ->  .put  (width-dispatched straight copy)
;   mx != 0 && my == 0  ->  .h    (horizontal 2-tap only)
;   mx == 0 && my != 0  ->  .v    (vertical 2-tap only)
;   mx != 0 && my != 0  ->  .hv   (separable 2-tap 2D)
;-----------------------------------------------------------------------------
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3
    mov                mxyd, r6m ; mx
    LEA                  t0, put_ssse3
    movifnidn            wd, wm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put: ; no filtering: jump to the copy loop for this width
    tzcnt                wd, wd
    movzx                wd, word [base+put_ssse3_table+wq*2]
    add                  wq, t0
    movifnidn            hd, hm
    jmp                  wq
.put_w2:
    mov                 r4d, [srcq+ssq*0]
    mov                 r6d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    movu                 m0, [srcq+ssq*0+16*0]
    movu                 m1, [srcq+ssq*0+16*1]
    movu                 m2, [srcq+ssq*1+16*0]
    movu                 m3, [srcq+ssq*1+16*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+16*0], m0
    mova  [dstq+dsq*0+16*1], m1
    mova  [dstq+dsq*1+16*0], m2
    mova  [dstq+dsq*1+16*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, ssq
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    ; offset pointers by 128 bytes so all 16 loads/stores fit in disp8 range
    add                srcq, 16*8
    add                dstq, 16*8
.put_w128_loop:
    movu                 m0, [srcq-16*8]
    movu                 m1, [srcq-16*7]
    movu                 m2, [srcq-16*6]
    movu                 m3, [srcq-16*5]
    mova        [dstq-16*8], m0
    mova        [dstq-16*7], m1
    mova        [dstq-16*6], m2
    mova        [dstq-16*5], m3
    movu                 m0, [srcq-16*4]
    movu                 m1, [srcq-16*3]
    movu                 m2, [srcq-16*2]
    movu                 m3, [srcq-16*1]
    mova        [dstq-16*4], m0
    mova        [dstq-16*3], m1
    mova        [dstq-16*2], m2
    mova        [dstq-16*1], m3
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128_loop
    RET
.h: ; horizontal: m5 = broadcast mx, m4 = 16-mx (pw_16 - mx)
    movd                 m5, mxyd
    mov                mxyd, r7m ; my
    mova                 m4, [base+pw_16]
    pshufb               m5, [base+pw_256]
    psubw                m4, m5
    test               mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    mov                 r6d, r8m ; bitdepth_max
    shr                 r6d, 11  ; 0 for 10bpc, 1 for 12bpc
    movddup              m3, [base+put_bilin_h_rnd+r6*8]
    movifnidn            hd, hm
    sub                  wd, 8
    jg .h_w16
    je .h_w8
    cmp                  wd, -4
    je .h_w4
.h_w2:
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4, m1
    psrlq                m1, 16   ; shift in the x+1 neighbors
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movd       [dstq+dsq*0], m0
    punpckhqdq           m0, m0
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    movq                 m0, [srcq+ssq*0]
    movhps               m0, [srcq+ssq*1]
    movq                 m1, [srcq+ssq*0+2]
    movhps               m1, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    ; point at row end, iterate with a negative offset (r6 counts up to 0)
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova   [dstq+r6*2+16*0], m0
    mova   [dstq+r6*2+16*1], m1
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w16_loop0
    RET
.v: ; vertical: a + (b-a)*my via pmulhrsw with my scaled into .15 fixed point
    shl                mxyd, 11
    movd                 m5, mxyd
    pshufb               m5, [base+pw_256]
    movifnidn            hd, hm
    cmp                  wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    movd                 m0, [srcq+ssq*0]
.v_w2_loop:
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq           m2, m0, m1  ; rows 0|1
    movd                 m0, [srcq+ssq*0]
    punpcklqdq           m1, m0      ; rows 1|2
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    movd       [dstq+dsq*0], m1
    punpckhqdq           m1, m1
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq                 m0, [srcq+ssq*0]
.v_w4_loop:
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq           m2, m0, m1
    movq                 m0, [srcq+ssq*0]
    punpcklqdq           m1, m0
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8: ; >= 8: process 8-pixel columns, outer loop walks across the row
%if ARCH_X86_64
%if WIN64
    push                 r7
%endif
    shl                  wd, 5
    mov                  r7, srcq
    lea                 r6d, [wq+hq-256] ; packed column count (hi) / height (lo)
    mov                  r4, dstq
%else
    mov                  r6, srcq
%endif
.v_w8_loop0:
    movu                 m0, [srcq+ssq*0]
.v_w8_loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m1, m3, m0
    pmulhrsw             m1, m5
    paddw                m1, m0
    movu                 m0, [srcq+ssq*0]
    psubw                m2, m0, m3
    pmulhrsw             m2, m5
    paddw                m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
%if ARCH_X86_64
    add                  r7, 16
    add                  r4, 16
    movzx                hd, r6b
    mov                srcq, r7
    mov                dstq, r4
    sub                 r6d, 1<<8
%else
    mov                dstq, dstmp
    add                  r6, 16
    mov                  hd, hm
    add                dstq, 16
    mov                srcq, r6
    mov               dstmp, dstq
    sub                  wd, 8
%endif
    jg .v_w8_loop0
%if WIN64
    pop                 r7
%endif
    RET
.hv: ; separable 2D: horizontal pass to .2 intermediates, then vertical blend
    WIN64_SPILL_XMM       8
    shl                mxyd, 11
    mova                 m3, [base+pw_2]
    movd                 m6, mxyd
    mova                 m7, [base+pw_8192]
    pshufb               m6, [base+pw_256]
    test          dword r8m, 0x800 ; bit 11 of bitdepth_max set -> 12bpc
    jnz .hv_12bpc
    psllw                m4, 2
    psllw                m5, 2
    mova                 m7, [base+pw_2048]
.hv_12bpc:
    movifnidn            hd, hm
    cmp                  wd, 4
    jg .hv_w8
    je .hv_w4
.hv_w2:
    movddup              m0, [srcq+ssq*0]
    pshufhw              m1, m0, q0321
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w2_loop:
    movq                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps               m2, [srcq+ssq*0]
    pmullw               m1, m4, m2
    psrlq                m2, 16
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 _ 2 _
    shufpd               m2, m0, m1, 0x01 ; 0 _ 1 _
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    movd       [dstq+dsq*0], m1
    punpckhqdq           m1, m1
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    movddup              m0, [srcq+ssq*0]
    movddup              m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w4_loop:
    movq                 m1, [srcq+ssq*1]
    movq                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*0]
    movhps               m2, [srcq+ssq*0+2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 2
    shufpd               m2, m0, m1, 0x01 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
%if ARCH_X86_64
%if WIN64
    push                 r7
%endif
    shl                  wd, 5
    lea                 r6d, [wq+hq-256]
    mov                  r4, srcq
    mov                  r7, dstq
%else
    mov                  r6, srcq
%endif
.hv_w8_loop0:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w8_loop:
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2
    psubw                m2, m1, m0
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m0
    pmulhrsw             m2, m7
    mova       [dstq+dsq*0], m2
    movu                 m0, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m2, m5
    paddw                m0, m3
    paddw                m0, m2
    psrlw                m0, 2
    psubw                m2, m0, m1
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m1
    pmulhrsw             m2, m7
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
%if ARCH_X86_64
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
%else
    mov                dstq, dstmp
    add                  r6, 16
    mov                  hd, hm
    add                dstq, 16
    mov                srcq, r6
    mov               dstmp, dstq
    sub                  wd, 8
%endif
    jg .hv_w8_loop0
%if WIN64
    pop                  r7
%endif
    RET

;-----------------------------------------------------------------------------
; prep_bilin_16bpc(tmp, src, stride, w, h, mx, my, bitdepth_max)
; (argument layout presumed from the r5m/r6m/r7m accesses below — mx in r5m,
;  my in r6m, bitdepth_max in r7m)
;
; Bilinear prep into the intermediate int16 buffer, same dispatch scheme as
; put_bilin above (.prep copy+scale / .h / .v / .hv). The intermediates are
; biased by pw_8192 / pw_32766 subtraction before storing.
;-----------------------------------------------------------------------------
cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
%define base r6-prep_ssse3
    movifnidn          mxyd, r5m ; mx
    LEA                  r6, prep_ssse3
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep: ; no filtering: scale (prep_mul indexed by bitdepth) and bias only
    tzcnt                wd, wd
    movzx                wd, word [base+prep_ssse3_table+wq*2]
    mov                 r5d, r7m ; bitdepth_max
    mova                 m5, [base+pw_8192]
    add                  wq, r6
    shr                 r5d, 11  ; 0 for 10bpc, 1 for 12bpc
    movddup              m4, [base+prep_mul+r5*8]
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*2]
    movhps               m1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m4
    psubw                m0, m5
    psubw                m1, m5
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*2]
    movu                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu                 m0, [srcq+strideq*0+16*0]
    movu                 m1, [srcq+strideq*0+16*1]
    movu                 m2, [srcq+strideq*1+16*0]
    movu                 m3, [srcq+strideq*1+16*1]
    lea                srcq, [srcq+strideq*2]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 2
    jg .prep_w16
    RET
.prep_w32:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    dec                  hd
    jg .prep_w32
    RET
.prep_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*4], m0
    mova        [tmpq+16*5], m1
    mova        [tmpq+16*6], m2
    mova        [tmpq+16*7], m3
    add                tmpq, 16*8
    dec                  hd
    jg .prep_w64
    RET
.prep_w128:
    movu                 m0, [srcq+16* 0]
    movu                 m1, [srcq+16* 1]
    movu                 m2, [srcq+16* 2]
    movu                 m3, [srcq+16* 3]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    movu                 m0, [srcq+16* 4]
    movu                 m1, [srcq+16* 5]
    movu                 m2, [srcq+16* 6]
    movu                 m3, [srcq+16* 7]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*4], m0
    mova        [tmpq+16*5], m1
    mova        [tmpq+16*6], m2
    mova        [tmpq+16*7], m3
    movu                 m0, [srcq+16* 8]
    movu                 m1, [srcq+16* 9]
    movu                 m2, [srcq+16*10]
    movu                 m3, [srcq+16*11]
    add                tmpq, 16*16
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq-16*8], m0
    mova        [tmpq-16*7], m1
    mova        [tmpq-16*6], m2
    mova        [tmpq-16*5], m3
    movu                 m0, [srcq+16*12]
    movu                 m1, [srcq+16*13]
    movu                 m2, [srcq+16*14]
    movu                 m3, [srcq+16*15]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq-16*4], m0
    mova        [tmpq-16*3], m1
    mova        [tmpq-16*2], m2
    mova        [tmpq-16*1], m3
    dec                  hd
    jg .prep_w128
    RET
.h: ; horizontal weights: m3 = 16-mx, m4 = mx (both <<2 for 10bpc)
    movd                 m4, mxyd
    mov                mxyd, r6m ; my
    mova                 m3, [base+pw_16]
    pshufb               m4, [base+pw_256]
    mova                 m5, [base+pw_32766]
    psubw                m3, m4
    test          dword r7m, 0x800 ; bit 11 of bitdepth_max -> 12bpc
    jnz .h_12bpc
    psllw                m3, 2
    psllw                m4, 2
.h_12bpc:
    test               mxyd, mxyd
    jnz .hv
    sub                  wd, 8
    je .h_w8
    jg .h_w16
.h_w4:
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*0+2]
    movhps               m1, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 16
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    ; iterate the row with a negative offset counting up to 0
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, strideq
    dec                  hd
    jg .h_w16_loop0
    RET
.v: ; vertical weights: m3 = 16-my, m4 = my (both <<2 for 10bpc)
    movd                 m4, mxyd
    mova                 m3, [base+pw_16]
    pshufb               m4, [base+pw_256]
    mova                 m5, [base+pw_32766]
    psubw                m3, m4
    test          dword r7m, 0x800
    jnz .v_12bpc
    psllw                m3, 2
    psllw                m4, 2
.v_12bpc:
    cmp                  wd, 8
    je .v_w8
    jg .v_w16
.v_w4:
    movq                 m0, [srcq+strideq*0]
.v_w4_loop:
    movq                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    punpcklqdq           m1, m0, m2 ; 0 1
    movq                 m0, [srcq+strideq*0]
    punpcklqdq           m2, m0     ; 1 2
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    mova             [tmpq], m1
    add                tmpq, 16
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu                 m0, [srcq+strideq*0]
.v_w8_loop:
    movu                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4, m2
    psubw                m0, m5
    paddw                m1, m0
    movu                 m0, [srcq+strideq*0]
    psraw                m1, 2
    pmullw               m2, m3
    mova        [tmpq+16*0], m1
    pmullw               m1, m4, m0
    psubw                m2, m5
    paddw                m1, m2
    psraw                m1, 2
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
%if WIN64
    push                 r7
%endif
    mov                  r5, srcq
%if ARCH_X86_64
    lea                 r6d, [wq*4-32]
    mov                  wd, wd
968    lea                 r6d, [hq+r6*8]
969    mov                  r7, tmpq
970%else
971    mov                 r6d, wd
972%endif
973.v_w16_loop0:
974    movu                 m0, [srcq+strideq*0]
975.v_w16_loop:
976    movu                 m2, [srcq+strideq*1]
977    lea                srcq, [srcq+strideq*2]
978    pmullw               m0, m3
979    pmullw               m1, m4, m2
980    psubw                m0, m5
981    paddw                m1, m0
982    movu                 m0, [srcq+strideq*0]
983    psraw                m1, 2
984    pmullw               m2, m3
985    mova        [tmpq+wq*0], m1
986    pmullw               m1, m4, m0
987    psubw                m2, m5
988    paddw                m1, m2
989    psraw                m1, 2
990    mova        [tmpq+wq*2], m1
991    lea                tmpq, [tmpq+wq*4]
992    sub                  hd, 2
993    jg .v_w16_loop
994%if ARCH_X86_64
995    add                  r5, 16
996    add                  r7, 16
997    movzx                hd, r6b
998    mov                srcq, r5
999    mov                tmpq, r7
1000    sub                 r6d, 1<<8
1001%else
1002    mov                tmpq, tmpmp
1003    add                  r5, 16
1004    mov                  hd, hm
1005    add                tmpq, 16
1006    mov                srcq, r5
1007    mov               tmpmp, tmpq
1008    sub                 r6d, 8
1009%endif
1010    jg .v_w16_loop0
1011%if WIN64
1012    pop                  r7
1013%endif
1014    RET
1015.hv:
1016    WIN64_SPILL_XMM       7
1017    shl                mxyd, 11
1018    movd                 m6, mxyd
1019    pshufb               m6, [base+pw_256]
1020    cmp                  wd, 8
1021    je .hv_w8
1022    jg .hv_w16
1023.hv_w4:
1024    movddup              m0, [srcq+strideq*0]
1025    movddup              m1, [srcq+strideq*0+2]
1026    pmullw               m0, m3
1027    pmullw               m1, m4
1028    psubw                m0, m5
1029    paddw                m0, m1
1030    psraw                m0, 2
1031.hv_w4_loop:
1032    movq                 m1, [srcq+strideq*1]
1033    movq                 m2, [srcq+strideq*1+2]
1034    lea                srcq, [srcq+strideq*2]
1035    movhps               m1, [srcq+strideq*0]
1036    movhps               m2, [srcq+strideq*0+2]
1037    pmullw               m1, m3
1038    pmullw               m2, m4
1039    psubw                m1, m5
1040    paddw                m1, m2
1041    psraw                m1, 2            ; 1 2
1042    shufpd               m2, m0, m1, 0x01 ; 0 1
1043    mova                 m0, m1
1044    psubw                m1, m2
1045    pmulhrsw             m1, m6
1046    paddw                m1, m2
1047    mova             [tmpq], m1
1048    add                tmpq, 16
1049    sub                  hd, 2
1050    jg .hv_w4_loop
1051    RET
1052.hv_w8:
1053    movu                 m0, [srcq+strideq*0]
1054    movu                 m1, [srcq+strideq*0+2]
1055    pmullw               m0, m3
1056    pmullw               m1, m4
1057    psubw                m0, m5
1058    paddw                m0, m1
1059    psraw                m0, 2
1060.hv_w8_loop:
1061    movu                 m1, [srcq+strideq*1]
1062    movu                 m2, [srcq+strideq*1+2]
1063    lea                srcq, [srcq+strideq*2]
1064    pmullw               m1, m3
1065    pmullw               m2, m4
1066    psubw                m1, m5
1067    paddw                m1, m2
1068    psraw                m1, 2
1069    psubw                m2, m1, m0
1070    pmulhrsw             m2, m6
1071    paddw                m2, m0
1072    mova        [tmpq+16*0], m2
1073    movu                 m0, [srcq+strideq*0]
1074    movu                 m2, [srcq+strideq*0+2]
1075    pmullw               m0, m3
1076    pmullw               m2, m4
1077    psubw                m0, m5
1078    paddw                m0, m2
1079    psraw                m0, 2
1080    psubw                m2, m0, m1
1081    pmulhrsw             m2, m6
1082    paddw                m2, m1
1083    mova        [tmpq+16*1], m2
1084    add                tmpq, 16*2
1085    sub                  hd, 2
1086    jg .hv_w8_loop
1087    RET
1088.hv_w16:
1089%if WIN64
1090    push                 r7
1091%endif
1092    mov                  r5, srcq
1093%if ARCH_X86_64
1094    lea                 r6d, [wq*4-32]
1095    mov                  wd, wd
1096    lea                 r6d, [hq+r6*8]
1097    mov                  r7, tmpq
1098%else
1099    mov                 r6d, wd
1100%endif
1101.hv_w16_loop0:
1102    movu                 m0, [srcq+strideq*0]
1103    movu                 m1, [srcq+strideq*0+2]
1104    pmullw               m0, m3
1105    pmullw               m1, m4
1106    psubw                m0, m5
1107    paddw                m0, m1
1108    psraw                m0, 2
1109.hv_w16_loop:
1110    movu                 m1, [srcq+strideq*1]
1111    movu                 m2, [srcq+strideq*1+2]
1112    lea                srcq, [srcq+strideq*2]
1113    pmullw               m1, m3
1114    pmullw               m2, m4
1115    psubw                m1, m5
1116    paddw                m1, m2
1117    psraw                m1, 2
1118    psubw                m2, m1, m0
1119    pmulhrsw             m2, m6
1120    paddw                m2, m0
1121    mova        [tmpq+wq*0], m2
1122    movu                 m0, [srcq+strideq*0]
1123    movu                 m2, [srcq+strideq*0+2]
1124    pmullw               m0, m3
1125    pmullw               m2, m4
1126    psubw                m0, m5
1127    paddw                m0, m2
1128    psraw                m0, 2
1129    psubw                m2, m0, m1
1130    pmulhrsw             m2, m6
1131    paddw                m2, m1
1132    mova        [tmpq+wq*2], m2
1133    lea                tmpq, [tmpq+wq*4]
1134    sub                  hd, 2
1135    jg .hv_w16_loop
1136%if ARCH_X86_64
1137    add                  r5, 16
1138    add                  r7, 16
1139    movzx                hd, r6b
1140    mov                srcq, r5
1141    mov                tmpq, r7
1142    sub                 r6d, 1<<8
1143%else
1144    mov                tmpq, tmpmp
1145    add                  r5, 16
1146    mov                  hd, hm
1147    add                tmpq, 16
1148    mov                srcq, r5
1149    mov               tmpmp, tmpq
1150    sub                 r6d, 8
1151%endif
1152    jg .hv_w16_loop0
1153%if WIN64
1154    pop                  r7
1155%endif
1156    RET
1157
; int8_t subpel_filters[5][15][8]
; Packed pair of row offsets into subpel_filters[], one per 16-bit half.
; The low word is the filter set consumed through the low byte of mxd/myd
; (movzx mxd, mxb) for small block widths, the high word is the set
; consumed via "shr mxd, 16" for larger widths.
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
1162
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
; Emits the public entry point %1_%2_16bpc: it loads the packed
; subpel_filters offsets for the horizontal (%3) and vertical (%4) filter
; types into t0d/t1d, then either tail-jumps to the shared body named by
; %5 or, when %5 is omitted, falls through into the cglobal that follows
; the last FN invocation.
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d      ; same filter type both directions
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
1175
; Scratch registers t0-t2 used by the FN entry stubs (t0d/t1d hold the
; packed filter offsets) and by the shared function body (t2 is the
; PIC/table base). Chosen per ABI so they don't alias argument registers.
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif
1183
; put_8tap entry points that share the 6-tap implementation: the first
; three tail-jump into put_6tap_16bpc via FN's optional 5th argument,
; while the final one (regular/regular) omits it and falls through into
; the cglobal put_6tap_16bpc body directly below.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR
1189
1190cglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
1191    %define            base  t2-put_ssse3
1192%if ARCH_X86_32
1193    %define             mxb  r0b
1194    %define             mxd  r0
1195    %define             mxq  r0
1196    %define             myb  r1b
1197    %define             myd  r1
1198    %define             myq  r1
1199%endif
1200    imul                mxd, mxm, 0x010101
1201    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1202    imul                myd, mym, 0x010101
1203    add                 myd, t1d ; 6tap_v, my, 4tap_v
1204    LEA                  t2, put_ssse3
1205    movifnidn            wd, wm
1206    movifnidn          srcq, srcmp
1207    movifnidn           ssq, ssmp
1208    movifnidn            hd, hm
1209    test                mxd, 0xf00
1210    jnz .h
1211    test                myd, 0xf00
1212    jnz .v
1213.put:
1214    tzcnt                wd, wd
1215    movzx                wd, word [base+put_ssse3_table+wq*2]
1216    movifnidn          dstq, dstmp
1217    movifnidn           dsq, dsmp
1218    add                  wq, t2
1219%if WIN64
1220    pop                  r8
1221    pop                  r7
1222%endif
1223    jmp                  wq
1224.h_w2:
1225    mova                 m2, [base+spel_h_shuf2]
1226    pshufd               m3, m3, q2121
1227.h_w2_loop:
1228    movu                 m0, [srcq+ssq*0]
1229    movu                 m1, [srcq+ssq*1]
1230    lea                srcq, [srcq+ssq*2]
1231    pshufb               m0, m2
1232    pshufb               m1, m2
1233    pmaddwd              m0, m3
1234    pmaddwd              m1, m3
1235    phaddd               m0, m1
1236    paddd                m0, m4
1237    psrad                m0, 6
1238    packssdw             m0, m0
1239    pxor                 m1, m1
1240    pminsw               m0, m5
1241    pmaxsw               m0, m1
1242    movd       [dstq+dsq*0], m0
1243    pshuflw              m0, m0, q3232
1244    movd       [dstq+dsq*1], m0
1245    lea                dstq, [dstq+dsq*2]
1246    sub                  hd, 2
1247    jg .h_w2_loop
1248    RET
1249.h_w4:
1250    movzx               mxd, mxb
1251    lea                srcq, [srcq-2]
1252    movq                 m3, [base+subpel_filters+mxq*8]
1253    movifnidn          dstq, dstmp
1254    punpcklbw            m3, m3
1255    psraw                m3, 8 ; sign-extend
1256    jl .h_w2
1257    WIN64_SPILL_XMM       9
1258    mova                 m7, [base+spel_h_shufA]
1259%if ARCH_X86_32
1260    %define              m8  [base+spel_h_shufB]
1261%else
1262    mova                 m8, [base+spel_h_shufB]
1263%endif
1264    pshufd               m2, m3, q1111
1265    pshufd               m3, m3, q2222
1266.h_w4_loop:
1267    movu                 m0, [srcq+ssq*0]
1268    movu                 m1, [srcq+ssq*1]
1269    lea                srcq, [srcq+ssq*2]
1270    pshufb               m6, m0, m7 ; 0 1 1 2 2 3 3 4
1271    pmaddwd              m6, m2
1272    pshufb               m0, m8     ; 2 3 3 4 4 5 5 6
1273    pmaddwd              m0, m3
1274    paddd                m0, m6
1275    pshufb               m6, m1, m7
1276    pmaddwd              m6, m2
1277    pshufb               m1, m8
1278    pmaddwd              m1, m3
1279    paddd                m0, m4
1280    paddd                m6, m4
1281    paddd                m1, m6
1282    psrad                m0, 6
1283    psrad                m1, 6
1284    packssdw             m0, m1
1285    pxor                 m1, m1
1286    pminsw               m0, m5
1287    pmaxsw               m0, m1
1288    movq       [dstq+dsq*0], m0
1289    movhps     [dstq+dsq*1], m0
1290    lea                dstq, [dstq+dsq*2]
1291    sub                  hd, 2
1292    jg .h_w4_loop
1293    RET
1294.h:
1295    RESET_STACK_STATE
1296    test                myd, 0xf00
1297    jnz .hv
1298    mov                 myd, r8m
1299    movd                 m5, r8m
1300    shr                 myd, 11
1301    movddup              m4, [base+put_8tap_h_rnd+myq*8]
1302    movifnidn           dsq, dsmp
1303    pshufb               m5, [base+pw_256]
1304    sub                  wd, 4
1305    jle .h_w4
1306    WIN64_SPILL_XMM      11
1307    shr                 mxd, 16
1308    movq                 m2, [base+subpel_filters+1+mxq*8]
1309    movifnidn          dstq, dstmp
1310    mova                 m6, [base+spel_h_shufA]
1311    mova                 m7, [base+spel_h_shufB]
1312    lea                srcq, [srcq+wq*2]
1313    punpcklbw            m2, m2
1314    lea                dstq, [dstq+wq*2]
1315    psraw                m2, 8
1316    neg                  wq
1317%if ARCH_X86_32
1318    ALLOC_STACK       -16*3
1319    %define              m8  [rsp+16*0]
1320    %define              m9  [rsp+16*1]
1321    %define             m10  [rsp+16*2]
1322    pshufd               m0, m2, q0000
1323    pshufd               m1, m2, q1111
1324    pshufd               m2, m2, q2222
1325    mova                 m8, m0
1326    mova                 m9, m1
1327    mova                m10, m2
1328%else
1329    pshufd               m8, m2, q0000
1330    pshufd               m9, m2, q1111
1331    pshufd              m10, m2, q2222
1332%endif
1333.h_w8_loop0:
1334    mov                  r6, wq
1335.h_w8_loop:
1336    movu                 m3, [srcq+r6*2-4]
1337    movu                 m2, [srcq+r6*2+8]
1338    pshufb               m0, m3, m6   ; 01 12 23 34
1339    pmaddwd              m0, m8       ; abcd0
1340    pshufb               m3, m7       ; 23 34 45 56
1341    pmaddwd              m1, m9, m3   ; abcd1
1342    paddd                m0, m1
1343    pshufb               m1, m2, m6   ; 67 78 89 9a
1344    shufpd               m3, m1, 0x01 ; 45 56 67 78
1345    pmaddwd              m1, m9       ; efgh1
1346    pshufb               m2, m7       ; 89 9a ab bc
1347    pmaddwd              m2, m10      ; efgh2
1348    paddd                m1, m2
1349    pmaddwd              m2, m10, m3  ; abcd2
1350    pmaddwd              m3, m8       ; efgh0
1351    paddd                m0, m4
1352    paddd                m1, m4
1353    paddd                m0, m2
1354    paddd                m1, m3
1355    psrad                m0, 6
1356    psrad                m1, 6
1357    packssdw             m0, m1
1358    pxor                 m1, m1
1359    pminsw               m0, m5
1360    pmaxsw               m0, m1
1361    mova        [dstq+r6*2], m0
1362    add                  r6, 8
1363    jl .h_w8_loop
1364    add                srcq, ssq
1365    add                dstq, dsq
1366    dec                  hd
1367    jg .h_w8_loop0
1368    RET
1369.v:
1370    movzx               mxd, myb
1371    shr                 myd, 16
1372    cmp                  hd, 6
1373    cmovb               myd, mxd
1374    movq                 m2, [base+subpel_filters+1+myq*8]
1375    WIN64_SPILL_XMM      11, 16
1376    movd                 m5, r8m
1377    movifnidn          dstq, dstmp
1378    movifnidn           dsq, dsmp
1379    punpcklbw            m2, m2
1380    pshufb               m5, [base+pw_256]
1381    psraw                m2, 8 ; sign-extend
1382%if ARCH_X86_32
1383    ALLOC_STACK       -16*4
1384    pshufd               m0, m2, q0000
1385    mov                  r6, ssq
1386    pshufd               m1, m2, q1111
1387    neg                  r6
1388    pshufd               m2, m2, q2222
1389    mova                 m8, m0
1390    mova                 m9, m1
1391    mova                m10, m2
1392    cmp                  wd, 2
1393    jne .v_w4
1394%else
1395    mov                  r6, ssq
1396    pshufd               m8, m2, q0000
1397    neg                  r6
1398    cmp                  wd, 4
1399    jg .v_w8
1400    pshufd               m9, m2, q1111
1401    pshufd              m10, m2, q2222
1402    je .v_w4
1403%endif
1404.v_w2:
1405    movd                 m1, [srcq+r6 *2]
1406    movd                 m3, [srcq+r6 *1]
1407    movd                 m2, [srcq+ssq*0]
1408    movd                 m4, [srcq+ssq*1]
1409    lea                srcq, [srcq+ssq*2]
1410    movd                 m0, [srcq+ssq*0]
1411    punpckldq            m1, m3      ; 0 1
1412    punpckldq            m3, m2      ; 1 2
1413    punpckldq            m2, m4      ; 2 3
1414    punpckldq            m4, m0      ; 3 4
1415    punpcklwd            m1, m3      ; 01 12
1416    punpcklwd            m2, m4      ; 23 34
1417    pxor                 m6, m6
1418.v_w2_loop:
1419    movd                 m3, [srcq+ssq*1]
1420    lea                srcq, [srcq+ssq*2]
1421    pmaddwd              m4, m8, m1  ; a0 b0
1422    mova                 m1, m2
1423    pmaddwd              m2, m9      ; a1 b1
1424    paddd                m4, m2
1425    punpckldq            m2, m0, m3  ; 4 5
1426    movd                 m0, [srcq+ssq*0]
1427    punpckldq            m3, m0      ; 5 6
1428    punpcklwd            m2, m3      ; 67 78
1429    pmaddwd              m3, m10, m2 ; a2 b2
1430    paddd                m4, m3
1431    psrad                m4, 5
1432    packssdw             m4, m4
1433    pmaxsw               m4, m6
1434    pavgw                m4, m6
1435    pminsw               m4, m5
1436    movd       [dstq+dsq*0], m4
1437    pshuflw              m4, m4, q3232
1438    movd       [dstq+dsq*1], m4
1439    lea                dstq, [dstq+dsq*2]
1440    sub                  hd, 2
1441    jg .v_w2_loop
1442    RET
1443.v_w4:
1444%if ARCH_X86_32
1445    shl                  wd, 14
1446    lea                srcq, [srcq+r6*2]
1447    lea                  wd, [wq+hq-(1<<16)]
1448%if STACK_ALIGNMENT < 16
1449    %define           dstmp  [esp+16*3]
1450%endif
1451.v_w4_loop0:
1452    mov               dstmp, dstq
1453    movq                 m1, [srcq+ssq*0]
1454    movq                 m2, [srcq+ssq*1]
1455    lea                  r6, [srcq+ssq*2]
1456    movq                 m3, [r6  +ssq*0]
1457    movq                 m4, [r6  +ssq*1]
1458    lea                  r6, [r6  +ssq*2]
1459%else
1460    movq                 m1, [srcq+r6 *2]
1461    movq                 m2, [srcq+r6 *1]
1462    lea                  r6, [srcq+ssq*2]
1463    movq                 m3, [srcq+ssq*0]
1464    movq                 m4, [srcq+ssq*1]
1465%endif
1466    movq                 m0, [r6  +ssq*0]
1467    punpcklwd            m1, m2      ; 01
1468    punpcklwd            m2, m3      ; 12
1469    punpcklwd            m3, m4      ; 23
1470    punpcklwd            m4, m0      ; 34
1471.v_w4_loop:
1472    pmaddwd              m6, m8, m1  ; a0
1473    pmaddwd              m7, m8, m2  ; b0
1474    mova                 m1, m3
1475    pmaddwd              m3, m9      ; a1
1476    mova                 m2, m4
1477    pmaddwd              m4, m9      ; b1
1478    paddd                m6, m3
1479    movq                 m3, [r6+ssq*0]
1480    paddd                m7, m4
1481    movq                 m4, [r6+ssq*1]
1482    lea                  r6, [r6+ssq*2]
1483    movq                 m0, [r6+ssq*0]
1484    punpcklwd            m3, m4      ; 45
1485    punpcklwd            m4, m0      ; 56
1486    pmaddwd              m0, m10, m3 ; a2
1487    paddd                m6, m0
1488    pmaddwd              m0, m10, m4 ; b2
1489    paddd                m7, m0
1490    psrad                m6, 5
1491    psrad                m7, 5
1492    packssdw             m6, m7
1493    pxor                 m7, m7
1494    pmaxsw               m6, m7
1495    pavgw                m6, m7
1496    pminsw               m6, m5
1497    movq       [dstq+dsq*0], m6
1498    movhps     [dstq+dsq*1], m6
1499    lea                dstq, [dstq+dsq*2]
1500    sub                  hd, 2
1501    jg .v_w4_loop
1502%if ARCH_X86_32
1503    mov                dstq, dstmp
1504    add                srcq, 8
1505    movzx                hd, ww
1506    add                dstq, 8
1507    sub                  wd, 1<<16
1508    jg .v_w4_loop0
1509    RET
1510%else
1511    RET
1512.v_w8:
1513    mova                r6m, m8
1514    shl                  wd, 5
1515    pshufd               m6, m2, q1111
1516    lea                  wd, [wq+hq-(1<<8)]
1517    pshufd               m7, m2, q2222
1518    WIN64_PUSH_XMM       16
1519.v_w8_loop0:
1520    movu                 m9, [srcq+ r6*2]
1521    movu                m11, [srcq+ r6*1]
1522    lea                  r7, [srcq+ssq*2]
1523    movu                m13, [srcq+ssq*0]
1524    movu                m15, [srcq+ssq*1]
1525    mov                  r8, dstq
1526    movu                 m4, [r7  +ssq*0]
1527    punpcklwd            m8, m9, m11  ; 01
1528    punpckhwd            m9, m11
1529    punpcklwd           m10, m11, m13 ; 12
1530    punpckhwd           m11, m13
1531    punpcklwd           m12, m13, m15 ; 23
1532    punpckhwd           m13, m15
1533    punpcklwd           m14, m15, m4  ; 34
1534    punpckhwd           m15, m4
1535.v_w8_loop:
1536    mova                 m3, r6m
1537    pmaddwd              m0, m8, m3   ; a0
1538    pmaddwd              m2, m9, m3   ; a0'
1539    pmaddwd              m1, m10, m3  ; b0
1540    pmaddwd              m3, m11      ; b0'
1541    mova                 m8, m12
1542    pmaddwd             m12, m6       ; a1
1543    mova                 m9, m13
1544    pmaddwd             m13, m6       ; a1'
1545    mova                m10, m14
1546    pmaddwd             m14, m6       ; b1
1547    mova                m11, m15
1548    pmaddwd             m15, m6       ; b1'
1549    paddd                m0, m12
1550    paddd                m2, m13
1551    movu                m13, [r7+ssq*0]
1552    paddd                m1, m14
1553    paddd                m3, m15
1554    movu                m15, [r7+ssq*1]
1555    lea                  r7, [r7+ssq*2]
1556    movu                 m4, [r7+ssq*0]
1557    punpcklwd           m12, m13, m15 ; 45
1558    punpckhwd           m13, m15
1559    punpcklwd           m14, m15, m4  ; 56
1560    punpckhwd           m15, m4
1561    pmaddwd              m4, m7, m12  ; a2
1562    paddd                m0, m4
1563    pmaddwd              m4, m7, m13  ; a2'
1564    paddd                m2, m4
1565    pmaddwd              m4, m7, m14  ; b2
1566    paddd                m1, m4
1567    pmaddwd              m4, m7, m15  ; b2'
1568    paddd                m3, m4
1569    REPX       {psrad x, 5}, m0, m2, m1, m3
1570    packssdw             m0, m2
1571    packssdw             m1, m3
1572    pxor                 m2, m2
1573    pmaxsw               m0, m2
1574    pmaxsw               m1, m2
1575    pavgw                m0, m2
1576    pavgw                m1, m2
1577    pminsw               m0, m5
1578    pminsw               m1, m5
1579    mova         [r8+dsq*0], m0
1580    mova         [r8+dsq*1], m1
1581    lea                  r8, [r8+dsq*2]
1582    sub                  hd, 2
1583    jg .v_w8_loop
1584    add                srcq, 16
1585    add                dstq, 16
1586    movzx                hd, wb
1587    sub                  wd, 1<<8
1588    jg .v_w8_loop0
1589    RET
1590%endif
1591.hv:
1592    cmp                  wd, 4
1593    jg .hv_w8
1594    WIN64_SPILL_XMM      12, 16
1595%if ARCH_X86_32
1596    movd                 m3, r8m
1597    pshufb               m3, [base+pw_256]
1598%else
1599    movd                m11, r8m
1600    pshufb              m11, [base+pw_256]
1601%endif
1602    movzx               mxd, mxb
1603    movq                 m0, [base+subpel_filters+mxq*8]
1604    movzx               mxd, myb
1605    shr                 myd, 16
1606    cmp                  hd, 6
1607    cmovb               myd, mxd
1608    movq                 m2, [base+subpel_filters+1+myq*8]
1609    movddup              m7, [base+pd_8704]
1610    sub                srcq, 2
1611    pshuflw              m0, m0, q2121
1612    pxor                 m6, m6
1613    punpcklbw            m6, m0
1614    punpcklbw            m2, m2
1615    psraw                m2, 8 ; sign-extend
1616    test          dword r8m, 0x800
1617    jz .hv_w2_10bpc
1618    movddup              m7, [base+pd_2560]
1619    psraw                m6, 2
1620    psllw                m2, 2
1621.hv_w2_10bpc:
1622%if ARCH_X86_32
1623%assign regs_used 2
1624    ALLOC_STACK       -16*7
1625%assign regs_used 7
1626    mov                dstq, r0mp
1627    mov                 dsq, r1mp
1628    %define             m11  [esp+16*4]
1629    pshufd               m0, m2, q0000
1630    pshufd               m1, m2, q1111
1631    pshufd               m2, m2, q2222
1632    mova                 m8, m0
1633    mova                 m9, m1
1634    mova                m10, m2
1635    mova                m11, m3
1636    neg                 ssq
1637    movu                 m3, [srcq+ssq*2]
1638    movu                 m4, [srcq+ssq*1]
1639    neg                 ssq
1640%else
1641    pshufd               m8, m2, q0000
1642    mov                  r6, ssq
1643    pshufd               m9, m2, q1111
1644    neg                  r6
1645    pshufd              m10, m2, q2222
1646    movu                 m3, [srcq+r6 *2]
1647    movu                 m4, [srcq+r6 *1]
1648%endif
1649    movu                 m1, [srcq+ssq*0]
1650    movu                 m0, [srcq+ssq*1]
1651    lea                srcq, [srcq+ssq*2]
1652    movu                 m2, [srcq+ssq*0]
1653    cmp                  wd, 4
1654    je .hv_w4
1655    mova                 m5, [base+spel_h_shuf2]
1656    REPX    {pshufb  x, m5}, m3, m4, m0, m1, m2
1657    REPX    {pmaddwd x, m6}, m3, m0, m4, m1, m2
1658    phaddd               m3, m0        ; 0 3
1659    phaddd               m4, m1        ; 1 2
1660    phaddd               m0, m2        ; 3 4
1661    REPX    {paddd   x, m7}, m3, m4, m0
1662    REPX    {psrad   x, 10}, m3, m4, m0
1663    packssdw             m3, m4        ; 0 3 1 2
1664    packssdw             m4, m0        ; 1 2 3 4
1665    pshufd               m2, m3, q1320 ; 0 1 2 3
1666    punpcklwd            m1, m2, m4    ; 01 12
1667    punpckhwd            m2, m4        ; 23 34
1668.hv_w2_loop:
1669    movu                 m3, [srcq+ssq*1]
1670    lea                srcq, [srcq+ssq*2]
1671    movu                 m4, [srcq+ssq*0]
1672    pshufb               m3, m5
1673    pshufb               m4, m5
1674    pmaddwd              m3, m6
1675    pmaddwd              m4, m6
1676    phaddd               m3, m4
1677    pmaddwd              m4, m8, m1    ; a0 b0
1678    mova                 m1, m2
1679    pmaddwd              m2, m9        ; a1 b1
1680    paddd                m4, m2
1681    paddd                m3, m7
1682    psrad                m3, 10        ; 5 6
1683    packssdw             m0, m3
1684    pshufd               m2, m0, q2103
1685    punpckhwd            m2, m0        ; 45 56
1686    mova                 m0, m3
1687    pmaddwd              m3, m10, m2   ; a2 b2
1688    paddd                m4, m3
1689    psrad                m4, 10
1690    packssdw             m4, m4
1691    pxor                 m3, m3
1692    pminsw               m4, m11
1693    pmaxsw               m4, m3
1694    movd       [dstq+dsq*0], m4
1695    pshuflw              m4, m4, q1032
1696    movd       [dstq+dsq*1], m4
1697    lea                dstq, [dstq+dsq*2]
1698    sub                  hd, 2
1699    jg .hv_w2_loop
1700    RET
1701.hv_w4:
1702%if ARCH_X86_32
1703    %define             m12  [esp+16*5]
1704    %define             m13  [esp+16*6]
1705    %define             m14  [base+spel_h_shufA]
1706    %define             m15  [base+spel_h_shufB]
1707    pshufd               m5, m6, q0000
1708    pshufd               m6, m6, q1111
1709    mova                m12, m5
1710    mova                m13, m6
1711%else
1712    WIN64_PUSH_XMM       16
1713    mova                m14, [base+spel_h_shufA]
1714    mova                m15, [base+spel_h_shufB]
1715    pshufd              m12, m6, q0000
1716    pshufd              m13, m6, q1111
1717%endif
%macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB
; Horizontal filter step for one 4-pixel row of the hv path: gathers the
; two tap pairs from %2 with the spel_h shuffles (m14 and %4, defaulting
; to m15), multiply-accumulates them against the coefficient pairs in
; m12/m13, and adds the rounding bias held in m7. Result in %1 (still at
; intermediate precision; the caller shifts/packs).
    pshufb               %3, %2, m14  ; spel_h_shufA tap pairs
    pmaddwd              %3, m12
    pshufb               %2, %4       ; spel_h_shufB tap pairs
    pmaddwd              %2, m13
    paddd                %3, m7       ; m7 = rounding bias
    paddd                %1, %2, %3
%endmacro
1726    HV_H_W4_6TAP         m3, m3, m5
1727    HV_H_W4_6TAP         m4, m4, m5
1728    HV_H_W4_6TAP         m5, m1, m5
1729    HV_H_W4_6TAP         m0, m0, m1
1730    HV_H_W4_6TAP         m2, m2, m1
1731    REPX      {psrad x, 10}, m3, m5, m4, m0, m2
1732    packssdw             m3, m5      ; 0 2
1733    packssdw             m4, m0      ; 1 3
1734    packssdw             m5, m2      ; 2 4
1735    punpcklwd            m1, m3, m4  ; 01
1736    punpckhwd            m3, m4      ; 23
1737    punpcklwd            m2, m4, m5  ; 12
1738    punpckhwd            m4, m5      ; 34
1739.hv_w4_loop:
1740    movu                 m0, [srcq+ssq*1]
1741    pmaddwd              m5, m8, m1  ; a0
1742    lea                srcq, [srcq+ssq*2]
1743    pmaddwd              m6, m8, m2  ; b0
1744    mova                 m1, m3
1745    pmaddwd              m3, m9      ; a1
1746    mova                 m2, m4
1747    pmaddwd              m4, m9      ; b1
1748    paddd                m5, m3
1749    movu                 m3, [srcq+ssq*0]
1750    paddd                m6, m4
1751    HV_H_W4_6TAP         m0, m0, m4
1752    HV_H_W4_6TAP         m3, m3, m4
1753    psrad                m4, m2, 16
1754    psrad                m0, 10
1755    psrad                m3, 10
1756    packssdw             m4, m0      ; 4 5
1757    packssdw             m0, m3      ; 5 6
1758    punpcklwd            m3, m4, m0  ; 45
1759    punpckhwd            m4, m0      ; 56
1760    pmaddwd              m0, m10, m3 ; a2
1761    paddd                m5, m0
1762    pmaddwd              m0, m10, m4 ; b2
1763    paddd                m6, m0
1764    psrad                m5, 10
1765    psrad                m6, 10
1766    packssdw             m5, m6
1767    pxor                 m6, m6
1768    pminsw               m5, m11
1769    pmaxsw               m5, m6
1770    movq       [dstq+dsq*0], m5
1771    movhps     [dstq+dsq*1], m5
1772    lea                dstq, [dstq+dsq*2]
1773    sub                  hd, 2
1774    jg .hv_w4_loop
1775    RET
1776.hv_w8:
1777    RESET_STACK_STATE
1778    shr                 mxd, 16
1779    movq                 m2, [base+subpel_filters+1+mxq*8]
1780    movzx               mxd, myb
1781    shr                 myd, 16
1782    cmp                  hd, 6
1783    cmovb               myd, mxd
1784    movq                 m1, [base+subpel_filters+1+myq*8]
1785    movd                 m3, r8m
1786    movddup              m4, [base+pd_8704]
1787    pshufb               m3, [base+pw_256]
1788    pxor                 m0, m0
1789    punpcklbw            m0, m2
1790    punpcklbw            m1, m1
1791    sub                srcq, 4
1792    psraw                m1, 8 ; sign-extend
1793    test          dword r8m, 0x800
1794    jz .hv_w8_10bpc
1795    movddup              m4, [base+pd_2560]
1796    psraw                m0, 2
1797    psllw                m1, 2
1798.hv_w8_10bpc:
1799%if ARCH_X86_32
1800%assign regs_used 2
1801    ALLOC_STACK       -16*9
1802%assign regs_used 7
1803    mov                dstq, r0mp
1804    mov                 dsq, r1mp
1805    mova         [rsp+16*7], m4
1806%else
1807    ALLOC_STACK        16*7, 16
1808%endif
1809    mova         [rsp+16*6], m3
1810    pshufd               m2, m0, q0000
1811    mova         [rsp+16*0], m2
1812    pshufd               m2, m0, q1111
1813    mova         [rsp+16*1], m2
1814    pshufd               m0, m0, q2222
1815    mova         [rsp+16*2], m0
1816    pshufd               m2, m1, q0000
1817    mova         [rsp+16*3], m2
1818    pshufd               m2, m1, q1111
1819    mova         [rsp+16*4], m2
1820    pshufd               m1, m1, q2222
1821    mova         [rsp+16*5], m1
1822    mov                  r6, ssq
1823    neg                  r6
1824%if ARCH_X86_32
1825    shl                  wd, 14
1826    lea                 r4d, [wq+hq-(1<<16)]
1827%if STACK_ALIGNMENT < 16
1828    %define           srcmp  [esp+16*8+4*0]
1829    %define           dstmp  [esp+16*8+4*1]
1830%endif
; 6-tap horizontal filter for one 4-pixel row (x86-32 HV path).
; %1 = dst: four 32-bit sums, NOT yet rounded/shifted (the caller adds
;      the rounding constant from [rsp+16*7] and shifts afterwards).
; %2/%3 = the same source row loaded at byte offsets +0 and +2, so that
;      interleaving them word-wise yields adjacent-pixel pairs.
; %4-%6 = filter tap pairs 0-1, 2-3, 4-5, each broadcast as packed
;      words for pmaddwd (default to the precomputed [rsp+16*0..2]).
%macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3]
    punpcklwd            %1, %2, %3   ; 01 12 23 34
    punpckhwd            %2, %3       ; 45 56 67 78
    pmaddwd              %3, %4, %1   ; a0 = pixels 0-3 * taps 0-1
    shufpd               %1, %2, 0x01 ; 23 34 45 56
    pmaddwd              %2, %6       ; a2 = pixels 4-5 contribution (taps 4-5)
    pmaddwd              %1, %5       ; a1 = pixels 2-3 contribution (taps 2-3)
    paddd                %2, %3
    paddd                %1, %2       ; a0+a1+a2 per output pixel
%endmacro
1841.hv_w8_loop0:
1842    mov               srcmp, srcq
1843    mov               dstmp, dstq
1844    movu                 m5, [srcq+r6*2+0]
1845    movu                 m6, [srcq+r6*2+2]
1846    mova                 m7, [rsp+16*0]
1847    mova                 m1, [rsp+16*1]
1848    mova                 m0, [rsp+16*2]
1849    HV_H_6TAP            m2, m5, m6, m7, m1, m0
1850    movu                 m5, [srcq+r6*1+0]
1851    movu                 m6, [srcq+r6*1+2]
1852    HV_H_6TAP            m3, m5, m6, m7, m1, m0
1853    movu                 m5, [srcq+ssq*0+0]
1854    movu                 m6, [srcq+ssq*0+2]
1855    HV_H_6TAP            m4, m5, m6, m7, m1, m0
1856    movu                 m5, [srcq+ssq*1+0]
1857    movu                 m6, [srcq+ssq*1+2]
1858    lea                srcq, [srcq+ssq*2]
1859    HV_H_6TAP            m0, m5, m6, m7, m1
1860    movu                 m5, [srcq+ssq*0+0]
1861    movu                 m6, [srcq+ssq*0+2]
1862    HV_H_6TAP            m1, m5, m6, m7
1863    mova                 m5, [rsp+16*7]
1864    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
1865    REPX      {psrad x, 10}, m2, m4, m3, m0, m1
1866    packssdw             m2, m4     ; 0 2
1867    packssdw             m3, m0     ; 1 3
1868    packssdw             m4, m1     ; 2 4
1869    punpcklwd            m0, m2, m3 ; 01
1870    punpckhwd            m2, m3     ; 23
1871    punpcklwd            m1, m3, m4 ; 12
1872    punpckhwd            m3, m4     ; 34
1873.hv_w8_loop:
1874    mova                 m5, [rsp+16*3]
1875    mova                 m6, [rsp+16*4]
1876    pmaddwd              m4, m0, m5 ; a0
1877    pmaddwd              m5, m1     ; b0
1878    mova                 m0, m2
1879    pmaddwd              m2, m6     ; a1
1880    mova                 m1, m3
1881    pmaddwd              m3, m6     ; b1
1882    paddd                m4, m2
1883    movu                 m2, [srcq+ssq*1+0]
1884    paddd                m5, m3
1885    movu                 m3, [srcq+ssq*1+2]
1886    lea                srcq, [srcq+ssq*2]
1887    HV_H_6TAP            m6, m2, m3
1888    movu                 m2, [srcq+ssq*0+0]
1889    movu                 m3, [srcq+ssq*0+2]
1890    HV_H_6TAP            m7, m2, m3
1891    mova                 m2, [rsp+16*7]
1892    psrad                m3, m1, 16
1893    paddd                m6, m2
1894    paddd                m7, m2
1895    psrad                m6, 10
1896    psrad                m7, 10
1897    packssdw             m3, m6     ; 4 5
1898    packssdw             m6, m7     ; 5 6
1899    mova                 m7, [rsp+16*5]
1900    punpcklwd            m2, m3, m6 ; 45
1901    punpckhwd            m3, m6     ; 56
1902    pmaddwd              m6, m2, m7 ; a2
1903    pmaddwd              m7, m3     ; b2
1904    paddd                m4, m6
1905    paddd                m5, m7
1906    psrad                m4, 10
1907    psrad                m5, 10
1908    packssdw             m4, m5
1909    pxor                 m5, m5
1910    pminsw               m4, [rsp+16*6]
1911    pmaxsw               m4, m5
1912    movq       [dstq+dsq*0], m4
1913    movhps     [dstq+dsq*1], m4
1914    lea                dstq, [dstq+dsq*2]
1915    sub                  hd, 2
1916    jg .hv_w8_loop
1917    mov                srcq, srcmp
1918    mov                dstq, dstmp
1919    movzx                hd, r4w
1920    add                srcq, 8
1921    add                dstq, 8
1922    sub                 r4d, 1<<16
1923%else
1924    shl                  wd, 5
1925    lea                 r8d, [wq+hq-256]
; 6-tap horizontal filter for one 8-pixel row (x86-64 HV path).
; %1    = dst: eight filtered pixels, rounded, shifted by %5 and packed
;         to signed words.
; %2-%4 = the same source row loaded at byte offsets +0/+8/+16.
; %5    = right-shift amount applied after rounding.
; %6    = pshufb control: a register (fast path) or a memory operand,
;         in which case it is staged through the (write-only until then)
;         dst register %1.
; %7-%9 = filter tap pairs 0-1, 2-3, 4-5 as packed words for pmaddwd.
; NOTE: implicitly adds the rounding constant held in m4 (pd_8704 or
; pd_2560, loaded by the caller before .hv_w8_loop0) - m4 must stay
; live across all uses of this macro.
%macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3]
%ifid %6 ; shuffle control already lives in a register
    REPX     {pshufb x, %6}, %2, %3, %4
%else    ; load the shuffle via the dst register first
    mova                 %1, %6
    pshufb               %2, %1       ; 01 12 23 34
    pshufb               %3, %1       ; 45 56 67 78
    pshufb               %4, %1       ; 89 9a ab bc
%endif
    pmaddwd              %1, %7, %2   ; a0: pixels 0-3, taps 0-1
    shufpd               %2, %3, 0x01 ; 23 34 45 56
    pmaddwd              %2, %8       ; a1: pixels 0-3, taps 2-3
    paddd                %1, %2
    pmaddwd              %2, %9, %3   ; a2: pixels 0-3, taps 4-5
    paddd                %1, %2
    pmaddwd              %2, %7, %3   ; b0: pixels 4-7, taps 0-1
    shufpd               %3, %4, 0x01 ; 67 78 89 9a
    pmaddwd              %4, %9       ; b2: pixels 4-7, taps 4-5
    pmaddwd              %3, %8       ; b1: pixels 4-7, taps 2-3
    paddd                %1, m4       ; + rounding (implicit m4, see above)
    paddd                %2, m4
    paddd                %3, %4
    paddd                %2, %3       ; b0+b1+b2
    psrad                %1, %5
    psrad                %2, %5
    packssdw             %1, %2       ; 8 filtered pixels as signed words
%endmacro
1953.hv_w8_loop0:
1954    mova                 m5, [spel_h_shufA]
1955    movu                 m0, [srcq+r6*2+ 0]
1956    mova                 m6, [rsp+16*0]
1957    movu                 m1, [srcq+r6*2+ 8]
1958    mova                 m7, [rsp+16*1]
1959    movu                 m2, [srcq+r6*2+16]
1960    mova                 m8, [rsp+16*2]
1961    HV_H_6TAP            m9, m0, m1, m2, 10, m5, m6, m7, m8
1962    movu                 m0, [srcq+r6*1+ 0]
1963    movu                 m1, [srcq+r6*1+ 8]
1964    movu                 m2, [srcq+r6*1+16]
1965    lea                  r4, [srcq+ssq*2]
1966    HV_H_6TAP           m11, m0, m1, m2, 10, m5, m6, m7, m8
1967    movu                 m0, [srcq+ssq*0+ 0]
1968    movu                 m1, [srcq+ssq*0+ 8]
1969    movu                 m2, [srcq+ssq*0+16]
1970    mov                  r7, dstq
1971    HV_H_6TAP           m13, m0, m1, m2, 10, m5, m6, m7, m8
1972    movu                 m0, [srcq+ssq*1+ 0]
1973    movu                 m1, [srcq+ssq*1+ 8]
1974    movu                 m2, [srcq+ssq*1+16]
1975    HV_H_6TAP           m15, m0, m1, m2, 10, m5, m6, m7, m8
1976    movu                 m0, [r4+ssq*0+ 0]
1977    movu                 m1, [r4+ssq*0+ 8]
1978    movu                 m2, [r4+ssq*0+16]
1979    HV_H_6TAP            m5, m0, m1, m2, 10, m5, m6, m7, m8
1980    punpcklwd            m8, m9, m11  ; 01
1981    punpckhwd            m9, m11
1982    punpcklwd           m10, m11, m13 ; 12
1983    punpckhwd           m11, m13
1984    punpcklwd           m12, m13, m15 ; 23
1985    punpckhwd           m13, m15
1986    punpcklwd           m14, m15, m5  ; 34
1987    punpckhwd           m15, m5
1988.hv_w8_loop:
1989    mova                 m3, [rsp+16*3]
1990    mova                 m7, [rsp+16*4]
1991    pmaddwd              m0, m8, m3   ; a0
1992    mova                 m8, m12
1993    pmaddwd              m2, m9, m3   ; a0'
1994    mova                 m9, m13
1995    pmaddwd              m1, m10, m3  ; b0
1996    mova                m10, m14
1997    pmaddwd              m3, m11      ; b0'
1998    mova                m11, m15
1999    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
2000    movu                 m6, [r4+ssq*1+ 0]
2001    paddd                m0, m12
2002    movu                 m7, [r4+ssq*1+ 8]
2003    paddd                m2, m13
2004    movu                m12, [r4+ssq*1+16]
2005    paddd                m1, m14
2006    lea                  r4, [r4+ssq*2]
2007    paddd                m3, m15
2008    HV_H_6TAP           m15, m6, m7, m12, 10
2009    movu                 m6, [r4+ssq*0+ 0]
2010    movu                 m7, [r4+ssq*0+ 8]
2011    movu                m14, [r4+ssq*0+16]
2012    punpcklwd           m12, m5, m15 ; 45
2013    punpckhwd           m13, m5, m15
2014    HV_H_6TAP            m5, m6, m7, m14, 10
2015    mova                 m7, [rsp+16*5]
2016    punpcklwd           m14, m15, m5  ; 56
2017    punpckhwd           m15, m5
2018    pmaddwd              m6, m12, m7  ; a2
2019    paddd                m0, m6
2020    pmaddwd              m6, m13, m7  ; a2'
2021    paddd                m2, m6
2022    pmaddwd              m6, m14, m7  ; b2
2023    pmaddwd              m7, m15      ; b2'
2024    paddd                m1, m6
2025    mova                 m6, [rsp+16*6]
2026    paddd                m3, m7
2027    REPX      {psrad x, 10}, m0, m2, m1, m3
2028    packssdw             m0, m2
2029    packssdw             m1, m3
2030    pxor                 m2, m2
2031    pminsw               m0, m6
2032    pminsw               m1, m6
2033    pmaxsw               m0, m2
2034    pmaxsw               m1, m2
2035    mova         [r7+dsq*0], m0
2036    mova         [r7+dsq*1], m1
2037    lea                  r7, [r7+dsq*2]
2038    sub                  hd, 2
2039    jg .hv_w8_loop
2040    add                srcq, 16
2041    add                dstq, 16
2042    movzx                hd, r8b
2043    sub                 r8d, 1<<8
2044%endif
2045    jg .hv_w8_loop0
2046    RET
2047
; Entry points for the filter combinations that need the full 8-tap
; path. NOTE(review): the PUT_8TAP_FN macro definition is not visible
; in this chunk; the 4th argument appears to name the shared
; implementation to jump to, and the final entry omits it - presumably
; falling through into the put_8tap_16bpc cglobal below. Confirm
; against the macro definition earlier in the file.
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
2053
2054cglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
2055%if ARCH_X86_32
2056    %define             mxb  r0b
2057    %define             mxd  r0
2058    %define             mxq  r0
2059    %define             myb  r1b
2060    %define             myd  r1
2061    %define             myq  r1
2062    %define              m8  [esp+16*0]
2063    %define              m9  [esp+16*1]
2064    %define             m10  [esp+16*2]
2065    %define             m11  [esp+16*3]
2066    %define             m12  [esp+16*4]
2067    %define             m13  [esp+16*5]
2068    %define             m14  [esp+16*6]
2069    %define             m15  [esp+16*7]
2070%endif
2071    imul                mxd, mxm, 0x010101
2072    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2073    imul                myd, mym, 0x010101
2074    add                 myd, t1d ; 8tap_v, my, 4tap_v
2075    LEA                  t2, put_ssse3
2076    movifnidn            wd, wm
2077    movifnidn          srcq, srcmp
2078    movifnidn           ssq, ssmp
2079    movifnidn            hd, hm
2080    test                mxd, 0xf00
2081    jnz .h
2082    test                myd, 0xf00
2083    jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put
2084.v:
2085    movzx               mxd, myb
2086    shr                 myd, 16
2087    cmp                  hd, 6
2088    cmovb               myd, mxd
2089    movq                 m3, [base+subpel_filters+myq*8]
2090    WIN64_SPILL_XMM      15
2091    movd                 m7, r8m
2092    movifnidn          dstq, dstmp
2093    movifnidn           dsq, dsmp
2094    punpcklbw            m3, m3
2095    pshufb               m7, [base+pw_256]
2096    psraw                m3, 8 ; sign-extend
2097%if ARCH_X86_32
2098    ALLOC_STACK       -16*7
2099    pshufd               m0, m3, q0000
2100    pshufd               m1, m3, q1111
2101    pshufd               m2, m3, q2222
2102    pshufd               m3, m3, q3333
2103    mova                 m8, m0
2104    mova                 m9, m1
2105    mova                m10, m2
2106    mova                m11, m3
2107%else
2108    pshufd               m8, m3, q0000
2109    pshufd               m9, m3, q1111
2110    pshufd              m10, m3, q2222
2111    pshufd              m11, m3, q3333
2112%endif
2113    lea                  r6, [ssq*3]
2114    sub                srcq, r6
2115    cmp                  wd, 2
2116    jne .v_w4
2117.v_w2:
2118    movd                 m1, [srcq+ssq*0]
2119    movd                 m4, [srcq+ssq*1]
2120    movd                 m2, [srcq+ssq*2]
2121    add                srcq, r6
2122    movd                 m5, [srcq+ssq*0]
2123    movd                 m3, [srcq+ssq*1]
2124    movd                 m6, [srcq+ssq*2]
2125    add                srcq, r6
2126    movd                 m0, [srcq+ssq*0]
2127    punpckldq            m1, m4      ; 0 1
2128    punpckldq            m4, m2      ; 1 2
2129    punpckldq            m2, m5      ; 2 3
2130    punpckldq            m5, m3      ; 3 4
2131    punpckldq            m3, m6      ; 4 5
2132    punpckldq            m6, m0      ; 5 6
2133    punpcklwd            m1, m4      ; 01 12
2134    punpcklwd            m2, m5      ; 23 34
2135    punpcklwd            m3, m6      ; 45 56
2136    pxor                 m6, m6
2137.v_w2_loop:
2138    movd                 m4, [srcq+ssq*1]
2139    lea                srcq, [srcq+ssq*2]
2140    pmaddwd              m5, m8, m1  ; a0 b0
2141    mova                 m1, m2
2142    pmaddwd              m2, m9      ; a1 b1
2143    paddd                m5, m2
2144    mova                 m2, m3
2145    pmaddwd              m3, m10     ; a2 b2
2146    paddd                m5, m3
2147    punpckldq            m3, m0, m4  ; 6 7
2148    movd                 m0, [srcq+ssq*0]
2149    punpckldq            m4, m0      ; 7 8
2150    punpcklwd            m3, m4      ; 67 78
2151    pmaddwd              m4, m11, m3 ; a3 b3
2152    paddd                m5, m4
2153    psrad                m5, 5
2154    packssdw             m5, m5
2155    pmaxsw               m5, m6
2156    pavgw                m5, m6
2157    pminsw               m5, m7
2158    movd       [dstq+dsq*0], m5
2159    pshuflw              m5, m5, q3232
2160    movd       [dstq+dsq*1], m5
2161    lea                dstq, [dstq+dsq*2]
2162    sub                  hd, 2
2163    jg .v_w2_loop
2164    RET
2165.v_w4:
2166%if ARCH_X86_32
2167    shl                  wd, 14
2168%if STACK_ALIGNMENT < 16
2169    mov          [esp+4*29], srcq
2170    mov          [esp+4*30], dstq
2171%else
2172    mov               srcmp, srcq
2173%endif
2174    lea                  wd, [wq+hq-(1<<16)]
2175%else
2176    shl                  wd, 6
2177    mov                  r7, srcq
2178    mov                  r8, dstq
2179    lea                  wd, [wq+hq-(1<<8)]
2180%endif
2181.v_w4_loop0:
2182    movq                 m1, [srcq+ssq*0]
2183    movq                 m2, [srcq+ssq*1]
2184    movq                 m3, [srcq+ssq*2]
2185    add                srcq, r6
2186    movq                 m4, [srcq+ssq*0]
2187    movq                 m5, [srcq+ssq*1]
2188    movq                 m6, [srcq+ssq*2]
2189    add                srcq, r6
2190    movq                 m0, [srcq+ssq*0]
2191    punpcklwd            m1, m2      ; 01
2192    punpcklwd            m2, m3      ; 12
2193    punpcklwd            m3, m4      ; 23
2194    punpcklwd            m4, m5      ; 34
2195    punpcklwd            m5, m6      ; 45
2196    punpcklwd            m6, m0      ; 56
2197%if ARCH_X86_32
2198    jmp .v_w4_loop_start
2199.v_w4_loop:
2200    mova                 m1, m12
2201    mova                 m2, m13
2202    mova                 m3, m14
2203.v_w4_loop_start:
2204    pmaddwd              m1, m8      ; a0
2205    pmaddwd              m2, m8      ; b0
2206    mova                m12, m3
2207    mova                m13, m4
2208    pmaddwd              m3, m9      ; a1
2209    pmaddwd              m4, m9      ; b1
2210    paddd                m1, m3
2211    paddd                m2, m4
2212    mova                m14, m5
2213    mova                 m4, m6
2214    pmaddwd              m5, m10     ; a2
2215    pmaddwd              m6, m10     ; b2
2216    paddd                m1, m5
2217    paddd                m2, m6
2218    movq                 m6, [srcq+ssq*1]
2219    lea                srcq, [srcq+ssq*2]
2220    punpcklwd            m5, m0, m6  ; 67
2221    movq                 m0, [srcq+ssq*0]
2222    pmaddwd              m3, m11, m5 ; a3
2223    punpcklwd            m6, m0      ; 78
2224    paddd                m1, m3
2225    pmaddwd              m3, m11, m6 ; b3
2226    paddd                m2, m3
2227    psrad                m1, 5
2228    psrad                m2, 5
2229    packssdw             m1, m2
2230    pxor                 m2, m2
2231    pmaxsw               m1, m2
2232    pavgw                m1, m2
2233    pminsw               m1, m7
2234    movq       [dstq+dsq*0], m1
2235    movhps     [dstq+dsq*1], m1
2236    lea                dstq, [dstq+dsq*2]
2237    sub                  hd, 2
2238    jg .v_w4_loop
2239%if STACK_ALIGNMENT < 16
2240    mov                srcq, [esp+4*29]
2241    mov                dstq, [esp+4*30]
2242    movzx                hd, ww
2243    add                srcq, 8
2244    add                dstq, 8
2245    mov          [esp+4*29], srcq
2246    mov          [esp+4*30], dstq
2247%else
2248    mov                srcq, srcmp
2249    mov                dstq, dstmp
2250    movzx                hd, ww
2251    add                srcq, 8
2252    add                dstq, 8
2253    mov               srcmp, srcq
2254    mov               dstmp, dstq
2255%endif
2256    sub                  wd, 1<<16
2257%else
2258.v_w4_loop:
2259    pmaddwd             m12, m8, m1  ; a0
2260    pmaddwd             m13, m8, m2  ; b0
2261    mova                 m1, m3
2262    mova                 m2, m4
2263    pmaddwd              m3, m9      ; a1
2264    pmaddwd              m4, m9      ; b1
2265    paddd               m12, m3
2266    paddd               m13, m4
2267    mova                 m3, m5
2268    mova                 m4, m6
2269    pmaddwd              m5, m10     ; a2
2270    pmaddwd              m6, m10     ; b2
2271    paddd               m12, m5
2272    paddd               m13, m6
2273    movq                 m6, [srcq+ssq*1]
2274    lea                srcq, [srcq+ssq*2]
2275    punpcklwd            m5, m0, m6  ; 67
2276    movq                 m0, [srcq+ssq*0]
2277    pmaddwd             m14, m11, m5 ; a3
2278    punpcklwd            m6, m0      ; 78
2279    paddd               m12, m14
2280    pmaddwd             m14, m11, m6 ; b3
2281    paddd               m13, m14
2282    psrad               m12, 5
2283    psrad               m13, 5
2284    packssdw            m12, m13
2285    pxor                m13, m13
2286    pmaxsw              m12, m13
2287    pavgw               m12, m13
2288    pminsw              m12, m7
2289    movq       [dstq+dsq*0], m12
2290    movhps     [dstq+dsq*1], m12
2291    lea                dstq, [dstq+dsq*2]
2292    sub                  hd, 2
2293    jg .v_w4_loop
2294    add                  r7, 8
2295    add                  r8, 8
2296    movzx                hd, wb
2297    mov                srcq, r7
2298    mov                dstq, r8
2299    sub                  wd, 1<<8
2300%endif
2301    jg .v_w4_loop0
2302    RET
2303.h:
2304    RESET_STACK_STATE
2305    test                myd, 0xf00
2306    jnz .hv
2307    mov                 myd, r8m
2308    movd                 m5, r8m
2309    shr                 myd, 11
2310    movddup              m4, [base+put_8tap_h_rnd+myq*8]
2311    movifnidn           dsq, dsmp
2312    pshufb               m5, [base+pw_256]
2313    cmp                  wd, 4
2314    jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4
2315    WIN64_SPILL_XMM      12
2316    shr                 mxd, 16
2317    movq                 m3, [base+subpel_filters+mxq*8]
2318    movifnidn          dstq, dstmp
2319    mova                 m6, [base+spel_h_shufA]
2320    mova                 m7, [base+spel_h_shufB]
2321%if UNIX64
2322    mov                  wd, wd
2323%endif
2324    lea                srcq, [srcq+wq*2]
2325    punpcklbw            m3, m3
2326    lea                dstq, [dstq+wq*2]
2327    psraw                m3, 8
2328    neg                  wq
2329%if ARCH_X86_32
2330    ALLOC_STACK       -16*4
2331    pshufd               m0, m3, q0000
2332    pshufd               m1, m3, q1111
2333    pshufd               m2, m3, q2222
2334    pshufd               m3, m3, q3333
2335    mova                 m8, m0
2336    mova                 m9, m1
2337    mova                m10, m2
2338    mova                m11, m3
2339%else
2340    pshufd               m8, m3, q0000
2341    pshufd               m9, m3, q1111
2342    pshufd              m10, m3, q2222
2343    pshufd              m11, m3, q3333
2344%endif
2345.h_w8_loop0:
2346    mov                  r6, wq
2347.h_w8_loop:
2348    movu                 m0, [srcq+r6*2- 6]
2349    movu                 m1, [srcq+r6*2+ 2]
2350    pshufb               m2, m0, m6   ; 0 1 1 2 2 3 3 4
2351    pshufb               m0, m7       ; 2 3 3 4 4 5 5 6
2352    pmaddwd              m2, m8       ; abcd0
2353    pmaddwd              m0, m9       ; abcd1
2354    pshufb               m3, m1, m6   ; 4 5 5 6 6 7 7 8
2355    pshufb               m1, m7       ; 6 7 7 8 8 9 9 a
2356    paddd                m2, m4
2357    paddd                m0, m2
2358    pmaddwd              m2, m10, m3  ; abcd2
2359    pmaddwd              m3, m8       ; efgh0
2360    paddd                m0, m2
2361    pmaddwd              m2, m11, m1  ; abcd3
2362    pmaddwd              m1, m9       ; efgh1
2363    paddd                m0, m2
2364    movu                 m2, [srcq+r6*2+10]
2365    paddd                m3, m4
2366    paddd                m1, m3
2367    pshufb               m3, m2, m6   ; 8 9 9 a a b b c
2368    pshufb               m2, m7       ; a b b c c d d e
2369    pmaddwd              m3, m10      ; efgh2
2370    pmaddwd              m2, m11      ; efgh3
2371    paddd                m1, m3
2372    paddd                m1, m2
2373    psrad                m0, 6
2374    psrad                m1, 6
2375    packssdw             m0, m1
2376    pxor                 m1, m1
2377    pminsw               m0, m5
2378    pmaxsw               m0, m1
2379    mova        [dstq+r6*2], m0
2380    add                  r6, 8
2381    jl .h_w8_loop
2382    add                srcq, ssq
2383    add                dstq, dsq
2384    dec                  hd
2385    jg .h_w8_loop0
2386    RET
2387.hv:
2388    RESET_STACK_STATE
2389%if ARCH_X86_32
2390    movd                 m4, r8m
2391    pshufb               m4, [base+pw_256]
2392%else
2393%if WIN64
2394    ALLOC_STACK        16*6, 16
2395%endif
2396    movd                m15, r8m
2397    pshufb              m15, [base+pw_256]
2398%endif
2399    cmp                  wd, 4
2400    jg .hv_w8
2401    movzx               mxd, mxb
2402    je .hv_w4
2403    movq                 m0, [base+subpel_filters+mxq*8]
2404    movzx               mxd, myb
2405    shr                 myd, 16
2406    cmp                  hd, 6
2407    cmovb               myd, mxd
2408    movq                 m3, [base+subpel_filters+myq*8]
2409    movddup              m6, [base+pd_8704]
2410    pshuflw              m0, m0, q2121
2411    pxor                 m7, m7
2412    punpcklbw            m7, m0
2413    punpcklbw            m3, m3
2414    psraw                m3, 8 ; sign-extend
2415    test          dword r8m, 0x800
2416    jz .hv_w2_10bpc
2417    movddup              m6, [base+pd_2560]
2418    psraw                m7, 2
2419    psllw                m3, 2
2420.hv_w2_10bpc:
2421%if ARCH_X86_32
2422    mov                dstq, dstmp
2423    mov                 dsq, dsmp
2424    mova                 m5, [base+spel_h_shuf2]
2425    ALLOC_STACK       -16*8
2426    pshufd               m0, m3, q0000
2427    pshufd               m1, m3, q1111
2428    pshufd               m2, m3, q2222
2429    pshufd               m3, m3, q3333
2430    mova                 m9, m5
2431    mova                m11, m0
2432    mova                m12, m1
2433    mova                m13, m2
2434    mova                m14, m3
2435    mova                m15, m4
2436%else
2437    mova                 m9, [base+spel_h_shuf2]
2438    pshufd              m11, m3, q0000
2439    pshufd              m12, m3, q1111
2440    pshufd              m13, m3, q2222
2441    pshufd              m14, m3, q3333
2442%endif
2443    lea                  r6, [ssq*3]
2444    sub                srcq, 2
2445    sub                srcq, r6
2446    movu                 m2, [srcq+ssq*0]
2447    movu                 m3, [srcq+ssq*1]
2448    movu                 m1, [srcq+ssq*2]
2449    add                srcq, r6
2450    movu                 m4, [srcq+ssq*0]
2451%if ARCH_X86_32
2452    REPX    {pshufb  x, m5}, m2, m3, m1, m4
2453%else
2454    REPX    {pshufb  x, m9}, m2, m3, m1, m4
2455%endif
2456    REPX    {pmaddwd x, m7}, m2, m3, m1, m4
2457    phaddd               m2, m3        ; 0 1
2458    phaddd               m1, m4        ; 2 3
2459    movu                 m3, [srcq+ssq*1]
2460    movu                 m4, [srcq+ssq*2]
2461    add                srcq, r6
2462    movu                 m0, [srcq+ssq*0]
2463%if ARCH_X86_32
2464    REPX    {pshufb  x, m5}, m3, m4, m0
2465%else
2466    REPX    {pshufb  x, m9}, m3, m4, m0
2467%endif
2468    REPX    {pmaddwd x, m7}, m3, m4, m0
2469    phaddd               m3, m4        ; 4 5
2470    phaddd               m0, m0        ; 6 6
2471    REPX    {paddd   x, m6}, m2, m1, m3, m0
2472    REPX    {psrad   x, 10}, m2, m1, m3, m0
2473    packssdw             m2, m1        ; 0 1 2 3
2474    packssdw             m3, m0        ; 4 5 6 _
2475    palignr              m4, m3, m2, 4 ; 1 2 3 4
2476    pshufd               m5, m3, q0321 ; 5 6 _ _
2477    punpcklwd            m1, m2, m4    ; 01 12
2478    punpckhwd            m2, m4        ; 23 34
2479    punpcklwd            m3, m5        ; 45 56
2480.hv_w2_loop:
2481    movu                 m4, [srcq+ssq*1]
2482    lea                srcq, [srcq+ssq*2]
2483    movu                 m5, [srcq+ssq*0]
2484    pshufb               m4, m9
2485    pshufb               m5, m9
2486    pmaddwd              m4, m7
2487    pmaddwd              m5, m7
2488    phaddd               m4, m5
2489    pmaddwd              m5, m11, m1   ; a0 b0
2490    mova                 m1, m2
2491    pmaddwd              m2, m12       ; a1 b1
2492    paddd                m5, m2
2493    mova                 m2, m3
2494    pmaddwd              m3, m13       ; a2 b2
2495    paddd                m5, m3
2496    paddd                m4, m6
2497    psrad                m4, 10        ; 7 8
2498    packssdw             m0, m4
2499    pshufd               m3, m0, q2103
2500    punpckhwd            m3, m0        ; 67 78
2501    mova                 m0, m4
2502    pmaddwd              m4, m14, m3   ; a3 b3
2503    paddd                m5, m4
2504    psrad                m5, 10
2505    packssdw             m5, m5
2506    pxor                 m4, m4
2507    pminsw               m5, m15
2508    pmaxsw               m5, m4
2509    movd       [dstq+dsq*0], m5
2510    pshuflw              m5, m5, q3232
2511    movd       [dstq+dsq*1], m5
2512    lea                dstq, [dstq+dsq*2]
2513    sub                  hd, 2
2514    jg .hv_w2_loop
2515    RET
2516.hv_w8:
2517    shr                 mxd, 16
2518.hv_w4:
2519    movq                 m2, [base+subpel_filters+mxq*8]
2520    movzx               mxd, myb
2521    shr                 myd, 16
2522    cmp                  hd, 6
2523    cmovb               myd, mxd
2524    movq                 m3, [base+subpel_filters+myq*8]
2525%if ARCH_X86_32
2526    RESET_STACK_STATE
2527    mov                dstq, dstmp
2528    mov                 dsq, dsmp
2529    mova                 m0, [base+spel_h_shufA]
2530    mova                 m1, [base+spel_h_shufB]
2531    mova                 m6, [base+pd_512]
2532    ALLOC_STACK      -16*15
2533    mova                 m8, m0
2534    mova                 m9, m1
2535    mova                m14, m6
2536%else
2537    mova                 m8, [base+spel_h_shufA]
2538    mova                 m9, [base+spel_h_shufB]
2539%endif
2540    pxor                 m0, m0
2541    punpcklbw            m0, m2
2542    punpcklbw            m3, m3
2543    psraw                m3, 8
2544    test          dword r8m, 0x800
2545    jz .hv_w4_10bpc
2546    psraw                m0, 2
2547    psllw                m3, 2
2548.hv_w4_10bpc:
2549    lea                  r6, [ssq*3]
2550    sub                srcq, 6
2551    sub                srcq, r6
2552%if ARCH_X86_32
2553    %define tmp esp+16*8
2554    shl                  wd, 14
2555%if STACK_ALIGNMENT < 16
2556    mov          [esp+4*61], srcq
2557    mov          [esp+4*62], dstq
2558%else
2559    mov               srcmp, srcq
2560%endif
2561    mova         [tmp+16*5], m4
2562    lea                  wd, [wq+hq-(1<<16)]
2563    pshufd               m1, m0, q0000
2564    pshufd               m2, m0, q1111
2565    pshufd               m5, m0, q2222
2566    pshufd               m0, m0, q3333
2567    mova                m10, m1
2568    mova                m11, m2
2569    mova                m12, m5
2570    mova                m13, m0
2571%else
2572%if WIN64
2573    %define tmp rsp
2574%else
2575    %define tmp rsp-104 ; red zone
2576%endif
2577    shl                  wd, 6
2578    mov                  r7, srcq
2579    mov                  r8, dstq
2580    lea                  wd, [wq+hq-(1<<8)]
2581    pshufd              m10, m0, q0000
2582    pshufd              m11, m0, q1111
2583    pshufd              m12, m0, q2222
2584    pshufd              m13, m0, q3333
2585    mova         [tmp+16*5], m15
2586%endif
2587    pshufd               m0, m3, q0000
2588    pshufd               m1, m3, q1111
2589    pshufd               m2, m3, q2222
2590    pshufd               m3, m3, q3333
2591    mova         [tmp+16*1], m0
2592    mova         [tmp+16*2], m1
2593    mova         [tmp+16*3], m2
2594    mova         [tmp+16*4], m3
; 8-tap horizontal filter for one 4-pixel row of the 8-tap HV path.
; %1 = dst/src+0 and %2 = src+8 (xmm register NUMBERS, not registers:
;      operands are written m%1 etc.), %3 = scratch register number,
; %4 = right-shift applied after rounding,
; %5 = rounding constant, defaulting to m14 (pd_512, loaded by the
;      caller; reloaded each .hv_w4_loop0 iteration on x86-64).
; Implicit register contract: m8/m9 hold spel_h_shufA/spel_h_shufB and
; m10-m13 hold the four horizontal tap pairs (set up in .hv_w4).
%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
    pshufb              m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
    pshufb              m%1, m9      ; 2 3 3 4 4 5 5 6
    pmaddwd             m%3, m10     ; taps 0-1
    pmaddwd             m%1, m11     ; taps 2-3
    paddd               m%3, %5      ; + rounding
    paddd               m%1, m%3
    pshufb              m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
    pshufb              m%2, m9      ; 6 7 7 8 8 9 9 a
    pmaddwd             m%3, m12     ; taps 4-5
    pmaddwd             m%2, m13     ; taps 6-7
    paddd               m%1, m%3
    paddd               m%1, m%2     ; sum of all four tap-pair products
    psrad               m%1, %4
%endmacro
2610.hv_w4_loop0:
2611%if ARCH_X86_64
2612    mova                m14, [pd_512]
2613%endif
2614    movu                 m4, [srcq+ssq*0+0]
2615    movu                 m1, [srcq+ssq*0+8]
2616    movu                 m5, [srcq+ssq*1+0]
2617    movu                 m2, [srcq+ssq*1+8]
2618    movu                 m6, [srcq+ssq*2+0]
2619    movu                 m3, [srcq+ssq*2+8]
2620    add                srcq, r6
2621    PUT_8TAP_HV_H         4, 1, 0, 10
2622    PUT_8TAP_HV_H         5, 2, 0, 10
2623    PUT_8TAP_HV_H         6, 3, 0, 10
2624    movu                 m7, [srcq+ssq*0+0]
2625    movu                 m2, [srcq+ssq*0+8]
2626    movu                 m1, [srcq+ssq*1+0]
2627    movu                 m3, [srcq+ssq*1+8]
2628    PUT_8TAP_HV_H         7, 2, 0, 10
2629    PUT_8TAP_HV_H         1, 3, 0, 10
2630    movu                 m2, [srcq+ssq*2+0]
2631    movu                 m3, [srcq+ssq*2+8]
2632    add                srcq, r6
2633    PUT_8TAP_HV_H         2, 3, 0, 10
2634    packssdw             m4, m7      ; 0 3
2635    packssdw             m5, m1      ; 1 4
2636    movu                 m0, [srcq+ssq*0+0]
2637    movu                 m1, [srcq+ssq*0+8]
2638    PUT_8TAP_HV_H         0, 1, 3, 10
2639    packssdw             m6, m2      ; 2 5
2640    packssdw             m7, m0      ; 3 6
2641    punpcklwd            m1, m4, m5  ; 01
2642    punpckhwd            m4, m5      ; 34
2643    punpcklwd            m2, m5, m6  ; 12
2644    punpckhwd            m5, m6      ; 45
2645    punpcklwd            m3, m6, m7  ; 23
2646    punpckhwd            m6, m7      ; 56
2647%if ARCH_X86_32
2648    jmp .hv_w4_loop_start
2649.hv_w4_loop:
2650    mova                 m1, [tmp+16*6]
2651    mova                 m2, m15
2652.hv_w4_loop_start:
2653    mova                 m7, [tmp+16*1]
2654    pmaddwd              m1, m7      ; a0
2655    pmaddwd              m2, m7      ; b0
2656    mova                 m7, [tmp+16*2]
2657    mova         [tmp+16*6], m3
2658    pmaddwd              m3, m7      ; a1
2659    mova                m15, m4
2660    pmaddwd              m4, m7      ; b1
2661    mova                 m7, [tmp+16*3]
2662    paddd                m1, m3
2663    paddd                m2, m4
2664    mova                 m3, m5
2665    pmaddwd              m5, m7      ; a2
2666    mova                 m4, m6
2667    pmaddwd              m6, m7      ; b2
2668    paddd                m1, m5
2669    paddd                m2, m6
2670    movu                 m7, [srcq+ssq*1+0]
2671    movu                 m5, [srcq+ssq*1+8]
2672    lea                srcq, [srcq+ssq*2]
2673    PUT_8TAP_HV_H         7, 5, 6, 10
2674    packssdw             m0, m7      ; 6 7
2675    mova         [tmp+16*0], m0
2676    movu                 m0, [srcq+ssq*0+0]
2677    movu                 m5, [srcq+ssq*0+8]
2678    PUT_8TAP_HV_H         0, 5, 6, 10
2679    mova                 m6, [tmp+16*0]
2680    packssdw             m7, m0      ; 7 8
2681    punpcklwd            m5, m6, m7  ; 67
2682    punpckhwd            m6, m7      ; 78
2683    pmaddwd              m7, m5, [tmp+16*4]
2684    paddd                m1, m7      ; a3
2685    pmaddwd              m7, m6, [tmp+16*4]
2686    paddd                m2, m7      ; b3
2687    psrad                m1, 9
2688    psrad                m2, 9
2689    packssdw             m1, m2
2690    pxor                 m7, m7
2691    pmaxsw               m1, m7
2692    pavgw                m7, m1
2693    pminsw               m7, [tmp+16*5]
2694    movq       [dstq+dsq*0], m7
2695    movhps     [dstq+dsq*1], m7
2696    lea                dstq, [dstq+dsq*2]
2697    sub                  hd, 2
2698    jg .hv_w4_loop
2699%if STACK_ALIGNMENT < 16
2700    mov                srcq, [esp+4*61]
2701    mov                dstq, [esp+4*62]
2702    add                srcq, 8
2703    add                dstq, 8
2704    mov          [esp+4*61], srcq
2705    mov          [esp+4*62], dstq
2706%else
2707    mov                srcq, srcmp
2708    mov                dstq, dstmp
2709    add                srcq, 8
2710    add                dstq, 8
2711    mov               srcmp, srcq
2712    mov               dstmp, dstq
2713%endif
2714    movzx                hd, ww
2715    sub                  wd, 1<<16
2716%else
2717.hv_w4_loop:
2718    mova                m15, [tmp+16*1]
2719    pmaddwd             m14, m15, m1 ; a0
2720    pmaddwd             m15, m2      ; b0
2721    mova                 m7, [tmp+16*2]
2722    mova                 m1, m3
2723    pmaddwd              m3, m7      ; a1
2724    mova                 m2, m4
2725    pmaddwd              m4, m7      ; b1
2726    mova                 m7, [tmp+16*3]
2727    paddd               m14, m3
2728    paddd               m15, m4
2729    mova                 m3, m5
2730    pmaddwd              m5, m7      ; a2
2731    mova                 m4, m6
2732    pmaddwd              m6, m7      ; b2
2733    paddd               m14, m5
2734    paddd               m15, m6
2735    movu                 m7, [srcq+ssq*1+0]
2736    movu                 m5, [srcq+ssq*1+8]
2737    lea                srcq, [srcq+ssq*2]
2738    PUT_8TAP_HV_H         7, 5, 6, 10, [pd_512]
2739    packssdw             m0, m7      ; 6 7
2740    mova         [tmp+16*0], m0
2741    movu                 m0, [srcq+ssq*0+0]
2742    movu                 m5, [srcq+ssq*0+8]
2743    PUT_8TAP_HV_H         0, 5, 6, 10, [pd_512]
2744    mova                 m6, [tmp+16*0]
2745    packssdw             m7, m0      ; 7 8
2746    punpcklwd            m5, m6, m7  ; 67
2747    punpckhwd            m6, m7      ; 78
2748    pmaddwd              m7, m5, [tmp+16*4]
2749    paddd               m14, m7      ; a3
2750    pmaddwd              m7, m6, [tmp+16*4]
2751    paddd               m15, m7      ; b3
2752    psrad               m14, 9
2753    psrad               m15, 9
2754    packssdw            m14, m15
2755    pxor                 m7, m7
2756    pmaxsw              m14, m7
2757    pavgw                m7, m14
2758    pminsw               m7, [tmp+16*5]
2759    movq       [dstq+dsq*0], m7
2760    movhps     [dstq+dsq*1], m7
2761    lea                dstq, [dstq+dsq*2]
2762    sub                  hd, 2
2763    jg .hv_w4_loop
2764    add                  r7, 8
2765    add                  r8, 8
2766    movzx                hd, wb
2767    mov                srcq, r7
2768    mov                dstq, r8
2769    sub                  wd, 1<<8
2770%endif
2771    jg .hv_w4_loop0
2772    RET
2773%undef tmp
2774
; Per-ABI temporary register assignment for the prep functions.
; t0d/t1d hold the filter-type offsets that are added to mx/my in the
; prologue below (see the "6tap_h, mx, 4tap_h" composition comments).
2775%if ARCH_X86_32
2776DECLARE_REG_TMP 2, 1, 6, 4
2777%elif WIN64
2778DECLARE_REG_TMP 6, 4, 7, 4
2779%else
2780DECLARE_REG_TMP 6, 7, 7, 8
2781%endif
2782
2783%define PREP_8TAP_FN FN prep_8tap,
; Entry points for the filter combinations routed to the 6-tap kernel
; below; the last (no 4th argument) defines the fall-through entry.
2784PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
2785PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
2786PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
2787PREP_8TAP_FN regular,        REGULAR, REGULAR
2788
;-----------------------------------------------------------------------
; prep_6tap_16bpc(tmp, src, ss, w, h, mx, my)
; Motion-compensation "prep" (intermediate int16 output) for high
; bitdepth pixels using 6-tap (and shorter) subpel filters.
; Dispatch on the subpel fractions:
;   .prep - mx == my == 0: plain copy/scale via a jump table
;   .h    - horizontal filtering only
;   .v    - vertical filtering only
;   .hv   - separable 2-D filtering
; bitdepth_max is read from r7m; bit 11 (0x800) distinguishes 12bpc
; from 10bpc and selects the coefficient/rounding scaling below.
;-----------------------------------------------------------------------
2789cglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
2790    %define            base  t2-prep_ssse3
2791%if ARCH_X86_32
2792    %define             mxb  r0b
2793    %define             mxd  r0
2794    %define             mxq  r0
2795    %define             myb  r2b
2796    %define             myd  r2
2797    %define             myq  r2
2798%endif
; Replicate the filter-type index into three bytes, then add the
; per-type offsets so mxd/myd hold (type, fraction, 4tap-type) bytes.
2799    imul                mxd, mxm, 0x010101
2800    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
2801    imul                myd, mym, 0x010101
2802    add                 myd, t1d ; 6tap_v, my, 4tap_v
2803    LEA                  t2, prep_ssse3
2804    movifnidn            wd, wm
2805    movifnidn            hd, hm
2806    movifnidn          srcq, srcmp
2807    test                mxd, 0xf00
2808    jnz .h
2809    test                myd, 0xf00
2810    jnz .v
2811.prep:
; No subpel filtering: jump-table copy/scale path, indexed by log2(w),
; with a multiplier chosen by bitdepth (prep_mul[bitdepth_max >> 11]).
2812    tzcnt                wd, wd
2813    mov                 myd, r7m ; bitdepth_max
2814    movzx                wd, word [base+prep_ssse3_table+wq*2]
2815    mova                 m5, [base+pw_8192]
2816    shr                 myd, 11
2817    add                  wq, t2
2818    movddup              m4, [base+prep_mul+myq*8]
2819    movifnidn           ssq, ssmp
2820    movifnidn          tmpq, tmpmp
2821    lea                  r6, [ssq*3]
2822%if WIN64
2823    pop                  r7
2824%endif
2825    jmp                  wq
2826.h:
; Horizontal-only filtering. w == 4 tail-calls into the shared 8-tap
; implementation; w >= 8 is handled by the loop below.
2827    RESET_STACK_STATE
2828    test                myd, 0xf00
2829    jnz .hv
2830    movifnidn           ssq, r2mp
2831    movddup              m5, [base+prep_8tap_1d_rnd]
2832    cmp                  wd, 4
2833    je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4
2834    WIN64_SPILL_XMM      10
2835    shr                 mxd, 16
2836    movq                 m2, [base+subpel_filters+1+mxq*8]
2837    movifnidn          tmpq, r0mp
2838    mova                 m4, [base+spel_h_shufA]
2839    add                  wd, wd
2840    mova                 m6, [base+spel_h_shufB]
; src/tmp are advanced to row end so the inner loop can run a negative
; offset (r6) up to zero.
2841    add                srcq, wq
2842    punpcklbw            m2, m2
2843    add                tmpq, wq
2844    psraw                m2, 8
2845    neg                  wq
2846    test          dword r7m, 0x800
2847    jnz .h_w8_12bpc
2848    psllw                m2, 2 ; 10bpc only: scale coefficients up by 4
2849.h_w8_12bpc:
2850    pshufd               m7, m2, q0000
2851%if ARCH_X86_32
; x86-32 has only 8 xmm regs; keep coefficient pairs 1-2 in stack slots.
2852    ALLOC_STACK       -16*2
2853    %define              m8  [rsp+16*0]
2854    %define              m9  [rsp+16*1]
2855    pshufd               m0, m2, q1111
2856    pshufd               m1, m2, q2222
2857    mova                 m8, m0
2858    mova                 m9, m1
2859%else
2860    pshufd               m8, m2, q1111
2861    pshufd               m9, m2, q2222
2862%endif
2863.h_w8_loop0:
2864    mov                  r6, wq
2865.h_w8_loop:
; Filter 8 output pixels per iteration: abcd from the low half, efgh
; from the high half, sharing the overlapping middle via shufpd.
2866    movu                 m3, [srcq+r6-4]
2867    movu                 m2, [srcq+r6+8]
2868    pshufb               m0, m3, m4  ; 01 12 23 34
2869    pmaddwd              m0, m7      ; abcd0
2870    pshufb               m3, m6      ; 23 34 45 56
2871    pmaddwd              m1, m8, m3  ; abcd1
2872    paddd                m0, m1
2873    pshufb               m1, m2, m4  ; 67 78 89 9a
2874    shufpd               m3, m1, 0x01; 45 56 67 78
2875    pmaddwd              m1, m8      ; efgh1
2876    pshufb               m2, m6      ; 89 9a ab bc
2877    pmaddwd              m2, m9      ; efgh2
2878    paddd                m1, m2
2879    pmaddwd              m2, m9 , m3 ; abcd2
2880    pmaddwd              m3, m7      ; efgh0
2881    paddd                m0, m5
2882    paddd                m1, m5
2883    paddd                m0, m2
2884    paddd                m1, m3
2885    psrad                m0, 4
2886    psrad                m1, 4
2887    packssdw             m0, m1
2888    mova          [tmpq+r6], m0
2889    add                  r6, 16
2890    jl .h_w8_loop
2891    add                srcq, ssq
2892    sub                tmpq, wq
2893    dec                  hd
2894    jg .h_w8_loop0
2895    RET
2896.v:
; Vertical-only filtering. For h < 6 use the 4-tap filter index kept in
; the low byte of myd (see the t1d composition in the prologue).
2897    movzx               mxd, myb
2898    shr                 myd, 16
2899    cmp                  hd, 6
2900    cmovb               myd, mxd
2901    movddup              m5, [base+prep_8tap_1d_rnd]
2902    movq                 m2, [base+subpel_filters+1+myq*8]
2903    WIN64_SPILL_XMM      11, 16
2904    movifnidn           ssq, r2mp
2905    movifnidn          tmpq, r0mp
2906    punpcklbw            m2, m2
; Step back one row here and a second time after the bitdepth check:
; the 6-tap vertical footprint needs two rows above the output row.
2907    sub                srcq, ssq
2908    psraw                m2, 8 ; sign-extend
2909    test          dword r7m, 0x800
2910    jnz .v_12bpc
2911    psllw                m2, 2 ; 10bpc only: scale coefficients up by 4
2912.v_12bpc:
2913    sub                srcq, ssq
2914%if ARCH_X86_32
; x86-32: spill the three coefficient pairs; pack the outer-loop
; counter into r6d (high word = remaining 8-pixel columns, low = h).
2915    ALLOC_STACK       -16*4
2916    pshufd               m0, m2, q0000
2917    mov                 r6d, wd
2918    pshufd               m1, m2, q1111
2919    shl                 r6d, 14
2920    pshufd               m2, m2, q2222
2921    lea                 r6d, [r6+hq-(1<<16)]
2922    mova                 m8, m0
2923    mova                 m9, m1
2924    mova                m10, m2
2925%if STACK_ALIGNMENT < 16
2926    %define           srcmp  [esp+16*3+4*0]
2927    %define           tmpmp  [esp+16*3+4*1]
2928%endif
2929.v_w4_loop0:
2930    mov               srcmp, srcq
2931    mov               tmpmp, tmpq
2932%else
2933    pshufd               m8, m2, q0000
2934    and                  wd, -8
2935    jnz .v_w8
2936    pshufd               m9, m2, q1111
2937    pshufd              m10, m2, q2222
2938%endif
; Prime the 5-row sliding window of interleaved row pairs (01..34).
2939    movq                 m1, [srcq+ssq*0]
2940    movq                 m2, [srcq+ssq*1]
2941    lea                srcq, [srcq+ssq*2]
2942    movq                 m3, [srcq+ssq*0]
2943    movq                 m4, [srcq+ssq*1]
2944    lea                srcq, [srcq+ssq*2]
2945    movq                 m0, [srcq+ssq*0]
2946    punpcklwd            m1, m2      ; 01
2947    punpcklwd            m2, m3      ; 12
2948    punpcklwd            m3, m4      ; 23
2949    punpcklwd            m4, m0      ; 34
2950.v_w4_loop:
; Two output rows (a and b) per iteration, 3 coefficient pairs each.
2951    pmaddwd              m6, m8, m1  ; a0
2952    pmaddwd              m7, m8, m2  ; b0
2953    mova                 m1, m3
2954    pmaddwd              m3, m9      ; a1
2955    mova                 m2, m4
2956    pmaddwd              m4, m9      ; b1
2957    paddd                m6, m3
2958    movq                 m3, [srcq+ssq*0]
2959    paddd                m7, m4
2960    movq                 m4, [srcq+ssq*1]
2961    lea                srcq, [srcq+ssq*2]
2962    movq                 m0, [srcq+ssq*0]
2963    punpcklwd            m3, m4      ; 45
2964    punpcklwd            m4, m0      ; 56
2965    pmaddwd              m0, m10, m3 ; a2
2966    paddd                m6, m5
2967    paddd                m6, m0
2968    pmaddwd              m0, m10, m4 ; b2
2969    paddd                m7, m5
2970    paddd                m7, m0
2971    psrad                m6, 4
2972    psrad                m7, 4
2973    packssdw             m6, m7
2974%if ARCH_X86_32
2975    movq        [tmpq+wq*0], m6
2976    movhps      [tmpq+wq*2], m6
2977    lea                tmpq, [tmpq+wq*4]
2978    sub                  hd, 2
2979    jg .v_w4_loop
; Advance to the next 4-pixel column; counters packed in r6d.
2980    mov                srcq, srcmp
2981    mov                tmpq, tmpmp
2982    movzx                hd, r6w
2983    add                srcq, 8
2984    add                tmpq, 8
2985    sub                 r6d, 1<<16
2986    jg .v_w4_loop0
2987    RET
2988%else
2989    mova             [tmpq], m6
2990    add                tmpq, 16
2991    sub                  hd, 2
2992    jg .v_w4_loop
2993    RET
2994.v_w8:
; 8-pixel columns: spill coefficient pair 0 to the stack arg slot (r6m)
; and pack the outer-loop counter into r6d (low byte = h).
2995    mova                r6m, m8
2996    lea                 r6d, [wq*4-(1<<5)]
2997    pshufd               m6, m2, q1111
2998    lea                 r6d, [hq+r6*8]
2999    pshufd               m7, m2, q2222
3000    WIN64_PUSH_XMM       16
3001.v_w8_loop0:
; Prime the window; low/high dword interleaves kept in m8-m15.
3002    movu                 m9, [srcq+ssq*0]
3003    lea                  r5, [srcq+ssq*2]
3004    movu                m11, [srcq+ssq*1]
3005    mov                  r7, tmpq
3006    movu                m13, [r5+ssq*0]
3007    movu                m15, [r5+ssq*1]
3008    lea                  r5, [r5+ssq*2]
3009    movu                 m4, [r5+ssq*0]
3010    punpcklwd            m8, m9, m11  ; 01
3011    punpckhwd            m9, m11
3012    punpcklwd           m10, m11, m13 ; 12
3013    punpckhwd           m11, m13
3014    punpcklwd           m12, m13, m15 ; 23
3015    punpckhwd           m13, m15
3016    punpcklwd           m14, m15, m4  ; 34
3017    punpckhwd           m15, m4
3018.v_w8_loop:
3019    mova                 m3, r6m
3020    pmaddwd              m0, m8, m3   ; a0
3021    pmaddwd              m2, m9, m3   ; a0'
3022    pmaddwd              m1, m10, m3  ; b0
3023    pmaddwd              m3, m11      ; b0'
3024    mova                 m8, m12
3025    pmaddwd             m12, m6       ; a1
3026    mova                 m9, m13
3027    pmaddwd             m13, m6       ; a1'
3028    mova                m10, m14
3029    pmaddwd             m14, m6       ; b1
3030    mova                m11, m15
3031    pmaddwd             m15, m6       ; b1'
3032    paddd                m0, m12
3033    paddd                m2, m13
3034    movu                m13, [r5+ssq*0]
3035    paddd                m1, m14
3036    paddd                m3, m15
3037    movu                m15, [r5+ssq*1]
3038    lea                  r5, [r5+ssq*2]
3039    movu                 m4, [r5+ssq*0]
3040    REPX      {paddd x, m5}, m0, m2, m1, m3
3041    punpcklwd           m12, m13, m15 ; 45
3042    punpckhwd           m13, m15
3043    punpcklwd           m14, m15, m4  ; 56
3044    punpckhwd           m15, m4
3045    pmaddwd              m4, m7, m12  ; a2
3046    paddd                m0, m4
3047    pmaddwd              m4, m7, m13  ; a2'
3048    paddd                m2, m4
3049    pmaddwd              m4, m7, m14  ; b2
3050    paddd                m1, m4
3051    pmaddwd              m4, m7, m15  ; b2'
3052    paddd                m3, m4
3053    REPX       {psrad x, 4}, m0, m2, m1, m3
3054    packssdw             m0, m2
3055    packssdw             m1, m3
3056    mova          [r7+wq*0], m0
3057    mova          [r7+wq*2], m1
3058    lea                  r7, [r7+wq*4]
3059    sub                  hd, 2
3060    jg .v_w8_loop
3061    add                srcq, 16
3062    add                tmpq, 16
3063    movzx                hd, r6b
3064    sub                 r6d, 1<<8
3065    jg .v_w8_loop0
3066    RET
3067%endif
3068.hv:
; Separable 2-D filtering; this path handles w == 4 (w >= 8 jumps to
; .hv_w8). The pshuflw below keeps only the middle coefficient pairs
; (words 1-2) of the horizontal filter, i.e. a 4-tap horizontal pass.
3069    and                  wd, -8
3070    jnz .hv_w8
3071    movzx               mxd, mxb
3072    movq                 m0, [base+subpel_filters+mxq*8]
3073    movzx               mxd, myb
3074    shr                 myd, 16
3075    cmp                  hd, 6
3076    cmovb               myd, mxd
3077    movq                 m2, [base+subpel_filters+1+myq*8]
3078    WIN64_SPILL_XMM      15
3079    movifnidn           ssq, r2mp
3080    movifnidn          tmpq, r0mp
3081    mova                 m7, [base+prep_8tap_2d_rnd]
3082    sub                srcq, 2
3083    pshuflw              m0, m0, q2121
3084    pxor                 m6, m6
3085    punpcklbw            m6, m0
3086    punpcklbw            m2, m2
3087    psraw                m6, 4
3088    psraw                m2, 8
3089    test          dword r7m, 0x800
3090    jz .hv_w4_10bpc
3091    psraw                m6, 2 ; 12bpc: reduce horizontal coef precision
3092.hv_w4_10bpc:
3093%if ARCH_X86_32
; x86-32: vertical coeffs in m8-m10, horizontal coeffs in m12-m13
; (stack slots); shuffle masks are re-read from memory via m14/m11.
3094%assign regs_used 4
3095    ALLOC_STACK       -16*7
3096%assign regs_used 7
3097    %define             m10  [esp+16*3]
3098    %define             m12  [esp+16*5]
3099    %define             m13  [esp+16*6]
3100    %define             m14  [base+spel_h_shufA]
3101    %define             m11  [base+spel_h_shufB]
3102    pshufd               m0, m2, q0000
3103    pshufd               m1, m2, q1111
3104    pshufd               m2, m2, q2222
3105    pshufd               m5, m6, q0000
3106    pshufd               m6, m6, q1111
3107    mova                 m8, m0
3108    mova                 m9, m1
3109    mova                m10, m2
3110    mova                m12, m5
3111    mova                m13, m6
; Temporarily negate the stride to address the two rows above src.
3112    neg                 ssq
3113    movu                 m3, [srcq+ssq*2]
3114    movu                 m4, [srcq+ssq*1]
3115    neg                 ssq
3116%else
3117    mov                  r6, ssq
3118    pshufd               m8, m2, q0000
3119    neg                  r6
3120    pshufd               m9, m2, q1111
3121    movu                 m3, [srcq+r6 *2]
3122    pshufd              m10, m2, q2222
3123    movu                 m4, [srcq+r6 *1]
3124    pshufd              m12, m6, q0000
3125    mova                m14, [base+spel_h_shufA]
3126    pshufd              m13, m6, q1111
3127    mova                m11, [base+spel_h_shufB]
3128%endif
; Horizontal-filter the first 5 rows, then build the vertical window.
3129    movu                 m1, [srcq+ssq*0]
3130    movu                 m0, [srcq+ssq*1]
3131    lea                srcq, [srcq+ssq*2]
3132    movu                 m2, [srcq+ssq*0]
3133    HV_H_W4_6TAP         m3, m3, m5, m11
3134    HV_H_W4_6TAP         m4, m4, m5, m11
3135    HV_H_W4_6TAP         m5, m1, m5, m11
3136    HV_H_W4_6TAP         m0, m0, m1, m11
3137    HV_H_W4_6TAP         m2, m2, m1, m11
3138    REPX       {psrad x, 6}, m3, m5, m4, m0, m2
3139    packssdw             m3, m5      ; 0 2
3140    packssdw             m4, m0      ; 1 3
3141    packssdw             m5, m2      ; 2 4
3142    punpcklwd            m1, m3, m4  ; 01
3143    punpckhwd            m3, m4      ; 23
3144    punpcklwd            m2, m4, m5  ; 12
3145    punpckhwd            m4, m5      ; 34
3146.hv_w4_loop:
; Two output rows per iteration; horizontal-filter two fresh rows and
; slide the vertical window.
3147    movu                 m0, [srcq+ssq*1]
3148    pmaddwd              m5, m8, m1  ; a0
3149    lea                srcq, [srcq+ssq*2]
3150    pmaddwd              m6, m8, m2  ; b0
3151    mova                 m1, m3
3152    pmaddwd              m3, m9      ; a1
3153    mova                 m2, m4
3154    pmaddwd              m4, m9      ; b1
3155    paddd                m5, m3
3156    movu                 m3, [srcq+ssq*0]
3157    paddd                m6, m4
3158    HV_H_W4_6TAP         m0, m0, m4, m11
3159    HV_H_W4_6TAP         m3, m3, m4, m11
3160    psrad                m4, m2, 16
3161    psrad                m0, 6
3162    psrad                m3, 6
3163    packssdw             m4, m0      ; 4 5
3164    packssdw             m0, m3      ; 5 6
3165    punpcklwd            m3, m4, m0  ; 45
3166    punpckhwd            m4, m0      ; 56
3167    pmaddwd              m0, m10, m3 ; a2
3168    paddd                m5, m7
3169    paddd                m5, m0
3170    pmaddwd              m0, m10, m4 ; b2
3171    paddd                m6, m7
3172    paddd                m6, m0
3173    psrad                m5, 6
3174    psrad                m6, 6
3175    packssdw             m5, m6
3176    mova             [tmpq], m5
3177    add                tmpq, 16
3178    sub                  hd, 2
3179    jg .hv_w4_loop
3180    RET
3181.hv_w8:
; 2-D filtering for w >= 8 with full 6-tap filters in both directions.
3182    RESET_STACK_STATE
3183    shr                 mxd, 16
3184    movq                 m2, [base+subpel_filters+1+mxq*8]
3185    movzx               mxd, myb
3186    shr                 myd, 16
3187    cmp                  hd, 6
3188    cmovb               myd, mxd
3189    movq                 m1, [base+subpel_filters+1+myq*8]
3190    movifnidn           ssq, r2mp
3191    mova                 m4, [base+prep_8tap_2d_rnd]
3192    pxor                 m0, m0
3193    punpcklbw            m0, m2
3194    punpcklbw            m1, m1
3195    sub                srcq, 4
3196    psraw                m0, 4
3197    psraw                m1, 8
3198    test          dword r7m, 0x800
3199    jz .hv_w8_10bpc
3200    psraw                m0, 2 ; 12bpc: reduce horizontal coef precision
3201.hv_w8_10bpc:
3202%if ARCH_X86_32
3203%assign regs_used 1
3204    ALLOC_STACK       -16*9
3205%assign regs_used 7
3206    mov                tmpq, r0mp
3207    mova         [rsp+16*7], m4 ; rounding constant spilled for reuse
3208%else
3209%if WIN64
3210    PUSH                 r8
3211%assign regs_used 9
3212%endif
3213    ALLOC_STACK        16*6, 16
3214%endif
; Stack layout: [rsp+16*0..2] = horizontal coefficient pairs (from m0),
; [rsp+16*3..5] = vertical coefficient pairs (from m1).
3215    pshufd               m2, m0, q0000
3216    mova         [rsp+16*0], m2
3217    pshufd               m2, m0, q1111
3218    mova         [rsp+16*1], m2
3219    pshufd               m0, m0, q2222
3220    mova         [rsp+16*2], m0
3221    pshufd               m2, m1, q0000
3222    mova         [rsp+16*3], m2
3223    pshufd               m2, m1, q1111
3224    mova         [rsp+16*4], m2
3225    pshufd               m1, m1, q2222
3226    mova         [rsp+16*5], m1
3227    mov                  r6, ssq
3228    neg                  r6 ; r6 = -stride, to reach the two rows above
3229%if ARCH_X86_32
; Outer-loop counter packed in r5d (high word = columns, low = h).
3230    mov                 r5d, wd
3231    shl                 r5d, 14
3232    lea                 r5d, [r5+hq-(1<<16)]
3233%if STACK_ALIGNMENT < 16
3234    %define           srcmp  [esp+16*8+4*0]
3235    %define           tmpmp  [esp+16*8+4*1]
3236%endif
3237.hv_w8_loop0:
3238    mov               srcmp, srcq
3239    mov               tmpmp, tmpq
; Horizontal-filter the first 5 rows of this column.
3240    movu                 m5, [srcq+r6*2+0]
3241    movu                 m6, [srcq+r6*2+2]
3242    mova                 m7, [rsp+16*0]
3243    mova                 m1, [rsp+16*1]
3244    mova                 m0, [rsp+16*2]
3245    HV_H_6TAP            m2, m5, m6, m7, m1, m0
3246    movu                 m5, [srcq+r6*1+0]
3247    movu                 m6, [srcq+r6*1+2]
3248    HV_H_6TAP            m3, m5, m6, m7, m1, m0
3249    movu                 m5, [srcq+ssq*0+0]
3250    movu                 m6, [srcq+ssq*0+2]
3251    HV_H_6TAP            m4, m5, m6, m7, m1, m0
3252    movu                 m5, [srcq+ssq*1+0]
3253    movu                 m6, [srcq+ssq*1+2]
3254    lea                srcq, [srcq+ssq*2]
3255    HV_H_6TAP            m0, m5, m6, m7, m1
3256    movu                 m5, [srcq+ssq*0+0]
3257    movu                 m6, [srcq+ssq*0+2]
3258    HV_H_6TAP            m1, m5, m6, m7
3259    mova                 m5, [rsp+16*7]
3260    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
3261    REPX      {psrad x, 6 }, m2, m4, m3, m0, m1
3262    packssdw             m2, m4     ; 0 2
3263    packssdw             m3, m0     ; 1 3
3264    packssdw             m4, m1     ; 2 4
3265    punpcklwd            m0, m2, m3 ; 01
3266    punpckhwd            m2, m3     ; 23
3267    punpcklwd            m1, m3, m4 ; 12
3268    punpckhwd            m3, m4     ; 34
3269.hv_w8_loop:
3270    mova                 m5, [rsp+16*3]
3271    mova                 m6, [rsp+16*4]
3272    pmaddwd              m4, m0, m5 ; a0
3273    pmaddwd              m5, m1     ; b0
3274    mova                 m0, m2
3275    pmaddwd              m2, m6     ; a1
3276    mova                 m1, m3
3277    pmaddwd              m3, m6     ; b1
3278    paddd                m4, m2
3279    movu                 m2, [srcq+ssq*1+0]
3280    paddd                m5, m3
3281    movu                 m3, [srcq+ssq*1+2]
3282    lea                srcq, [srcq+ssq*2]
3283    HV_H_6TAP            m6, m2, m3
3284    movu                 m2, [srcq+ssq*0+0]
3285    movu                 m3, [srcq+ssq*0+2]
3286    HV_H_6TAP            m7, m2, m3
3287    mova                 m2, [rsp+16*7]
3288    psrad                m3, m1, 16
3289    REPX      {paddd x, m2}, m6, m7, m4, m5
3290    psrad                m6, 6
3291    psrad                m7, 6
3292    packssdw             m3, m6     ; 4 5
3293    packssdw             m6, m7     ; 5 6
3294    mova                 m7, [rsp+16*5]
3295    punpcklwd            m2, m3, m6 ; 45
3296    punpckhwd            m3, m6     ; 56
3297    pmaddwd              m6, m2, m7 ; a2
3298    pmaddwd              m7, m3     ; b2
3299    paddd                m4, m6
3300    paddd                m5, m7
3301    psrad                m4, 6
3302    psrad                m5, 6
3303    packssdw             m4, m5
3304    movq        [tmpq+wq*0], m4
3305    movhps      [tmpq+wq*2], m4
3306    lea                tmpq, [tmpq+wq*4]
3307    sub                  hd, 2
3308    jg .hv_w8_loop
3309    mov                srcq, srcmp
3310    mov                tmpq, tmpmp
3311    movzx                hd, r5w
3312    add                srcq, 8
3313    add                tmpq, 8
3314    sub                 r5d, 1<<16
3315%else
; x86-64: outer-loop counter packed in r8d (low byte = h).
3316    lea                 r8d, [wq*4-(1<<5)]
3317    lea                 r8d, [hq+r8*8]
3318.hv_w8_loop0:
; Horizontal-filter the first 5 rows; results land in m9/m11/m13/m15/m5.
3319    mova                 m5, [spel_h_shufA]
3320    movu                 m0, [srcq+r6*2+ 0]
3321    mova                 m6, [rsp+16*0]
3322    movu                 m1, [srcq+r6*2+ 8]
3323    mova                 m7, [rsp+16*1]
3324    movu                 m2, [srcq+r6*2+16]
3325    mova                 m8, [rsp+16*2]
3326    HV_H_6TAP            m9, m0, m1, m2, 6, m5, m6, m7, m8
3327    movu                 m0, [srcq+r6*1+ 0]
3328    movu                 m1, [srcq+r6*1+ 8]
3329    movu                 m2, [srcq+r6*1+16]
3330    lea                  r5, [srcq+ssq*2]
3331    HV_H_6TAP           m11, m0, m1, m2, 6, m5, m6, m7, m8
3332    movu                 m0, [srcq+ssq*0+ 0]
3333    movu                 m1, [srcq+ssq*0+ 8]
3334    movu                 m2, [srcq+ssq*0+16]
3335    mov                  r7, tmpq
3336    HV_H_6TAP           m13, m0, m1, m2, 6, m5, m6, m7, m8
3337    movu                 m0, [srcq+ssq*1+ 0]
3338    movu                 m1, [srcq+ssq*1+ 8]
3339    movu                 m2, [srcq+ssq*1+16]
3340    HV_H_6TAP           m15, m0, m1, m2, 6, m5, m6, m7, m8
3341    movu                 m0, [r5+ssq*0+ 0]
3342    movu                 m1, [r5+ssq*0+ 8]
3343    movu                 m2, [r5+ssq*0+16]
3344    HV_H_6TAP            m5, m0, m1, m2, 6, m5, m6, m7, m8
3345    punpcklwd            m8, m9, m11  ; 01
3346    punpckhwd            m9, m11
3347    punpcklwd           m10, m11, m13 ; 12
3348    punpckhwd           m11, m13
3349    punpcklwd           m12, m13, m15 ; 23
3350    punpckhwd           m13, m15
3351    punpcklwd           m14, m15, m5  ; 34
3352    punpckhwd           m15, m5
3353.hv_w8_loop:
3354    mova                 m3, [rsp+16*3]
3355    mova                 m7, [rsp+16*4]
3356    pmaddwd              m0, m8, m3   ; a0
3357    mova                 m8, m12
3358    pmaddwd              m2, m9, m3   ; a0'
3359    mova                 m9, m13
3360    pmaddwd              m1, m10, m3  ; b0
3361    mova                m10, m14
3362    pmaddwd              m3, m11      ; b0'
3363    mova                m11, m15
3364    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
3365    movu                 m6, [r5+ssq*1+ 0]
3366    paddd                m0, m12
3367    movu                 m7, [r5+ssq*1+ 8]
3368    paddd                m2, m13
3369    movu                m12, [r5+ssq*1+16]
3370    paddd                m1, m14
3371    lea                  r5, [r5+ssq*2]
3372    paddd                m3, m15
3373    HV_H_6TAP           m15, m6, m7, m12, 6
3374    movu                 m6, [r5+ssq*0+ 0]
3375    movu                 m7, [r5+ssq*0+ 8]
3376    movu                m14, [r5+ssq*0+16]
3377    punpcklwd           m12, m5, m15 ; 45
3378    punpckhwd           m13, m5, m15
3379    HV_H_6TAP            m5, m6, m7, m14, 6
3380    mova                 m7, [rsp+16*5]
3381    REPX      {paddd x, m4}, m0, m2, m1, m3
3382    punpcklwd           m14, m15, m5  ; 56
3383    punpckhwd           m15, m5
3384    pmaddwd              m6, m12, m7  ; a2
3385    paddd                m0, m6
3386    pmaddwd              m6, m13, m7  ; a2'
3387    paddd                m2, m6
3388    pmaddwd              m6, m14, m7  ; b2
3389    pmaddwd              m7, m15      ; b2'
3390    paddd                m1, m6
3391    paddd                m3, m7
3392    REPX       {psrad x, 6}, m0, m2, m1, m3
3393    packssdw             m0, m2
3394    packssdw             m1, m3
3395    mova          [r7+wq*0], m0
3396    mova          [r7+wq*2], m1
3397    lea                  r7, [r7+wq*4]
3398    sub                  hd, 2
3399    jg .hv_w8_loop
3400    add                srcq, 16
3401    add                tmpq, 16
3402    movzx                hd, r8b
3403    sub                 r8d, 1<<8
3404%endif
3405    jg .hv_w8_loop0
3406    RET
3407
; Filter combinations involving SHARP are routed to the full 8-tap
; kernel (prep_8tap_16bpc) below; the last line defines its entry.
3408PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
3409PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
3410PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
3411PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
3412PREP_8TAP_FN sharp,          SHARP,   SHARP
3413
;-----------------------------------------------------------------------------
; void prep_8tap_16bpc(int16_t *tmp, const pixel *src, ptrdiff_t ss,
;                      int w, int h, int mx, int my /*, bitdepth_max in r7m*/)
; SSSE3 8-tap subpel "prep" (writes the intermediate int16 buffer used by
; compound prediction). Dispatches on the subpel fractions:
;   - no horizontal fraction and no vertical fraction -> tail-jumps into the
;     6tap prep copy path (mangle(..._prep_6tap_16bpc_ssse3).prep)
;   - fraction in mx only -> .h,  in my only -> .v,  in both -> .hv
; Throughout, `test dword r7m, 0x800` distinguishes 12 bpc (bit 11 of
; bitdepth_max set) from 10 bpc, selecting the filter-coefficient scaling.
; On x86-32, m8-m15 are emulated as stack slots (defined below).
;-----------------------------------------------------------------------------
3414cglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
3415%if ARCH_X86_32
3416    %define             mxb  r0b
3417    %define             mxd  r0
3418    %define             mxq  r0
3419    %define             myb  r2b
3420    %define             myd  r2
3421    %define             myq  r2
3422    %define              m8  [esp+16*0]
3423    %define              m9  [esp+16*1]
3424    %define             m10  [esp+16*2]
3425    %define             m11  [esp+16*3]
3426    %define             m12  [esp+16*4]
3427    %define             m13  [esp+16*5]
3428    %define             m14  [esp+16*6]
3429    %define             m15  [esp+16*7]
3430%endif
; Fold the filter-type selector (t0d/t1d, set by the PREP_8TAP_FN stub) into
; the replicated subpel fraction to form an index into subpel_filters.
3431    imul                mxd, mxm, 0x010101
3432    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3433    imul                myd, mym, 0x010101
3434    add                 myd, t1d ; 8tap_v, my, 4tap_v
3435    LEA                  t2, prep_ssse3
3436    movifnidn            wd, wm
3437    movifnidn          srcq, srcmp
3438    test                mxd, 0xf00
3439    jnz .h
3440    movifnidn            hd, hm
3441    test                myd, 0xf00
3442    jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep
; ---- vertical-only 8-tap path ----------------------------------------------
3443.v:
; Use the 4-tap variant of the filter (low byte of myd) when h <= 4.
3444    movzx               mxd, myb
3445    shr                 myd, 16
3446    cmp                  hd, 4
3447    cmove               myd, mxd
3448    movq                 m3, [base+subpel_filters+myq*8]
3449    WIN64_SPILL_XMM      15
3450    movddup              m7, [base+prep_8tap_1d_rnd]
3451    movifnidn           ssq, r2mp
3452    movifnidn          tmpq, r0mp
3453    punpcklbw            m3, m3
3454    psraw                m3, 8 ; sign-extend
3455    test          dword r7m, 0x800
3456    jnz .v_12bpc
3457    psllw                m3, 2
3458.v_12bpc:
; Broadcast the 4 coefficient pairs into m8-m11 (stack slots on x86-32).
3459%if ARCH_X86_32
3460    ALLOC_STACK       -16*7
3461    pshufd               m0, m3, q0000
3462    pshufd               m1, m3, q1111
3463    pshufd               m2, m3, q2222
3464    pshufd               m3, m3, q3333
3465    mova                 m8, m0
3466    mova                 m9, m1
3467    mova                m10, m2
3468    mova                m11, m3
3469%else
3470    pshufd               m8, m3, q0000
3471    pshufd               m9, m3, q1111
3472    pshufd              m10, m3, q2222
3473    pshufd              m11, m3, q3333
3474%endif
; Step back 3 rows for the 8-tap history; pack the column count (w<<6) and
; the row count (h, low byte) into wd so one register drives both loops.
3475    lea                  r6, [ssq*3]
3476    sub                srcq, r6
3477    mov                 r6d, wd
3478    shl                  wd, 6
3479    mov                  r5, srcq
3480%if ARCH_X86_64
3481    mov                  r7, tmpq
3482%elif STACK_ALIGNMENT < 16
3483    mov          [esp+4*29], tmpq
3484%endif
3485    lea                  wd, [wq+hq-(1<<8)]
; Per 4-pixel column: prime the interleaved row pairs 01..56, then filter
; two output rows per iteration.
3486.v_loop0:
3487    movq                 m1, [srcq+ssq*0]
3488    movq                 m2, [srcq+ssq*1]
3489    lea                srcq, [srcq+ssq*2]
3490    movq                 m3, [srcq+ssq*0]
3491    movq                 m4, [srcq+ssq*1]
3492    lea                srcq, [srcq+ssq*2]
3493    movq                 m5, [srcq+ssq*0]
3494    movq                 m6, [srcq+ssq*1]
3495    lea                srcq, [srcq+ssq*2]
3496    movq                 m0, [srcq+ssq*0]
3497    punpcklwd            m1, m2      ; 01
3498    punpcklwd            m2, m3      ; 12
3499    punpcklwd            m3, m4      ; 23
3500    punpcklwd            m4, m5      ; 34
3501    punpcklwd            m5, m6      ; 45
3502    punpcklwd            m6, m0      ; 56
3503%if ARCH_X86_32
; x86-32: row-pair history lives in the m12-m14 stack slots between iterations.
3504    jmp .v_loop_start
3505.v_loop:
3506    mova                 m1, m12
3507    mova                 m2, m13
3508    mova                 m3, m14
3509.v_loop_start:
3510    pmaddwd              m1, m8      ; a0
3511    pmaddwd              m2, m8      ; b0
3512    mova                m12, m3
3513    mova                m13, m4
3514    pmaddwd              m3, m9      ; a1
3515    pmaddwd              m4, m9      ; b1
3516    paddd                m1, m3
3517    paddd                m2, m4
3518    mova                m14, m5
3519    mova                 m4, m6
3520    pmaddwd              m5, m10     ; a2
3521    pmaddwd              m6, m10     ; b2
3522    paddd                m1, m5
3523    paddd                m2, m6
3524    movq                 m6, [srcq+ssq*1]
3525    lea                srcq, [srcq+ssq*2]
3526    punpcklwd            m5, m0, m6  ; 67
3527    movq                 m0, [srcq+ssq*0]
3528    pmaddwd              m3, m11, m5 ; a3
3529    punpcklwd            m6, m0      ; 78
3530    paddd                m1, m7
3531    paddd                m1, m3
3532    pmaddwd              m3, m11, m6 ; b3
3533    paddd                m2, m7
3534    paddd                m2, m3
3535    psrad                m1, 4       ; round (m7) + shift to prep precision
3536    psrad                m2, 4
3537    packssdw             m1, m2
3538    movq        [tmpq+r6*0], m1
3539    movhps      [tmpq+r6*2], m1
3540    lea                tmpq, [tmpq+r6*4]
3541    sub                  hd, 2
3542    jg .v_loop
; Advance to the next 4-pixel column (tmp pointer kept in memory on x86-32).
3543%if STACK_ALIGNMENT < 16
3544    mov                tmpq, [esp+4*29]
3545    add                  r5, 8
3546    add                tmpq, 8
3547    mov                srcq, r5
3548    mov          [esp+4*29], tmpq
3549%else
3550    mov                tmpq, tmpmp
3551    add                  r5, 8
3552    add                tmpq, 8
3553    mov                srcq, r5
3554    mov               tmpmp, tmpq
3555%endif
3556%else
; x86-64: same filter, but history stays in registers (m12-m14 as scratch).
3557.v_loop:
3558    pmaddwd             m12, m8, m1  ; a0
3559    pmaddwd             m13, m8, m2  ; b0
3560    mova                 m1, m3
3561    mova                 m2, m4
3562    pmaddwd              m3, m9      ; a1
3563    pmaddwd              m4, m9      ; b1
3564    paddd               m12, m3
3565    paddd               m13, m4
3566    mova                 m3, m5
3567    mova                 m4, m6
3568    pmaddwd              m5, m10     ; a2
3569    pmaddwd              m6, m10     ; b2
3570    paddd               m12, m5
3571    paddd               m13, m6
3572    movq                 m6, [srcq+ssq*1]
3573    lea                srcq, [srcq+ssq*2]
3574    punpcklwd            m5, m0, m6  ; 67
3575    movq                 m0, [srcq+ssq*0]
3576    pmaddwd             m14, m11, m5 ; a3
3577    punpcklwd            m6, m0      ; 78
3578    paddd               m12, m7
3579    paddd               m12, m14
3580    pmaddwd             m14, m11, m6 ; b3
3581    paddd               m13, m7
3582    paddd               m13, m14
3583    psrad               m12, 4
3584    psrad               m13, 4
3585    packssdw            m12, m13
3586    movq        [tmpq+r6*0], m12
3587    movhps      [tmpq+r6*2], m12
3588    lea                tmpq, [tmpq+r6*4]
3589    sub                  hd, 2
3590    jg .v_loop
3591    add                  r5, 8
3592    add                  r7, 8
3593    mov                srcq, r5
3594    mov                tmpq, r7
3595%endif
; Unpack next column: h from the low byte, decrement the packed column count.
3596    movzx                hd, wb
3597    sub                  wd, 1<<8
3598    jg .v_loop0
3599    RET
; ---- horizontal-only 8-tap path --------------------------------------------
3600.h:
3601    RESET_STACK_STATE
3602    test                myd, 0xf00
3603    jnz .hv
3604    movifnidn           ssq, r2mp
3605    movifnidn            hd, r4m
3606    movddup              m5, [base+prep_8tap_1d_rnd]
3607    cmp                  wd, 4
3608    jne .h_w8
; w == 4: 4-tap horizontal filter (low byte of mxd), two rows per iteration.
3609.h_w4:
3610    movzx               mxd, mxb
3611    movq                 m0, [base+subpel_filters+mxq*8]
3612    mova                 m3, [base+spel_h_shufA]
3613    mova                 m4, [base+spel_h_shufB]
3614    movifnidn          tmpq, tmpmp
3615    sub                srcq, 2     ; center the 4-tap window
3616    WIN64_SPILL_XMM       8
3617    punpcklbw            m0, m0
3618    psraw                m0, 8
3619    test          dword r7m, 0x800
3620    jnz .h_w4_12bpc
3621    psllw                m0, 2
3622.h_w4_12bpc:
3623    pshufd               m6, m0, q1111
3624    pshufd               m7, m0, q2222
3625.h_w4_loop:
3626    movu                 m1, [srcq+ssq*0]
3627    movu                 m2, [srcq+ssq*1]
3628    lea                srcq, [srcq+ssq*2]
3629    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
3630    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
3631    pmaddwd              m0, m6
3632    pmaddwd              m1, m7
3633    paddd                m0, m5
3634    paddd                m0, m1
3635    pshufb               m1, m2, m3
3636    pshufb               m2, m4
3637    pmaddwd              m1, m6
3638    pmaddwd              m2, m7
3639    paddd                m1, m5
3640    paddd                m1, m2
3641    psrad                m0, 4
3642    psrad                m1, 4
3643    packssdw             m0, m1
3644    mova             [tmpq], m0
3645    add                tmpq, 16
3646    sub                  hd, 2
3647    jg .h_w4_loop
3648    RET
; w >= 8: full 8-tap horizontal filter, 8 pixels per inner iteration.
; src/tmp are biased by the row width and r6 counts up from -width to 0.
3649.h_w8:
3650    WIN64_SPILL_XMM      11
3651    shr                 mxd, 16
3652    movq                 m2, [base+subpel_filters+mxq*8]
3653    mova                 m4, [base+spel_h_shufA]
3654    mova                 m6, [base+spel_h_shufB]
3655    movifnidn          tmpq, r0mp
3656    add                  wd, wd    ; width in bytes
3657    punpcklbw            m2, m2
3658    add                srcq, wq
3659    psraw                m2, 8
3660    add                tmpq, wq
3661    neg                  wq
3662    test          dword r7m, 0x800
3663    jnz .h_w8_12bpc
3664    psllw                m2, 2
3665.h_w8_12bpc:
3666    pshufd               m7, m2, q0000
3667%if ARCH_X86_32
3668    ALLOC_STACK       -16*3
3669    pshufd               m0, m2, q1111
3670    pshufd               m1, m2, q2222
3671    pshufd               m2, m2, q3333
3672    mova                 m8, m0
3673    mova                 m9, m1
3674    mova                m10, m2
3675%else
3676    pshufd               m8, m2, q1111
3677    pshufd               m9, m2, q2222
3678    pshufd              m10, m2, q3333
3679%endif
3680.h_w8_loop0:
3681    mov                  r6, wq
3682.h_w8_loop:
; Two overlapping 8-word loads cover taps for pixels abcd and efgh.
3683    movu                 m0, [srcq+r6- 6]
3684    movu                 m1, [srcq+r6+ 2]
3685    pshufb               m2, m0, m4  ; 0 1 1 2 2 3 3 4
3686    pshufb               m0, m6      ; 2 3 3 4 4 5 5 6
3687    pmaddwd              m2, m7      ; abcd0
3688    pmaddwd              m0, m8      ; abcd1
3689    pshufb               m3, m1, m4  ; 4 5 5 6 6 7 7 8
3690    pshufb               m1, m6      ; 6 7 7 8 8 9 9 a
3691    paddd                m2, m5
3692    paddd                m0, m2
3693    pmaddwd              m2, m9, m3  ; abcd2
3694    pmaddwd              m3, m7      ; efgh0
3695    paddd                m0, m2
3696    pmaddwd              m2, m10, m1 ; abcd3
3697    pmaddwd              m1, m8      ; efgh1
3698    paddd                m0, m2
3699    movu                 m2, [srcq+r6+10]
3700    paddd                m3, m5
3701    paddd                m1, m3
3702    pshufb               m3, m2, m4  ; a b b c c d d e
3703    pshufb               m2, m6      ; 8 9 9 a a b b c
3704    pmaddwd              m3, m9      ; efgh2
3705    pmaddwd              m2, m10     ; efgh3
3706    paddd                m1, m3
3707    paddd                m1, m2
3708    psrad                m0, 4
3709    psrad                m1, 4
3710    packssdw             m0, m1
3711    mova          [tmpq+r6], m0
3712    add                  r6, 16
3713    jl .h_w8_loop
3714    add                srcq, ssq
3715    sub                tmpq, wq
3716    dec                  hd
3717    jg .h_w8_loop0
3718    RET
; ---- separable 2D (horizontal then vertical) path --------------------------
3719.hv:
3720    RESET_STACK_STATE
; 4-tap horizontal when w <= 4, 4-tap vertical when h <= 4.
3721    movzx               t3d, mxb
3722    shr                 mxd, 16
3723    cmp                  wd, 4
3724    cmove               mxd, t3d
3725    movifnidn            hd, r4m
3726    movq                 m2, [base+subpel_filters+mxq*8]
3727    movzx               mxd, myb
3728    shr                 myd, 16
3729    cmp                  hd, 4
3730    cmove               myd, mxd
3731    movq                 m3, [base+subpel_filters+myq*8]
3732%if ARCH_X86_32
3733    mov                 ssq, r2mp
3734    mov                tmpq, r0mp
3735    mova                 m0, [base+spel_h_shufA]
3736    mova                 m1, [base+spel_h_shufB]
3737    mova                 m4, [base+prep_8tap_2d_rnd]
3738    ALLOC_STACK      -16*14
3739    mova                 m8, m0
3740    mova                 m9, m1
3741    mova                m14, m4
3742%else
3743%if WIN64
3744    ALLOC_STACK        16*6, 16
3745%endif
3746    mova                 m8, [base+spel_h_shufA]
3747    mova                 m9, [base+spel_h_shufB]
3748%endif
; Horizontal coefficients: widen to 16 bit with an extra >>4 (and >>2 more
; for 12 bpc) so the two-pass intermediate fits; vertical stays as usual.
3749    pxor                 m0, m0
3750    punpcklbw            m0, m2
3751    punpcklbw            m3, m3
3752    psraw                m0, 4
3753    psraw                m3, 8
3754    test          dword r7m, 0x800
3755    jz .hv_10bpc
3756    psraw                m0, 2
3757.hv_10bpc:
; Back up 3 rows and 3 pixels (6 bytes) for the 8x8 tap window.
3758    lea                  r6, [ssq*3]
3759    sub                srcq, 6
3760    sub                srcq, r6
3761    mov                 r6d, wd
3762    shl                  wd, 6
3763    mov                  r5, srcq
3764%if ARCH_X86_32
3765    %define             tmp  esp+16*8
3766%if STACK_ALIGNMENT < 16
3767    mov          [esp+4*61], tmpq
3768%endif
3769    pshufd               m1, m0, q0000
3770    pshufd               m2, m0, q1111
3771    pshufd               m5, m0, q2222
3772    pshufd               m0, m0, q3333
3773    mova                m10, m1
3774    mova                m11, m2
3775    mova                m12, m5
3776    mova                m13, m0
3777%else
3778%if WIN64
3779    %define             tmp  rsp
3780%else
3781    %define             tmp  rsp-88 ; red zone
3782%endif
3783    mov                  r7, tmpq
3784    pshufd              m10, m0, q0000
3785    pshufd              m11, m0, q1111
3786    pshufd              m12, m0, q2222
3787    pshufd              m13, m0, q3333
3788%endif
3789    lea                  wd, [wq+hq-(1<<8)]
; Vertical coefficient pairs are spilled to scratch slots tmp+16*1..4.
3790    pshufd               m0, m3, q0000
3791    pshufd               m1, m3, q1111
3792    pshufd               m2, m3, q2222
3793    pshufd               m3, m3, q3333
3794    mova         [tmp+16*1], m0
3795    mova         [tmp+16*2], m1
3796    mova         [tmp+16*3], m2
3797    mova         [tmp+16*4], m3
; Per column: horizontally filter rows 0-6 (PUT_8TAP_HV_H), then build the
; interleaved row-pair registers 01..56 for the vertical pass.
3798.hv_loop0:
3799%if ARCH_X86_64
3800    mova                m14, [prep_8tap_2d_rnd]
3801%endif
3802    movu                 m4, [srcq+ssq*0+0]
3803    movu                 m1, [srcq+ssq*0+8]
3804    movu                 m5, [srcq+ssq*1+0]
3805    movu                 m2, [srcq+ssq*1+8]
3806    lea                srcq, [srcq+ssq*2]
3807    movu                 m6, [srcq+ssq*0+0]
3808    movu                 m3, [srcq+ssq*0+8]
3809    PUT_8TAP_HV_H         4, 1, 0, 6
3810    PUT_8TAP_HV_H         5, 2, 0, 6
3811    PUT_8TAP_HV_H         6, 3, 0, 6
3812    movu                 m7, [srcq+ssq*1+0]
3813    movu                 m2, [srcq+ssq*1+8]
3814    lea                srcq, [srcq+ssq*2]
3815    movu                 m1, [srcq+ssq*0+0]
3816    movu                 m3, [srcq+ssq*0+8]
3817    PUT_8TAP_HV_H         7, 2, 0, 6
3818    PUT_8TAP_HV_H         1, 3, 0, 6
3819    movu                 m2, [srcq+ssq*1+0]
3820    movu                 m3, [srcq+ssq*1+8]
3821    lea                srcq, [srcq+ssq*2]
3822    PUT_8TAP_HV_H         2, 3, 0, 6
3823    packssdw             m4, m7      ; 0 3
3824    packssdw             m5, m1      ; 1 4
3825    movu                 m0, [srcq+ssq*0+0]
3826    movu                 m1, [srcq+ssq*0+8]
3827    PUT_8TAP_HV_H         0, 1, 3, 6
3828    packssdw             m6, m2      ; 2 5
3829    packssdw             m7, m0      ; 3 6
3830    punpcklwd            m1, m4, m5  ; 01
3831    punpckhwd            m4, m5      ; 34
3832    punpcklwd            m2, m5, m6  ; 12
3833    punpckhwd            m5, m6      ; 45
3834    punpcklwd            m3, m6, m7  ; 23
3835    punpckhwd            m6, m7      ; 56
3836%if ARCH_X86_32
; x86-32: row-pair history kept in [tmp+16*5] and m15 across iterations.
3837    jmp .hv_loop_start
3838.hv_loop:
3839    mova                 m1, [tmp+16*5]
3840    mova                 m2, m15
3841.hv_loop_start:
3842    mova                 m7, [tmp+16*1]
3843    pmaddwd              m1, m7      ; a0
3844    pmaddwd              m2, m7      ; b0
3845    mova                 m7, [tmp+16*2]
3846    mova         [tmp+16*5], m3
3847    pmaddwd              m3, m7      ; a1
3848    mova                m15, m4
3849    pmaddwd              m4, m7      ; b1
3850    mova                 m7, [tmp+16*3]
3851    paddd                m1, m14
3852    paddd                m2, m14
3853    paddd                m1, m3
3854    paddd                m2, m4
3855    mova                 m3, m5
3856    pmaddwd              m5, m7      ; a2
3857    mova                 m4, m6
3858    pmaddwd              m6, m7      ; b2
3859    paddd                m1, m5
3860    paddd                m2, m6
3861    movu                 m7, [srcq+ssq*1+0]
3862    movu                 m5, [srcq+ssq*1+8]
3863    lea                srcq, [srcq+ssq*2]
3864    PUT_8TAP_HV_H         7, 5, 6, 6
3865    packssdw             m0, m7      ; 6 7
3866    mova         [tmp+16*0], m0
3867    movu                 m0, [srcq+ssq*0+0]
3868    movu                 m5, [srcq+ssq*0+8]
3869    PUT_8TAP_HV_H         0, 5, 6, 6
3870    mova                 m6, [tmp+16*0]
3871    packssdw             m7, m0      ; 7 8
3872    punpcklwd            m5, m6, m7  ; 67
3873    punpckhwd            m6, m7      ; 78
3874    pmaddwd              m7, m5, [tmp+16*4]
3875    paddd                m1, m7      ; a3
3876    pmaddwd              m7, m6, [tmp+16*4]
3877    paddd                m2, m7      ; b3
3878    psrad                m1, 6       ; 2d rounding (m14) + final shift
3879    psrad                m2, 6
3880    packssdw             m1, m2
3881    movq        [tmpq+r6*0], m1
3882    movhps      [tmpq+r6*2], m1
3883    lea                tmpq, [tmpq+r6*4]
3884    sub                  hd, 2
3885    jg .hv_loop
3886%if STACK_ALIGNMENT < 16
3887    mov                tmpq, [esp+4*61]
3888    add                  r5, 8
3889    add                tmpq, 8
3890    mov                srcq, r5
3891    mov          [esp+4*61], tmpq
3892%else
3893    mov                tmpq, tmpmp
3894    add                  r5, 8
3895    add                tmpq, 8
3896    mov                srcq, r5
3897    mov               tmpmp, tmpq
3898%endif
3899%else
; x86-64 variant of the same vertical loop; m14/m15 are the accumulators.
3900.hv_loop:
3901    mova                m15, [tmp+16*1]
3902    mova                 m7, [prep_8tap_2d_rnd]
3903    pmaddwd             m14, m15, m1 ; a0
3904    pmaddwd             m15, m2      ; b0
3905    paddd               m14, m7
3906    paddd               m15, m7
3907    mova                 m7, [tmp+16*2]
3908    mova                 m1, m3
3909    pmaddwd              m3, m7      ; a1
3910    mova                 m2, m4
3911    pmaddwd              m4, m7      ; b1
3912    mova                 m7, [tmp+16*3]
3913    paddd               m14, m3
3914    paddd               m15, m4
3915    mova                 m3, m5
3916    pmaddwd              m5, m7      ; a2
3917    mova                 m4, m6
3918    pmaddwd              m6, m7      ; b2
3919    paddd               m14, m5
3920    paddd               m15, m6
3921    movu                 m7, [srcq+ssq*1+0]
3922    movu                 m5, [srcq+ssq*1+8]
3923    lea                srcq, [srcq+ssq*2]
3924    PUT_8TAP_HV_H         7, 5, 6, 6, [prep_8tap_2d_rnd]
3925    packssdw             m0, m7      ; 6 7
3926    mova         [tmp+16*0], m0
3927    movu                 m0, [srcq+ssq*0+0]
3928    movu                 m5, [srcq+ssq*0+8]
3929    PUT_8TAP_HV_H         0, 5, 6, 6, [prep_8tap_2d_rnd]
3930    mova                 m6, [tmp+16*0]
3931    packssdw             m7, m0      ; 7 8
3932    punpcklwd            m5, m6, m7  ; 67
3933    punpckhwd            m6, m7      ; 78
3934    pmaddwd              m7, m5, [tmp+16*4]
3935    paddd               m14, m7      ; a3
3936    pmaddwd              m7, m6, [tmp+16*4]
3937    paddd               m15, m7      ; b3
3938    psrad               m14, 6
3939    psrad               m15, 6
3940    packssdw            m14, m15
3941    movq        [tmpq+r6*0], m14
3942    movhps      [tmpq+r6*2], m14
3943    lea                tmpq, [tmpq+r6*4]
3944    sub                  hd, 2
3945    jg .hv_loop
3946    add                  r5, 8
3947    add                  r7, 8
3948    mov                srcq, r5
3949    mov                tmpq, r7
3950%endif
3951    movzx                hd, wb
3952    sub                  wd, 1<<8
3953    jg .hv_loop0
3954    RET
3955%undef tmp
3956
; movifprep dst, src — emit the mov only when assembling the prep variant
; (isprep set by MC_8TAP_SCALED); expands to nothing in put builds.
3957%macro movifprep 2
3958 %if isprep
3959    mov %1, %2
3960 %endif
3961%endmacro
3962
; SAVE_REG n — snapshot the current aliases of register n (r<n>, r<n>q,
; r<n>d, and on x86-32 its argument stack slot r<n>m) into *_save names so
; LOAD_REG can restore them after a REMAP_REG shuffle.
3963%macro SAVE_REG 1
3964 %xdefine r%1_save  r%1
3965 %xdefine r%1q_save r%1q
3966 %xdefine r%1d_save r%1d
3967 %if ARCH_X86_32
3968  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
3969 %endif
3970%endmacro
3971
; LOAD_REG n — restore the aliases of register n captured by SAVE_REG and
; discard the *_save definitions.
3972%macro LOAD_REG 1
3973 %xdefine r%1  r%1_save
3974 %xdefine r%1q r%1q_save
3975 %xdefine r%1d r%1d_save
3976 %if ARCH_X86_32
3977  %define r%1m r%1m_save
3978 %endif
3979 %undef r%1d_save
3980 %undef r%1q_save
3981 %undef r%1_save
3982%endmacro
3983
; REMAP_REG a, b[, own_slot] — make r<a>/r<a>q/r<a>d aliases of r<b>.
; On x86-32 the third argument chooses the memory alias: 0 follows the remap
; (r<a>m -> r<b>m), nonzero points r<a>m at register a's own argument slot
; on the stack instead.
3984%macro REMAP_REG 2-3
3985 %xdefine r%1  r%2
3986 %xdefine r%1q r%2q
3987 %xdefine r%1d r%2d
3988 %if ARCH_X86_32
3989  %if %3 == 0
3990   %xdefine r%1m r%2m
3991  %else
3992   %define r%1m [rstk+stack_offset+(%1+1)*4]
3993  %endif
3994 %endif
3995%endmacro
3996
; In prep builds only: shift every register alias down by one (r<i> becomes
; an alias of r<i-1>), saving the topmost register first so the mapping can
; be undone by ..._REMAP_REGS_TO_DEFAULT. 14 registers on x86-64, 5 on
; x86-32. Presumably this lets the prep entry point (which lacks put's dst
; argument) share the put-numbered register usage — confirm against the
; MC_8TAP_SCALED body.
3997%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
3998 %if isprep
3999  %if ARCH_X86_64
4000   SAVE_REG 14
4001   %assign %%i 14
4002   %rep 14
4003    %assign %%j %%i-1
4004    REMAP_REG %%i, %%j
4005    %assign %%i %%i-1
4006   %endrep
4007  %else
4008   SAVE_REG 5
4009   %assign %%i 5
4010   %rep 5
4011    %assign %%j %%i-1
4012    REMAP_REG %%i, %%j, 0
4013    %assign %%i %%i-1
4014   %endrep
4015  %endif
4016 %endif
4017%endmacro
4018
; Inverse of ..._REMAP_REGS_TO_PREV: shift the aliases back up (r<i> aliases
; r<i+1>) and restore the saved topmost register (r14 on x86-64, r5 on
; x86-32), returning to the default x86inc register mapping. No-op in put
; builds.
4019%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
4020 %if isprep
4021  %assign %%i 1
4022  %if ARCH_X86_64
4023   %rep 13
4024    %assign %%j %%i+1
4025    REMAP_REG %%i, %%j
4026    %assign %%i %%i+1
4027   %endrep
4028   LOAD_REG 14
4029  %else
4030   %rep 4
4031    %assign %%j %%i+1
4032    REMAP_REG %%i, %%j, 1
4033    %assign %%i %%i+1
4034   %endrep
4035   LOAD_REG 5
4036  %endif
4037 %endif
4038%endmacro
4039
; Return from a scaled-MC path: restore the default register mapping for the
; epilogue, RET, then (unless told otherwise via %1=0) re-apply the shifted
; prep mapping so assembly of the code that follows the macro continues under
; the same aliases as before.
4040%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
4041    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4042    RET
4043 %if %1
4044    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4045 %endif
4046%endmacro
4047
; x86-32 only: horizontally filter two row pairs for the scaled 4-tap path.
; Loads two rows each from srcq and r4 (advancing both by 2*ss), applies the
; shuffle+madd filter pairs m12/m13 and m14/m15, horizontally reduces, adds
; the rounding constant at [esp+0x00], shifts by the count at [esp+0x10],
; packs to int16 and stores the result at stk+%1.
4048%if ARCH_X86_32
4049 %macro MC_4TAP_SCALED_H 1 ; dst_mem
4050    movu                 m7, [srcq+ssq*0]
4051    movu                 m2, [srcq+ssq*1]
4052    movu                 m5, [r4  +ssq*0]
4053    movu                 m6, [r4  +ssq*1]
4054    lea                srcq, [srcq+ssq*2]
4055    lea                  r4, [r4  +ssq*2]
4056    REPX    {pshufb x, m12}, m7, m2
4057    REPX   {pmaddwd x, m13}, m7, m2
4058    REPX    {pshufb x, m14}, m5, m6
4059    REPX   {pmaddwd x, m15}, m5, m6
4060    phaddd               m7, m5
4061    phaddd               m2, m6
4062    mova                 m5, [esp+0x00]   ; rounding constant
4063    movd                 m6, [esp+0x10]   ; shift count
4064    paddd                m7, m5
4065    paddd                m2, m5
4066    psrad                m7, m6
4067    psrad                m2, m6
4068    packssdw             m7, m2
4069    mova           [stk+%1], m7
4070 %endmacro
4071%endif
4072
; Horizontal 8-tap filter for one row of 8 scaled output pixels. Each output
; pixel has its own source offset (the scaled x positions held in
; r4/r6/r7/r9/r10/r11/r13/rX on x86-64, or spilled at [stk+0..28] on x86-32)
; and its own 8-coefficient filter stored on the stack. madd + three phaddd
; levels reduce each pixel's taps to one dword; then round with hround,
; arithmetic-shift by m12, and pack the 8 results to int16.
4073%if ARCH_X86_64
4074 %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
4075    movu                m%1, [srcq+ r4*2]
4076    movu                m%2, [srcq+ r6*2]
4077    movu                m%3, [srcq+ r7*2]
4078    movu                m%4, [srcq+ r9*2]
4079    movu                m%5, [srcq+r10*2]
4080    movu                m%6, [srcq+r11*2]
4081    movu                m%7, [srcq+r13*2]
4082    movu                m%8, [srcq+ rX*2]
4083    add                srcq, ssq
4084    pmaddwd             m%1, [stk+0x10]  ; per-pixel filters
4085    pmaddwd             m%2, [stk+0x20]
4086    pmaddwd             m%3, [stk+0x30]
4087    pmaddwd             m%4, [stk+0x40]
4088    pmaddwd             m%5, [stk+0x50]
4089    pmaddwd             m%6, [stk+0x60]
4090    pmaddwd             m%7, [stk+0x70]
4091    pmaddwd             m%8, [stk+0x80]
4092    phaddd              m%1, m%2
4093    phaddd              m%3, m%4
4094    phaddd              m%5, m%6
4095    phaddd              m%7, m%8
4096    phaddd              m%1, m%3
4097    phaddd              m%5, m%7
4098    paddd               m%1, hround
4099    paddd               m%5, hround
4100    psrad               m%1, m12
4101    psrad               m%5, m12
4102    packssdw            m%1, m%5
4103 %endmacro
4104%else
; x86-32 variant: only 8 xmm registers, so the 8 source offsets are reloaded
; from [stk+0..28] in two groups of four; result optionally stored at stk+%2.
4105 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
4106  %if %3 == 1
4107    mov                  r0, [stk+ 0]
4108    mov                  rX, [stk+ 4]
4109    mov                  r4, [stk+ 8]
4110    mov                  r5, [stk+12]
4111  %endif
4112    movu                 m0, [srcq+r0*2]
4113    movu                 m1, [srcq+rX*2]
4114    movu                 m2, [srcq+r4*2]
4115    movu                 m3, [srcq+r5*2]
4116    mov                  r0, [stk+16]
4117    mov                  rX, [stk+20]
4118    mov                  r4, [stk+24]
4119    mov                  r5, [stk+28]
4120    pmaddwd              m0, [stk+%1+0x00]
4121    pmaddwd              m1, [stk+%1+0x10]
4122    pmaddwd              m2, [stk+%1+0x20]
4123    pmaddwd              m3, [stk+%1+0x30]
4124    phaddd               m0, m1
4125    phaddd               m2, m3
4126    movu                 m4, [srcq+r0*2]
4127    movu                 m5, [srcq+rX*2]
4128    movu                 m6, [srcq+r4*2]
4129    movu                 m7, [srcq+r5*2]
4130    add                srcq, ssq
4131    pmaddwd              m4, [stk+%1+0xa0]
4132    pmaddwd              m5, [stk+%1+0xb0]
4133    pmaddwd              m6, [stk+%1+0xc0]
4134    pmaddwd              m7, [stk+%1+0xd0]
4135    phaddd               m4, m5
4136    phaddd               m6, m7
4137    phaddd               m0, m2
4138    phaddd               m4, m6
4139    paddd                m0, hround
4140    paddd                m4, hround
4141    psrad                m0, m12
4142    psrad                m4, m12
4143    packssdw             m0, m4
4144  %if %2 != 0
4145    mova           [stk+%2], m0
4146  %endif
4147 %endmacro
4148%endif
4149
4150%macro MC_8TAP_SCALED 1
4151%ifidn %1, put
4152 %assign isput  1
4153 %assign isprep 0
4154 %if ARCH_X86_64
4155  %if required_stack_alignment <= STACK_ALIGNMENT
4156cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4157  %else
4158cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4159  %endif
4160 %else ; ARCH_X86_32
4161  %if required_stack_alignment <= STACK_ALIGNMENT
4162cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4163  %else
4164cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4165  %endif
4166 %endif
4167 %xdefine base_reg r12
4168%else ; prep
4169 %assign isput  0
4170 %assign isprep 1
4171 %if ARCH_X86_64
4172  %if required_stack_alignment <= STACK_ALIGNMENT
4173cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4174   %xdefine tmp_stridem r14q
4175  %else
4176cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4177   %define tmp_stridem qword [stk+0x138]
4178  %endif
4179  %xdefine base_reg r11
4180 %else ; ARCH_X86_32
4181  %if required_stack_alignment <= STACK_ALIGNMENT
4182cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4183  %else
4184cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4185  %endif
4186  %define tmp_stridem dword [stk+0x138]
4187 %endif
4188%endif
4189%if ARCH_X86_32
4190    mov         [esp+0x1f0], t0d
4191    mov         [esp+0x1f4], t1d
4192 %if isput && required_stack_alignment > STACK_ALIGNMENT
4193    mov                dstd, dstm
4194    mov                 dsd, dsm
4195    mov                srcd, srcm
4196    mov                 ssd, ssm
4197    mov                  hd, hm
4198    mov                  r4, mxm
4199  %define r0m  [esp+0x200]
4200  %define dsm  [esp+0x204]
4201  %define dsmp dsm
4202  %define r1m  dsm
4203  %define r2m  [esp+0x208]
4204  %define ssm  [esp+0x20c]
4205  %define r3m  ssm
4206  %define hm   [esp+0x210]
4207  %define mxm  [esp+0x214]
4208    mov                 r0m, dstd
4209    mov                 dsm, dsd
4210    mov                 r2m, srcd
4211    mov                 ssm, ssd
4212    mov                  hm, hd
4213    mov                  r0, mym
4214    mov                  r1, dxm
4215    mov                  r2, dym
4216  %define mym    [esp+0x218]
4217  %define dxm    [esp+0x21c]
4218  %define dym    [esp+0x220]
4219    mov                 mxm, r4
4220    mov                 mym, r0
4221    mov                 dxm, r1
4222    mov                 dym, r2
4223    tzcnt                wd, wm
4224 %endif
4225 %if isput
4226    mov                  r3, pxmaxm
4227  %define pxmaxm r3
4228 %else
4229    mov                  r2, pxmaxm
4230 %endif
4231 %if isprep && required_stack_alignment > STACK_ALIGNMENT
4232  %xdefine base_reg r5
4233 %else
4234  %xdefine base_reg r6
4235 %endif
4236%endif
4237    LEA            base_reg, %1_8tap_scaled_16bpc_ssse3
4238%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
4239%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
4240    tzcnt                wd, wm
4241%endif
4242%if ARCH_X86_64
4243 %if isput
4244    mov                 r7d, pxmaxm
4245 %endif
4246%else
4247 %define m8  m0
4248 %define m9  m1
4249 %define m14 m4
4250 %define m15 m3
4251%endif
4252    movd                 m8, dxm
4253    movd                m14, mxm
4254%if isput
4255    movd                m15, pxmaxm
4256%endif
4257    pshufd               m8, m8, q0000
4258    pshufd              m14, m14, q0000
4259%if isput
4260    pshuflw             m15, m15, q0000
4261    punpcklqdq          m15, m15
4262%endif
4263%if isprep
4264 %if UNIX64
4265    mov                 r5d, t0d
4266  DECLARE_REG_TMP 5, 7
4267 %endif
4268 %if ARCH_X86_64
4269    mov                 r6d, pxmaxm
4270 %endif
4271%endif
4272%if ARCH_X86_64
4273    mov                 dyd, dym
4274%endif
4275%if isput
4276 %if WIN64
4277    mov                 r8d, hm
4278  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
4279  %define hm r5m
4280  %define dxm r8m
4281 %elif ARCH_X86_64
4282  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
4283  %define hm r6m
4284 %else
4285 %endif
4286 %if ARCH_X86_64
4287  %if required_stack_alignment > STACK_ALIGNMENT
4288   %define dsm [rsp+0x138]
4289   %define rX r1
4290   %define rXd r1d
4291  %else
4292   %define dsm dsq
4293   %define rX r14
4294   %define rXd r14d
4295  %endif
4296 %else
4297  %define rX r1
4298 %endif
4299%else ; prep
4300 %if WIN64
4301    mov                 r7d, hm
4302  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
4303  %define hm r4m
4304  %define dxm r7m
4305 %elif ARCH_X86_64
4306  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
4307  %xdefine hm r7m
4308 %endif
4309 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4310 %if ARCH_X86_64
4311  %define rX r14
4312  %define rXd r14d
4313 %else
4314  %define rX r3
4315 %endif
4316%endif
; Load the bitdepth-dependent horizontal rounding constant (m11) and shift
; (m12), plus the vertical rounding/shift pair for the put variant (m13/m12
; high half), then select the vertical filter id and dispatch: dy == 1024 and
; dy == 2048 take dedicated vertical paths (.dy1 / .dy2), anything else goes
; through the per-width jump table.
4317%if ARCH_X86_64
4318    shr                 r7d, 11             ; rounding-table index (presumably from pixel max; set earlier in the macro — TODO confirm)
4319    mova                m10, [base+pd_0x3ff]
4320    movddup             m11, [base+s_8tap_h_rnd+r7*8]
4321    movd                m12, [base+s_8tap_h_sh+r7*4]
4322 %if isput
4323    movddup             m13, [base+put_s_8tap_v_rnd+r7*8]
4324    movd                 m7, [base+put_s_8tap_v_sh+r7*4]
4325  %define pxmaxm [rsp]
4326    mova             pxmaxm, m15            ; spill pixel-max clamp value; m15 is reused below
4327    punpcklqdq          m12, m7             ; m12 = {h shift, v shift}
4328 %endif
4329    lea                ss3q, [ssq*3]
4330    movzx               r7d, t1b
4331    shr                 t1d, 16
4332    cmp                  hd, 6
4333    cmovs               t1d, r7d            ; t1 packs two vertical filter ids; use the low-byte id when h < 6
4334    sub                srcq, ss3q           ; back up 3 rows for the 8-tap vertical window
4335%else
; x86-32: same setup, but SIMD constants live on the stack and registers are
; remapped via the MCT_8TAP_SCALED_REMAP_REGS_* helpers.
4336 %define m10    [base+pd_0x3ff]
4337 %define m11    [esp+0x00]
4338 %define m12    [esp+0x10]
4339    shr                  r3, 11
4340    movddup              m1, [base+s_8tap_h_rnd+r3*8]
4341    movd                 m2, [base+s_8tap_h_sh+r3*4]
4342 %if isput
4343  %define m13    [esp+0x20]
4344  %define pxmaxm [esp+0x30]
4345  %define stk esp+0x40
4346    movddup              m5, [base+put_s_8tap_v_rnd+r3*8]
4347    movd                 m6, [base+put_s_8tap_v_sh+r3*4]
4348    mova             pxmaxm, m15
4349    punpcklqdq           m2, m6
4350    mova                m13, m5
4351 %else
4352  %define m13 [base+pd_m524256]
4353 %endif
4354    mov                 ssd, ssm
4355    mova                m11, m1
4356    mova                m12, m2
4357 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4358    mov                  r1, [esp+0x1f4]    ; packed vertical filter ids (spilled earlier)
4359    lea                  r0, [ssd*3]
4360    movzx                r2, r1b
4361    shr                  r1, 16
4362    cmp            dword hm, 6
4363    cmovs                r1, r2             ; h < 6: take the low-byte filter id (mirrors the 64-bit path)
4364    mov         [esp+0x1f4], r1
4365 %if isprep
4366    mov                  r1, r1m
4367 %endif
4368    mov                  r2, r2m
4369    sub                srcq, r0             ; back up 3 rows
4370 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4371 %define ss3q r0
4372 %define myd r4
4373 %define dyd dword dym
4374 %define hd  dword hm
4375%endif
; Vertical-step dispatch: 1024/2048 are exact 1x/2x row steps with dedicated
; code paths; otherwise jump through the per-width table indexed by wq.
4376    cmp                 dyd, 1024
4377    je .dy1
4378    cmp                 dyd, 2048
4379    je .dy2
4380    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
4381    add                  wq, base_reg
4382    jmp                  wq
; w == 2 path (put only). Horizontal stage: per-pixel 4-tap filters (the
; middle four taps of the 8-tap set, loaded from subpel_filters+idx*8+2),
; two pixels per row packed into one register. Vertical stage: full 8-tap
; filter re-selected every output row from the my fraction (scaled MC), with
; rows advanced by 0/1/2 source lines depending on how far my stepped.
4383%if isput
4384.w2:
4385 %if ARCH_X86_64
4386    mov                 myd, mym
4387    movzx               t0d, t0b
4388    sub                srcq, 2               ; back up 1 pixel (2 bytes) for the 4-tap horizontal window
4389    movd                m15, t0d
4390 %else
4391    movzx                r4, byte [esp+0x1f0]
4392    sub                srcq, 2
4393    movd                m15, r4
4394 %endif
4395    pxor                 m9, m9
4396    punpckldq            m9, m8
4397    paddd               m14, m9 ; mx+dx*[0-1]
4398 %if ARCH_X86_64
4399    mova                 m9, [base+pd_0x4000]
4400 %endif
4401    pshufd              m15, m15, q0000
4402    pand                 m8, m14, m10        ; fractional part of mx per pixel (10-bit)
4403    psrld                m8, 6                ; -> filter index contribution
4404    paddd               m15, m8
4405    movd                r4d, m15
4406    pshufd              m15, m15, q0321
4407 %if ARCH_X86_64
4408    movd                r6d, m15
4409 %else
4410    movd                r3d, m15
4411 %endif
4412    mova                 m5, [base+bdct_lb_q]
4413    mova                 m6, [base+spel_s_shuf2]
4414    movd                m15, [base+subpel_filters+r4*8+2]  ; middle 4 taps for pixel 0
4415 %if ARCH_X86_64
4416    movd                 m7, [base+subpel_filters+r6*8+2]  ; middle 4 taps for pixel 1
4417 %else
4418    movd                 m7, [base+subpel_filters+r3*8+2]
4419 %endif
4420    pxor                 m2, m2
4421    pcmpeqd              m8, m2               ; mask: filter index == 0 (integer position)
4422    psrld               m14, 10               ; integer part of mx per pixel
4423    paddd               m14, m14              ; *2: byte offsets into 16-bit pixels
4424 %if ARCH_X86_32
4425    mov                  r3, r3m
4426    pshufb              m14, m5
4427    paddb               m14, m6
4428    mova              [stk], m14
4429    SWAP                 m5, m0
4430    SWAP                 m6, m3
4431  %define m15 m6
4432 %endif
4433    movu                 m0, [srcq+ssq*0]
4434    movu                 m1, [srcq+ssq*1]
4435    movu                 m2, [srcq+ssq*2]
4436    movu                 m3, [srcq+ss3q ]
4437    lea                srcq, [srcq+ssq*4]
4438    punpckldq           m15, m7
4439 %if ARCH_X86_64
4440    pshufb              m14, m5
4441    paddb               m14, m6               ; m14 = per-pixel gather shuffle for the horizontal taps
4442    pand                 m9, m8
4443    pandn                m8, m15
4444    SWAP                m15, m8
4445    por                 m15, m9               ; integer positions get the identity (0x4000) coefficient
4446    movu                 m4, [srcq+ssq*0]
4447    movu                 m5, [srcq+ssq*1]
4448    movu                 m6, [srcq+ssq*2]
4449    movu                 m7, [srcq+ss3q ]
4450    lea                srcq, [srcq+ssq*4]
4451 %else
4452    pand                 m7, m5, [base+pd_0x4000]
4453    pandn                m5, m15
4454    por                  m5, m7
4455  %define m15 m5
4456 %endif
4457    punpcklbw           m15, m15
4458    psraw               m15, 8                ; sign-extend 8-bit taps to 16-bit
4459    REPX    {pshufb x, m14}, m0, m1, m2, m3
4460    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
4461 %if ARCH_X86_64
4462    REPX    {pshufb x, m14}, m4, m5, m6, m7
4463    REPX   {pmaddwd x, m15}, m4, m5, m6, m7
4464    phaddd               m0, m1
4465    phaddd               m2, m3
4466    phaddd               m4, m5
4467    phaddd               m6, m7
4468    REPX     {paddd x, m11}, m0, m2, m4, m6   ; + horizontal rounding
4469    REPX     {psrad x, m12}, m0, m2, m4, m6   ; >> horizontal shift
4470    packssdw             m0, m2 ; 0 1 2 3
4471    packssdw             m4, m6 ; 4 5 6 7
4472    SWAP                 m1, m4
4473 %else
4474    mova         [stk+0x10], m15
4475    phaddd               m0, m1
4476    phaddd               m2, m3
4477    movu                 m1, [srcq+ssq*0]
4478    movu                 m7, [srcq+ssq*1]
4479    movu                 m6, [srcq+ssq*2]
4480    movu                 m3, [srcq+ss3q ]
4481    lea                srcq, [srcq+ssq*4]
4482    REPX    {pshufb x, m14}, m1, m7, m6, m3
4483    REPX   {pmaddwd x, m15}, m1, m7, m6, m3
4484    phaddd               m1, m7
4485    phaddd               m6, m3
4486    REPX     {paddd x, m11}, m0, m2, m1, m6
4487    REPX     {psrad x, m12}, m0, m2, m1, m6
4488    packssdw             m0, m2
4489    packssdw             m1, m6
4490  %define m14 [stk+0x00]
4491  %define m15 [stk+0x10]
4492 %endif
; Build the interleaved row-pair operands consumed by pmaddwd in the
; vertical loop: m3/m0/m2/m4 hold rows {01,12}/{23,34}/{45,56}/{67,__}.
4493    palignr              m2, m1, m0, 4 ; 1 2 3 4
4494    punpcklwd            m3, m0, m2    ; 01 12
4495    punpckhwd            m0, m2        ; 23 34
4496    pshufd               m5, m1, q0321 ; 5 6 7 _
4497    punpcklwd            m2, m1, m5    ; 45 56
4498    punpckhwd            m4, m1, m5    ; 67 __
4499 %if ARCH_X86_32
4500    mov                 myd, mym
4501    mov                  r0, r0m
4502    mova         [stk+0x20], m3
4503    mova         [stk+0x30], m0
4504    mova         [stk+0x40], m2
4505    mova         [stk+0x50], m4
4506 %endif
; Vertical loop: the my fraction (low 10 bits) selects an 8-tap filter per
; output row; 64<<24 is the identity filter used when the fraction is 0.
4507.w2_loop:
4508    and                 myd, 0x3ff
4509 %if ARCH_X86_64
4510    mov                 r6d, 64 << 24
4511    mov                 r4d, myd
4512    shr                 r4d, 6
4513    lea                 r4d, [t1+r4]
4514    cmovnz              r6q, [base+subpel_filters+r4*8]
4515    movq                m10, r6q
4516    punpcklbw           m10, m10
4517    psraw               m10, 8
4518    pshufd               m7, m10, q0000
4519    pshufd               m8, m10, q1111
4520    pmaddwd              m5, m3, m7
4521    pmaddwd              m6, m0, m8
4522    pshufd               m9, m10, q2222
4523    pshufd              m10, m10, q3333
4524    pmaddwd              m7, m2, m9
4525    pmaddwd              m8, m4, m10
4526    paddd                m5, m6
4527    paddd                m7, m8
4528 %else
4529    mov                  r1, [esp+0x1f4]
4530    xor                  r3, r3
4531    mov                  r5, myd
4532    shr                  r5, 6
4533    lea                  r1, [r1+r5]
4534    mov                  r5, 64 << 24
4535    cmovnz               r3, [base+subpel_filters+r1*8+4]
4536    cmovnz               r5, [base+subpel_filters+r1*8+0]
4537    movd                 m6, r3
4538    movd                 m7, r5
4539    punpckldq            m7, m6
4540    punpcklbw            m7, m7
4541    psraw                m7, 8
4542    pshufd               m5, m7, q0000
4543    pshufd               m6, m7, q1111
4544    pmaddwd              m3, m5
4545    pmaddwd              m0, m6
4546    pshufd               m5, m7, q2222
4547    pshufd               m7, m7, q3333
4548    pmaddwd              m2, m5
4549    pmaddwd              m4, m7
4550    paddd                m3, m0
4551    paddd                m2, m4
4552    SWAP                 m5, m3
4553    SWAP                 m7, m2
4554  %define m8 m3
4555 %endif
4556    paddd                m5, m13              ; + vertical rounding
4557    pshufd               m6, m12, q1032       ; vertical shift (high half of m12)
4558    pxor                 m8, m8
4559    paddd                m5, m7
4560    psrad                m5, m6
4561    packssdw             m5, m5
4562    pmaxsw               m5, m8               ; clamp to [0, pixel max]
4563    pminsw               m5, pxmaxm
4564    movd             [dstq], m5
4565    add                dstq, dsmp
4566    dec                  hd
4567    jz .ret
4568 %if ARCH_X86_64
4569    add                 myd, dyd
4570 %else
4571    add                 myd, dym
4572 %endif
4573    test                myd, ~0x3ff           ; did my cross into a new source row?
4574 %if ARCH_X86_32
4575    SWAP                 m3, m5
4576    SWAP                 m2, m7
4577    mova                 m3, [stk+0x20]
4578    mova                 m0, [stk+0x30]
4579    mova                 m2, [stk+0x40]
4580    mova                 m4, [stk+0x50]
4581 %endif
4582    jz .w2_loop                              ; no: reuse the current 8-row window
4583 %if ARCH_X86_32
4584    mov                  r3, r3m
4585 %endif
4586    movu                 m5, [srcq]
4587    test                myd, 0x400
4588    jz .w2_skip_line                         ; 0x400 clear: advance two rows; set: advance one
4589    add                srcq, ssq
; Shift the window down by one row and filter the new bottom row.
4589    shufps               m3, m0, q1032      ; 01 12
4590    shufps               m0, m2, q1032      ; 23 34
4591    shufps               m2, m4, q1032      ; 45 56
4592    pshufb               m5, m14
4593    pmaddwd              m5, m15
4594    phaddd               m5, m5
4595    paddd                m5, m11
4596    psrad                m5, m12
4597    packssdw             m5, m5
4598    palignr              m4, m5, m1, 12
4599    punpcklqdq           m1, m4, m4         ; 6 7 6 7
4600    punpcklwd            m4, m1, m5         ; 67 __
4601 %if ARCH_X86_32
4602    mova         [stk+0x20], m3
4603    mova         [stk+0x30], m0
4604    mova         [stk+0x40], m2
4605    mova         [stk+0x50], m4
4606 %endif
4607    jmp .w2_loop
; Window advanced by two rows: filter two new rows at once.
4608.w2_skip_line:
4609    movu                 m6, [srcq+ssq*1]
4610    lea                srcq, [srcq+ssq*2]
4611    mova                 m3, m0             ; 01 12
4612    mova                 m0, m2             ; 23 34
4613    pshufb               m5, m14
4614    pshufb               m6, m14
4615    pmaddwd              m5, m15
4616    pmaddwd              m6, m15
4617    phaddd               m5, m6
4618    paddd                m5, m11
4619    psrad                m5, m12
4620    packssdw             m5, m5             ; 6 7 6 7
4621    punpckhqdq           m1, m5             ; 4 5 6 7
4622    pshufd               m5, m1, q0321      ; 5 6 7 _
4623    punpcklwd            m2, m1, m5         ; 45 56
4624    punpckhwd            m4, m1, m5         ; 67 __
4625 %if ARCH_X86_32
4626    mova         [stk+0x20], m3
4627    mova         [stk+0x30], m0
4628    mova         [stk+0x40], m2
4629    mova         [stk+0x50], m4
4630 %endif
4631    jmp .w2_loop
4632%endif
; w == 4 path (put and prep). Horizontal stage: four per-pixel 4-tap filters
; (middle taps of the 8-tap set), gathered with two pshufb control masks
; (m12 low pair / m14 high pair) since the four x positions are independent
; under scaling; the x86-32 body offloads row filtering to MC_4TAP_SCALED_H.
; Vertical stage: 8-row window, 8-tap filter re-selected per output row.
4634INIT_XMM ssse3
4635.w4:
4636%if ARCH_X86_64
4637    mov                 myd, mym
4638    mova         [rsp+0x10], m11              ; spill h rounding/shift; SIMD regs are needed below
4639    mova         [rsp+0x20], m12
4640 %if isput
4641    mova         [rsp+0x30], m13
4642 %endif
4643    movzx               t0d, t0b
4644    sub                srcq, 2                ; back up 1 pixel for the 4-tap window
4645    movd                m15, t0d
4646%else
4647 %define m8  m0
4648 %xdefine m14 m4
4649 %define m15 m3
4650    movzx                r4, byte [esp+0x1f0]
4651    sub                srcq, 2
4652    movd                m15, r4
4653%endif
4654    pmaddwd              m8, [base+rescale_mul]
4655%if ARCH_X86_64
4656    mova                 m9, [base+pd_0x4000]
4657%else
4658 %define m9 [base+pd_0x4000]
4659%endif
4660    pshufd              m15, m15, q0000
4661    paddd               m14, m8 ; mx+dx*[0-3]
4662    pand                 m0, m14, m10         ; 10-bit fraction per pixel
4663    psrld                m0, 6
4664    paddd               m15, m0                ; four filter indices
4665    pshufd               m7, m15, q1032
4666%if ARCH_X86_64
4667    movd                r4d, m15
4668    movd               r11d, m7
4669    pshufd              m15, m15, q0321
4670    pshufd               m7, m7, q0321
4671    movd                r6d, m15
4672    movd               r13d, m7
4673    mova                m10, [base+bdct_lb_q+ 0]
4674    mova                m11, [base+bdct_lb_q+16]
4675    movd                m13, [base+subpel_filters+ r4*8+2]  ; middle 4 taps, pixel 0
4676    movd                 m2, [base+subpel_filters+ r6*8+2]  ; pixel 1
4677    movd                m15, [base+subpel_filters+r11*8+2]  ; pixel 2
4678    movd                 m4, [base+subpel_filters+r13*8+2]  ; pixel 3
4679%else
4680    movd                 r0, m15
4681    movd                 r4, m7
4682    pshufd              m15, m15, q0321
4683    pshufd               m7, m7, q0321
4684    movd                 rX, m15
4685    movd                 r5, m7
4686    mova                 m5, [base+bdct_lb_q+ 0]
4687    mova                 m6, [base+bdct_lb_q+16]
4688    movd                 m1, [base+subpel_filters+r0*8+2]
4689    movd                 m2, [base+subpel_filters+rX*8+2]
4690    movd                 m3, [base+subpel_filters+r4*8+2]
4691    movd                 m7, [base+subpel_filters+r5*8+2]
4692    movifprep            r3, r3m
4693    SWAP                 m4, m7
4694 %define m10 m5
4695 %define m11 m6
4696 %define m12 m1
4697 %define m13 m1
4698%endif
4699    psrld               m14, 10                ; integer x offsets
4700    paddd               m14, m14               ; *2: byte offsets (16-bit pixels)
4701    punpckldq           m13, m2
4702    punpckldq           m15, m4
4703    punpcklqdq          m13, m15               ; all four 4-tap filters packed
4704    pxor                 m2, m2
4705    pcmpeqd              m0, m2                 ; mask: integer-position pixels
4706%if ARCH_X86_64
4707    pand                 m9, m0
4708%else
4709    pand                 m2, m9, m0
4710 %define m9 m2
4711    SWAP                 m7, m4
4712%endif
4713    pandn                m0, m13
4714%if ARCH_X86_64
4715    SWAP                m13, m0
4716%else
4717 %define m13 m0
4718%endif
4719    por                 m13, m9                ; integer positions get identity coefficient 0x4000
4720    punpckhbw           m15, m13, m13
4721    punpcklbw           m13, m13
4722    psraw               m15, 8                 ; sign-extend taps to 16-bit (high pixel pair)
4723    psraw               m13, 8                 ; (low pixel pair)
4724    pshufb              m12, m14, m10          ; gather shuffle, pixels 0-1
4725    pshufb              m14, m11               ; gather shuffle, pixels 2-3
4726    mova                m10, [base+spel_s_shuf2]
4727    movd                r4d, m14
4728    shr                 r4d, 24                ; byte offset of the second load (pixels 2-3 window)
4729%if ARCH_X86_32
4730    mova         [stk+0x20], m13
4731    mova         [stk+0x30], m15
4732    pxor                 m2, m2
4733%endif
4734    pshufb               m7, m14, m2
4735    psubb               m14, m7                ; make the second shuffle relative to its own base load
4736    paddb               m12, m10
4737    paddb               m14, m10
4738%if ARCH_X86_64
4739    lea                  r6, [r4+ssq*1]
4740    lea                 r11, [r4+ssq*2]
4741    lea                 r13, [r4+ss3q ]
; Horizontal-filter the first 8 rows; each row needs two loads (low/high
; pixel pairs may straddle due to scaling).
4742    movu                 m7, [srcq+ssq*0]
4743    movu                 m9, [srcq+ssq*1]
4744    movu                 m8, [srcq+ssq*2]
4745    movu                m10, [srcq+ss3q ]
4746    movu                 m1, [srcq+r4   ]
4747    movu                 m3, [srcq+r6   ]
4748    movu                 m2, [srcq+r11  ]
4749    movu                 m4, [srcq+r13  ]
4750    lea                srcq, [srcq+ssq*4]
4751    REPX    {pshufb x, m12}, m7, m9, m8, m10
4752    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
4753    REPX    {pshufb x, m14}, m1, m2, m3, m4
4754    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
4755    mova                 m5, [rsp+0x10]
4756    movd                xm6, [rsp+0x20]
4757    phaddd               m7, m1
4758    phaddd               m9, m3
4759    phaddd               m8, m2
4760    phaddd              m10, m4
4761    movu                 m1, [srcq+ssq*0]
4762    movu                 m2, [srcq+ssq*1]
4763    movu                 m3, [srcq+ssq*2]
4764    movu                 m4, [srcq+ss3q ]
4765    REPX      {paddd x, m5}, m7, m9, m8, m10   ; + horizontal rounding
4766    REPX     {psrad x, xm6}, m7, m9, m8, m10   ; >> horizontal shift
4767    packssdw             m7, m9  ; 0 1
4768    packssdw             m8, m10 ; 2 3
4769    movu                 m0, [srcq+r4   ]
4770    movu                 m9, [srcq+r6   ]
4771    movu                m10, [srcq+r11  ]
4772    movu                m11, [srcq+r13  ]
4773    lea                srcq, [srcq+ssq*4]
4774    REPX    {pshufb x, m12}, m1, m2, m3, m4
4775    REPX   {pmaddwd x, m13}, m1, m2, m3, m4
4776    REPX    {pshufb x, m14}, m0, m9, m10, m11
4777    REPX   {pmaddwd x, m15}, m0, m9, m10, m11
4778    phaddd               m1, m0
4779    phaddd               m2, m9
4780    phaddd               m3, m10
4781    phaddd               m4, m11
4782    REPX      {paddd x, m5}, m1, m2, m3, m4
4783    REPX     {psrad x, xm6}, m1, m2, m3, m4
4784    packssdw             m1, m2 ; 4 5
4785    packssdw             m3, m4 ; 6 7
4786    SWAP                 m9, m1
; Build interleaved row-pair operands (01/12/.../67) for the vertical stage;
; odd pairs (12/34/56) are spilled to the stack for the 1-row-advance path.
4787    shufps               m4, m7, m8, q1032  ; 1 2
4788    shufps               m5, m8, m9, q1032  ; 3 4
4789    shufps               m6, m9, m3, q1032  ; 5 6
4790    pshufd              m10, m3, q1032      ; 7 _
4791    punpcklwd            m0, m7, m4 ; 01
4792    punpckhwd            m7, m4     ; 12
4793    punpcklwd            m1, m8, m5 ; 23
4794    punpckhwd            m8, m5     ; 34
4795    punpcklwd            m2, m9, m6 ; 45
4796    punpckhwd            m9, m6     ; 56
4797    punpcklwd            m3, m10    ; 67
4798    mova         [rsp+0x40], m7
4799    mova         [rsp+0x50], m8
4800    mova         [rsp+0x60], m9
4801%else
4802    mova         [stk+0x00], m12
4803    mova         [stk+0x10], m14
4804    add                  r4, srcq
4805    MC_4TAP_SCALED_H   0x40 ; 0 1
4806    MC_4TAP_SCALED_H   0x50 ; 2 3
4807    MC_4TAP_SCALED_H   0x60 ; 4 5
4808    MC_4TAP_SCALED_H   0x70 ; 6 7
4809    mova                 m4, [stk+0x40]
4810    mova                 m5, [stk+0x50]
4811    mova                 m6, [stk+0x60]
4812    mova                 m7, [stk+0x70]
4813    mov          [stk+0xc0], r4               ; save the second-load pointer for the loop tail
4814    shufps               m1, m4, m5, q1032 ; 1 2
4815    shufps               m2, m5, m6, q1032 ; 3 4
4816    shufps               m3, m6, m7, q1032 ; 5 6
4817    pshufd               m0, m7, q1032     ; 7 _
4818    mova         [stk+0xb0], m0
4819    punpcklwd            m0, m4, m1         ; 01
4820    punpckhwd            m4, m1             ; 12
4821    punpcklwd            m1, m5, m2         ; 23
4822    punpckhwd            m5, m2             ; 34
4823    punpcklwd            m2, m6, m3         ; 45
4824    punpckhwd            m6, m3             ; 56
4825    punpcklwd            m3, m7, [stk+0xb0] ; 67
4826    mov                 myd, mym
4827    mov                  r0, r0m
4828    mova         [stk+0x40], m0 ; 01
4829    mova         [stk+0x50], m1 ; 23
4830    mova         [stk+0x60], m2 ; 45
4831    mova         [stk+0x70], m3 ; 67
4832    mova         [stk+0x80], m4 ; 12
4833    mova         [stk+0x90], m5 ; 34
4834    mova         [stk+0xa0], m6 ; 56
4835 %define m12 [stk+0x00]
4836 %define m14 [stk+0x10]
4837 %define m13 [stk+0x20]
4838 %define m15 [stk+0x30]
4839 %define hrnd_mem [esp+0x00]
4840 %define hsh_mem  [esp+0x10]
4841 %if isput
4842  %define vrnd_mem [esp+0x20]
4843 %else
4844  %define vrnd_mem [base+pd_m524256]
4845 %endif
4846%endif
; Vertical loop: my fraction selects the 8-tap filter (64<<24 = identity
; when the fraction is 0), accumulate four row-pair pmaddwds, round/shift,
; then clamp+store (put) or store intermediates (prep).
4847.w4_loop:
4848    and                 myd, 0x3ff
4849%if ARCH_X86_64
4850    mov                r11d, 64 << 24
4851    mov                r13d, myd
4852    shr                r13d, 6
4853    lea                r13d, [t1+r13]
4854    cmovnz             r11q, [base+subpel_filters+r13*8]
4855    movq                 m9, r11q
4856    punpcklbw            m9, m9
4857    psraw                m9, 8
4858    pshufd               m7, m9, q0000
4859    pshufd               m8, m9, q1111
4860    pmaddwd              m4, m0, m7
4861    pmaddwd              m5, m1, m8
4862    pshufd               m7, m9, q2222
4863    pshufd               m9, m9, q3333
4864    pmaddwd              m6, m2, m7
4865    pmaddwd              m8, m3, m9
4866 %if isput
4867    movd                 m9, [rsp+0x28]       ; vertical shift
4868  %define vrnd_mem [rsp+0x30]
4869 %else
4870  %define vrnd_mem [base+pd_m524256]
4871 %endif
4872    paddd                m4, m5
4873    paddd                m6, m8
4874    paddd                m4, m6
4875    paddd                m4, vrnd_mem
4876%else
4877    mov                 mym, myd
4878    mov                  r5, [esp+0x1f4]
4879    xor                  r3, r3
4880    shr                  r4, 6
4881    lea                  r5, [r5+r4]
4882    mov                  r4, 64 << 24
4883    cmovnz               r4, [base+subpel_filters+r5*8+0]
4884    cmovnz               r3, [base+subpel_filters+r5*8+4]
4885    movd                 m7, r4
4886    movd                 m6, r3
4887    punpckldq            m7, m6
4888    punpcklbw            m7, m7
4889    psraw                m7, 8
4890    pshufd               m4, m7, q0000
4891    pshufd               m5, m7, q1111
4892    pshufd               m6, m7, q2222
4893    pshufd               m7, m7, q3333
4894    pmaddwd              m0, m4
4895    pmaddwd              m1, m5
4896    pmaddwd              m2, m6
4897    pmaddwd              m3, m7
4898 %if isput
4899    movd                 m4, [esp+0x18]
4900 %endif
4901    paddd                m0, m1
4902    paddd                m2, m3
4903    paddd                m0, vrnd_mem
4904    paddd                m0, m2
4905    SWAP                 m4, m0
4906 %define m9 m0
4907%endif
4908%if isput
4909    pxor                 m5, m5
4910    psrad                m4, m9
4911    packssdw             m4, m4
4912    pmaxsw               m4, m5               ; clamp to [0, pixel max]
4913    pminsw               m4, pxmaxm
4914    movq             [dstq], m4
4915    add                dstq, dsmp
4916%else
4917    psrad                m4, 6                ; prep: fixed intermediate shift
4918    packssdw             m4, m4
4919    movq             [tmpq], m4
4920    add                tmpq, 8
4921%endif
4922    dec                  hd
4923    jz .ret
4924%if ARCH_X86_64
4925    add                 myd, dyd
4926    test                myd, ~0x3ff           ; crossed into a new source row?
4927    jz .w4_loop
4928    mova                 m8, [rsp+0x10]
4929    movd                 m9, [rsp+0x20]
4930    movu                 m4, [srcq]
4931    movu                 m5, [srcq+r4]
4932    test                myd, 0x400
4933    jz .w4_skip_line                          ; 0x400 clear: advance two rows; set: one row
; Advance one row: rotate the window and filter the new bottom row.
4933    mova                 m0, [rsp+0x40]
4934    mova         [rsp+0x40], m1
4935    mova                 m1, [rsp+0x50]
4936    mova         [rsp+0x50], m2
4937    mova                 m2, [rsp+0x60]
4938    mova         [rsp+0x60], m3
4939    pshufb               m4, m12
4940    pshufb               m5, m14
4941    pmaddwd              m4, m13
4942    pmaddwd              m5, m15
4943    phaddd               m4, m5
4944    paddd                m4, m8
4945    psrad                m4, m9
4946    packssdw             m4, m4
4947    punpcklwd            m3, m10, m4
4948    mova                m10, m4
4949    add                srcq, ssq
4950    jmp .w4_loop
; Advance two rows: filter both new rows in one pass.
4952.w4_skip_line:
4953    movu                 m6, [srcq+ssq*1]
4954    movu                 m7, [srcq+r6]
4955    mova                 m0, [rsp+0x50]
4956    mova                m11, [rsp+0x60]
4957    pshufb               m4, m12
4958    pshufb               m6, m12
4959    pshufb               m5, m14
4960    pshufb               m7, m14
4961    pmaddwd              m4, m13
4962    pmaddwd              m6, m13
4963    pmaddwd              m5, m15
4964    pmaddwd              m7, m15
4965    mova         [rsp+0x40], m0
4966    mova         [rsp+0x50], m11
4967    phaddd               m4, m5
4968    phaddd               m6, m7
4969    paddd                m4, m8
4970    paddd                m6, m8
4971    psrad                m4, m9
4972    psrad                m6, m9
4973    packssdw             m4, m6
4974    punpcklwd            m9, m10, m4
4975    mova         [rsp+0x60], m9
4976    pshufd              m10, m4, q1032
4977    mova                 m0, m1
4978    mova                 m1, m2
4979    mova                 m2, m3
4980    punpcklwd            m3, m4, m10
4981    lea                srcq, [srcq+ssq*2]
4982    jmp .w4_loop
4983%else
; x86-32 tail: all window state lives on the stack; [stk+0xc0] tracks the
; pointer for the second (pixels 2-3) load.
4984    SWAP                 m0, m4
4985    mov                 myd, mym
4986    mov                  r3, r3m
4987    add                 myd, dym
4988    test                myd, ~0x3ff
4989    jnz .w4_next_line
4990    mova                 m0, [stk+0x40]
4991    mova                 m1, [stk+0x50]
4992    mova                 m2, [stk+0x60]
4993    mova                 m3, [stk+0x70]
4994    jmp .w4_loop
4995.w4_next_line:
4996    mov                  r5, [stk+0xc0]
4997    movu                 m4, [srcq]
4998    movu                 m5, [r5]
4999    test                myd, 0x400
5000    jz .w4_skip_line
; Advance one row: rotate spilled row pairs and append the filtered new row.
5001    add          [stk+0xc0], ssq
5002    mova                 m0, [stk+0x80]
5003    mova                 m3, [stk+0x50]
5004    mova         [stk+0x40], m0
5005    mova         [stk+0x80], m3
5006    mova                 m1, [stk+0x90]
5007    mova                 m6, [stk+0x60]
5008    mova         [stk+0x50], m1
5009    mova         [stk+0x90], m6
5010    mova                 m2, [stk+0xa0]
5011    mova                 m7, [stk+0x70]
5012    mova         [stk+0x60], m2
5013    mova         [stk+0xa0], m7
5014    pshufb               m4, m12
5015    pshufb               m5, m14
5016    pmaddwd              m4, m13
5017    pmaddwd              m5, m15
5018    phaddd               m4, m5
5019    paddd                m4, hrnd_mem
5020    psrad                m4, hsh_mem
5021    packssdw             m4, m4
5022    punpcklwd            m3, [stk+0xb0], m4
5023    mova         [stk+0xb0], m4
5024    mova         [stk+0x70], m3
5025    add                srcq, ssq
5026    jmp .w4_loop
; Advance two rows: filter both, rebuild the interleaved pairs on the stack.
5027.w4_skip_line:
5028    movu                 m6, [srcq+ssq*1]
5029    movu                 m7, [r5  +ssq*1]
5030    lea                  r5, [r5  +ssq*2]
5031    mov          [stk+0xc0], r5
5032    mova                 m0, [stk+0x50]
5033    mova                 m1, [stk+0x60]
5034    mova                 m2, [stk+0x70]
5035    mova                 m3, [stk+0x90]
5036    pshufb               m4, m12
5037    pshufb               m6, m12
5038    pshufb               m5, m14
5039    pshufb               m7, m14
5040    pmaddwd              m4, m13
5041    pmaddwd              m6, m13
5042    pmaddwd              m5, m15
5043    pmaddwd              m7, m15
5044    mova         [stk+0x40], m0
5045    mova         [stk+0x50], m1
5046    mova         [stk+0x60], m2
5047    mova         [stk+0x80], m3
5048    phaddd               m4, m5
5049    phaddd               m6, m7
5050    mova                 m5, [stk+0xa0]
5051    mova                 m7, [stk+0xb0]
5052    paddd                m4, hrnd_mem
5053    paddd                m6, hrnd_mem
5054    psrad                m4, hsh_mem
5055    psrad                m6, hsh_mem
5056    packssdw             m4, m6
5057    punpcklwd            m7, m4
5058    pshufd               m6, m4, q1032
5059    mova         [stk+0x90], m5
5060    mova         [stk+0xa0], m7
5061    mova         [stk+0xb0], m6
5062    punpcklwd            m3, m4, m6
5063    mova         [stk+0x70], m3
5064    lea                srcq, [srcq+ssq*2]
5065    jmp .w4_loop
5066%endif
; Widths >= 8 are processed as w/8 column tiles of 8 pixels each.
; [stk+0xf0] holds the remaining tile count; tmp_stridem is the prep output
; row stride in bytes (w * 2 for 16-bit intermediates).
5067INIT_XMM ssse3
5068%if ARCH_X86_64
5069 %define stk rsp+0x20
5070%endif
5071.w8:
5072    mov    dword [stk+0xf0], 1
5073    movifprep   tmp_stridem, 16
5074    jmp .w_start
5075.w16:
5076    mov    dword [stk+0xf0], 2
5077    movifprep   tmp_stridem, 32
5078    jmp .w_start
5079.w32:
5080    mov    dword [stk+0xf0], 4
5081    movifprep   tmp_stridem, 64
5082    jmp .w_start
5083.w64:
5084    mov    dword [stk+0xf0], 8
5085    movifprep   tmp_stridem, 128
5086    jmp .w_start
5087.w128:
5088    mov    dword [stk+0xf0], 16
5089    movifprep   tmp_stridem, 256
; Common tile-loop setup: save per-tile restart state (dx step, mx vector,
; src base, dst/tmp pointer, and on x86-32 also my and h) before .hloop.
5090.w_start:
5091%if ARCH_X86_64
5092 %ifidn %1, put
5093    movifnidn           dsm, dsq
5094 %endif
5095    mova         [rsp+0x10], m11
5096 %define hround m11
5097    shr                 t0d, 16
5098    movd                m15, t0d
5099 %if isprep
5100    mova                m13, [base+pd_m524256]
5101 %endif
5102%else
5103 %define hround [esp+0x00]
5104 %define m12    [esp+0x10]
5105 %define m10    [base+pd_0x3ff]
5106 %define m8  m0
5107 %xdefine m14 m4
5108 %define m15 m3
5109 %if isprep
5110  %define ssq ssm
5111 %endif
5112    mov                  r4, [esp+0x1f0]
5113    shr                  r4, 16
5114    movd                m15, r4
5115    mov                  r0, r0m
5116    mov                 myd, mym
5117%endif
5118    sub                srcq, 6                ; back up 3 pixels (6 bytes) for the 8-tap horizontal window
5119    pslld                m7, m8, 2 ; dx*4
5120    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
5121    pshufd              m15, m15, q0000
5122    paddd               m14, m8 ; mx+dx*[0-3]
5123    mova        [stk+0x100], m7
5124    mova        [stk+0x120], m15
5125    mov         [stk+0x0f8], srcq
5126    mov         [stk+0x130], r0q ; dstq / tmpq
5127%if ARCH_X86_64 && UNIX64
5128    mov                  hm, hd
5129%elif ARCH_X86_32
5130    mov                  r5, hm
5131    mov         [stk+0x0f4], myd
5132    mov         [stk+0x134], r5
5133%endif
5134    jmp .hloop
; Finished one 8-px column tile: step to the next one (dst/tmp advances by
; 16 bytes = 8 16-bit pixels) and restore the saved per-tile state, or
; return when all tiles are done.
5135.hloop_prep:
5136    dec   dword [stk+0x0f0]
5137    jz .ret
5138%if ARCH_X86_64
5139    add   qword [stk+0x130], 16
5140    mov                  hd, hm
5141%else
5142    add   dword [stk+0x130], 16
5143    mov                 myd, [stk+0x0f4]
5144    mov                  r5, [stk+0x134]
5145    mov                  r0, [stk+0x130]
5146%endif
5147    mova                 m7, [stk+0x100]
5148    mova                m14, [stk+0x110]
5149%if ARCH_X86_64
5150    mova                m10, [base+pd_0x3ff]
5151    mova                m11, [rsp+0x10]
5152%endif
5153    mova                m15, [stk+0x120]
5154    mov                srcq, [stk+0x0f8]
5155%if ARCH_X86_64
5156    mov                 r0q, [stk+0x130] ; dstq / tmpq
5157%else
5158    mov                 mym, myd
5159    mov                  hm, r5
5160    mov                 r0m, r0
5161    mov                  r3, r3m
5162%endif
5163    paddd               m14, m7               ; mx += dx*4: advance to the next tile's x positions
5164.hloop:
5165%if ARCH_X86_64
5166    mova                 m9, [base+pq_0x40000000]
5167%else
5168 %define m9 [base+pq_0x40000000]
5169%endif
5170    pxor                 m1, m1
5171    psrld                m2, m14, 10
5172    mova              [stk], m2
5173    pand                 m6, m14, m10
5174    psrld                m6, 6
5175    paddd                m5, m15, m6
5176    pcmpeqd              m6, m1
5177    pshufd               m2, m5, q1032
5178%if ARCH_X86_64
5179    movd                r4d, m5
5180    movd                r6d, m2
5181    pshufd               m5, m5, q0321
5182    pshufd               m2, m2, q0321
5183    movd                r7d, m5
5184    movd                r9d, m2
5185    movq                 m0, [base+subpel_filters+r4*8]
5186    movq                 m1, [base+subpel_filters+r6*8]
5187    movhps               m0, [base+subpel_filters+r7*8]
5188    movhps               m1, [base+subpel_filters+r9*8]
5189%else
5190    movd                 r0, m5
5191    movd                 rX, m2
5192    pshufd               m5, m5, q0321
5193    pshufd               m2, m2, q0321
5194    movd                 r4, m5
5195    movd                 r5, m2
5196    movq                 m0, [base+subpel_filters+r0*8]
5197    movq                 m1, [base+subpel_filters+rX*8]
5198    movhps               m0, [base+subpel_filters+r4*8]
5199    movhps               m1, [base+subpel_filters+r5*8]
5200%endif
5201    paddd               m14, m7 ; mx+dx*[4-7]
5202    pand                 m5, m14, m10
5203    psrld                m5, 6
5204    paddd               m15, m5
5205    pxor                 m2, m2
5206    pcmpeqd              m5, m2
5207    mova        [stk+0x110], m14
5208    pshufd               m4, m15, q1032
%if ARCH_X86_64
; 64-bit: gather the 8-tap filters for pixels 4-7, sign-extend every filter
; from int8 to int16 pairs, spill them to stk+0x10..0x80, horizontally filter
; the first eight source rows, and interleave the rows into (a,b) pair form
; for the vertical pass.
    movd               r10d, m15
    movd               r11d, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
; integer part of x (pos >> 10) = per-pixel source column offset; extract the
; four offsets for pixels 4-7 into r10/r11/r13/rX
    psrld               m14, 10
    movq                r11, m14
    punpckhqdq          m14, m14
    movq                 rX, m14
    mov                r10d, r11d
    shr                 r11, 32
    mov                r13d, rXd
    shr                  rX, 32
; offsets for pixels 0-3 were spilled to stk+0x00..0x0c before this excerpt
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
; substitute the identity-filter constant (m9; presumably pd_0x4000 as loaded
; in the narrow-width paths) into lanes whose subpel index is 0
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd              m14, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m7, m9, m4
    pand                 m8, m9, m6
    pand                m15, m9, m14
    pand                 m9, m9, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn               m14, m2
    pandn                m5, m3
    por                  m7, m4
    por                  m8, m6
    por                 m15, m14
    por                  m9, m5
; sign-extend the int8 filter taps to int16 (duplicate + arithmetic shift)
    punpcklbw            m0, m7, m7
    punpckhbw            m7, m7
    punpcklbw            m1, m8, m8
    punpckhbw            m8, m8
    psraw                m0, 8
    psraw                m7, 8
    psraw                m1, 8
    psraw                m8, 8
    punpcklbw            m2, m15, m15
    punpckhbw           m15, m15
    punpcklbw            m3, m9, m9
    punpckhbw            m9, m9
    psraw                m2, 8
    psraw               m15, 8
    psraw                m3, 8
    psraw                m9, 8
; spill the expanded filters; MC_8TAP_SCALED_H and the row-advance code below
; (pmaddwd [stk+0x10..0x80]) reuse them
    mova         [stk+0x10], m0
    mova         [stk+0x20], m7
    mova         [stk+0x30], m1
    mova         [stk+0x40], m8
    mova         [stk+0x50], m2
    mova         [stk+0x60], m15
    mova         [stk+0x70], m3
    mova         [stk+0x80], m9
; horizontally filter the first eight source rows (trailing ';N' = row number)
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova         [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova         [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova         [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova         [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova         [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova                 m5, [stk+0xd0]
    mova                 m1, [stk+0x90]
    mova                 m2, [stk+0xa0]
    mova                 m3, [stk+0xb0]
    mova                 m9, [stk+0xc0]
    mov                 myd, mym
    mov                 dyd, dym
; interleave adjacent rows word-wise so the vertical pass can use pmaddwd
; on (row N, row N+1) pairs; 'a'/'b' = low/high half of the 8 columns
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m9 ; 23a
    punpckhwd            m3, m9     ; 23b
    mova         [stk+0x90], m4
    mova         [stk+0xa0], m5
    mova         [stk+0xb0], m6
    mova         [stk+0xc0], m7
 %define hround [rsp+0x10]
; vertical loop: the y filter is reselected every row because my advances by
; dy per output row
.vloop:
    and                 myd, 0x3ff
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]     ; t1 = y filter-set base (set up earlier)
; keep the identity filter (64 << 24) when the subpel y index is zero
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq                m11, r6q
    punpcklbw           m11, m11
    psraw               m11, 8
    pshufd               m5, m11, q0000
    pshufd               m7, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
; accumulate the four 2-tap products over the 01/23/45/67 row pairs
    pmaddwd              m4, m5, m0
    pmaddwd              m5, m5, m1
    pmaddwd              m6, m7, m2
    pmaddwd              m7, m7, m3
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
    pmaddwd              m6, [stk+0x90], m10
    pmaddwd              m7, [stk+0xa0], m10
    pmaddwd              m8, [stk+0xb0], m11
    pmaddwd              m9, [stk+0xc0], m11
    paddd                m4, m6
    paddd                m5, m7
 %if isput
; put: the final right-shift amount sits in the high dword of m12 (used by
; the shared store code below)
    pshufd               m6, m12, q1032
 %endif
    paddd                m4, m8
    paddd                m5, m9
%else
; 32-bit: same per-column filter selection, 8-row horizontal filter and
; vertical loop as the 64-bit branch above, but with only 8 XMM registers,
; so the expanded filters and interleaved row pairs are staged on the stack.
    movd                 r0, m15
    movd                 rX, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [stk+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
; integer part of x -> per-pixel source offsets, kept at stk+16 for the
; horizontal-filter macro
    psrld               m14, 10
    mova           [stk+16], m14
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
; substitute the identity filter (m9) where the subpel index is 0
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m9, m4
    pand                 m1, m9, m6
    pand                 m2, m9, m7
    pand                 m3, m9, m5
    pandn                m4, [stk+0x20]
    pandn                m6, [stk+0x30]
    pandn                m7, [stk+0x40]
    pandn                m5, [stk+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
; sign-extend filter bytes to words
    punpcklbw            m4, m0, m0
    punpckhbw            m0, m0
    punpcklbw            m5, m1, m1
    punpckhbw            m1, m1
    psraw                m4, 8
    psraw                m0, 8
    psraw                m5, 8
    psraw                m1, 8
    punpcklbw            m6, m2, m2
    punpckhbw            m2, m2
    punpcklbw            m7, m3, m3
    punpckhbw            m3, m3
    psraw                m6, 8
    psraw                m2, 8
    psraw                m7, 8
    psraw                m3, 8
    mova        [stk+0x0a0], m4
    mova        [stk+0x0b0], m0
    mova        [stk+0x0c0], m5
    mova        [stk+0x0d0], m1
    mova        [stk+0x140], m6
    mova        [stk+0x150], m2
    mova        [stk+0x160], m7
    mova        [stk+0x170], m3
; horizontally filter rows 0-7 (stack-offset variant of the macro)
    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
    mova                 m5, [stk+0x60]
    mova                 m6, [stk+0x70]
    mova                 m7, [stk+0x80]
    mova                 m0, [stk+0x90]
    mov                 myd, mym
; interleave rows into (a,b) pairs for the vertical pmaddwd pass
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova                 m1, [stk+0x20]
    mova                 m2, [stk+0x30]
    mova                 m3, [stk+0x40]
    mova                 m4, [stk+0x50]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
; vertical loop (32-bit): rebuild the y filter from its two 4-byte halves
; each iteration
.vloop:
    mov                  r0, r0m
    mov                  r5, [esp+0x1f4]  ; y filter-set base (spilled earlier)
    and                 myd, 0x3ff
    mov                 mym, myd
    xor                  r3, r3
    shr                  r4, 6
    lea                  r5, [r5+r4]
    mov                  r4, 64 << 24
; keep the identity filter when the subpel y index is zero
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    movd                 m7, r4
    movd                 m6, r3
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m4, m7, q0000
    pshufd               m5, m7, q1111
    pmaddwd              m0, m4
    pmaddwd              m1, m4
    pmaddwd              m2, m5
    pmaddwd              m3, m5
    pshufd               m6, m7, q2222
    pshufd               m7, m7, q3333
    paddd                m0, m2
    paddd                m1, m3
    pmaddwd              m2, [stk+0x60], m6
    pmaddwd              m3, [stk+0x70], m6
    pmaddwd              m4, [stk+0x80], m7
    pmaddwd              m5, [stk+0x90], m7
 %if isput
; put: load the final right-shift amount for the shared store code below
    movd                 m6, [esp+0x18]
 %endif
    paddd                m0, m2
    paddd                m1, m3
    paddd                m0, vrnd_mem
    paddd                m1, vrnd_mem
    paddd                m4, m0
    paddd                m5, m1
%endif
; Store one filtered output row (shared by both arch branches; %1 is the
; macro's put/prep argument).
%ifidn %1, put
; put: dynamic right-shift (m6), then clamp to [0, pixel_max]
    psrad                m4, m6
    psrad                m5, m6
    packssdw             m4, m5
    pxor                 m7, m7
    pmaxsw               m4, m7
    pminsw               m4, pxmaxm
    mova             [dstq], m4
    add                dstq, dsm
%else
; prep: fixed >> 6 into the intermediate (tmp) buffer
    psrad                m4, 6
    psrad                m5, 6
    packssdw             m4, m5
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
; next output row; when the height is exhausted, move to the next column group
    dec                  hd
    jz .hloop_prep
%if ARCH_X86_64
; Advance my by dy and slide the 8-row window:
; - no integer step (bits above 0x3ff clear): rerun .vloop on the same rows
; - bit 0x400 set: one new source row, merged into the interleaved pairs
;   via the unpckw shuffle below
; - bit 0x400 clear: two new rows, re-filtered in .skip_line
; (larger steps presumably cannot occur on this path -- TODO confirm)
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .vloop
    test                myd, 0x400
    mov         [stk+0x140], myd
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
    jz .skip_line
; one-row advance: gather and horizontally filter a single new row inline,
; using the per-pixel offsets and the filters spilled at stk+0x10..0x80
    mova                m14, [base+unpckw]
    movu                 m8, [srcq+r10*2]
    movu                 m9, [srcq+r11*2]
    movu                m10, [srcq+r13*2]
    movu                m11, [srcq+ rX*2]
    movu                 m4, [srcq+ r4*2]
    movu                 m5, [srcq+ r6*2]
    movu                 m6, [srcq+ r7*2]
    movu                 m7, [srcq+ r9*2]
    add                srcq, ssq
    mov                 myd, [stk+0x140]
    mov                 dyd, dym
    pshufd              m15, m14, q1032
; re-split the existing row pairs so a single new row can be shifted in
    pshufb               m0, m14                ; 0a 1a
    pshufb               m1, m14                ; 0b 1b
    pshufb               m2, m15                ; 3a 2a
    pshufb               m3, m15                ; 3b 2b
    pmaddwd              m8, [stk+0x50]
    pmaddwd              m9, [stk+0x60]
    pmaddwd             m10, [stk+0x70]
    pmaddwd             m11, [stk+0x80]
    pmaddwd              m4, [stk+0x10]
    pmaddwd              m5, [stk+0x20]
    pmaddwd              m6, [stk+0x30]
    pmaddwd              m7, [stk+0x40]
    phaddd               m8, m9
    phaddd              m10, m11
    mova                m11, hround
    phaddd               m4, m5
    phaddd               m6, m7
    phaddd               m8, m10
    phaddd               m4, m6
    paddd                m4, m11
    paddd                m8, m11
    psrad                m4, m12
    psrad                m8, m12
    packssdw             m4, m8                 ; new row 8
; rebuild the 12/34/56/78 row pairs from the old 01..67 pairs + the new row
    pshufb               m5, [stk+0x90], m14    ; 4a 5a
    pshufb               m6, [stk+0xa0], m14    ; 4b 5b
    pshufb               m7, [stk+0xb0], m15    ; 7a 6a
    pshufb               m8, [stk+0xc0], m15    ; 7b 6b
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    punpckhwd            m5, m7 ; 56a
    punpckhwd            m6, m8 ; 56b
    punpcklwd            m7, m4 ; 78a
    punpckhqdq           m4, m4
    punpcklwd            m8, m4 ; 78b
    mova         [stk+0x90], m5
    mova         [stk+0xa0], m6
    mova         [stk+0xb0], m7
    mova         [stk+0xc0], m8
    jmp .vloop
.skip_line:
; two-row advance: horizontally filter two fresh rows and rotate the stored
; row pairs down by one pair
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
    mov                 myd, [stk+0x140]
    mov                 dyd, dym
    mova                 m0, m2         ; 01a
    mova                 m1, m3         ; 01b
    mova                 m2, [stk+0x90] ; 23a
    mova                 m3, [stk+0xa0] ; 23b
    mova                 m5, [stk+0xb0] ; 45a
    mova                 m6, [stk+0xc0] ; 45b
    punpcklwd            m7, m4, m8     ; 67a
    punpckhwd            m4, m8         ; 67b
    mova         [stk+0x90], m5
    mova         [stk+0xa0], m6
    mova         [stk+0xb0], m7
    mova         [stk+0xc0], m4
%else
; 32-bit row-window advance: same one-row / two-row logic as the 64-bit
; branch above, with the row pairs held in stack slots instead of registers.
    mov                 r0m, r0
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
    test                myd, ~0x3ff
    mov                 mym, myd
    jnz .next_line
; no integer step: reload the 01/23 pairs and rerun the vertical filter
    mova                 m0, [stk+0x20]
    mova                 m1, [stk+0x30]
    mova                 m2, [stk+0x40]
    mova                 m3, [stk+0x50]
    jmp .vloop
.next_line:
    test                myd, 0x400
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    jz .skip_line
; one-row advance: filter one new row (row 8) into stk+0xe0, then shift it
; into the interleaved pairs
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    mova                 m7, [base+unpckw]
    pshufd               m4, m7, q1032
    pshufb               m0, [stk+0x20], m7 ; 0a 1a
    pshufb               m1, [stk+0x30], m7 ; 0b 1b
    pshufb               m2, [stk+0x40], m4 ; 3a 2a
    pshufb               m3, [stk+0x50], m4 ; 3b 2b
    pshufb               m5, [stk+0x60], m7 ; 4a 5a
    pshufb               m6, [stk+0x70], m7 ; 4b 5b
    pshufb               m7, [stk+0x80], m4 ; 7a 6a
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    punpckhwd            m5, m7 ; 56a
    mova         [stk+0x60], m5
    pshufb               m5, [stk+0x90], m4 ; 7b 6b
    punpcklwd            m7, [stk+0xe0] ; 78a
    punpckhwd            m6, m5 ; 56b
    mova         [stk+0x70], m6
    movq                 m6, [stk+0xe8]
    mova         [stk+0x80], m7
    punpcklwd            m5, m6
    mov                 myd, mym
    mova         [stk+0x90], m5
    jmp .vloop
.skip_line:
; two-row advance: filter rows 8 and 9, rotate all stored pairs down by one
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    MC_8TAP_SCALED_H 0xa0, 0       ; 9
    mova                 m7, [stk+0xe0]
    mova                 m2, [stk+0x60] ; 23a
    mova                 m3, [stk+0x70] ; 23b
    mova                 m4, [stk+0x80] ; 45a
    mova                 m5, [stk+0x90] ; 45b
    punpcklwd            m6, m7, m0     ; 67a
    punpckhwd            m7, m0         ; 67b
    mova                 m0, [stk+0x40] ; 01a
    mova                 m1, [stk+0x50] ; 01b
    mov                 myd, mym
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
%endif
    jmp .vloop
INIT_XMM ssse3
; dy == 1 entry point: vertical step is exactly one source row per output
; row. Dispatch on block width through the per-width jump table (%1 expands
; to put/prep).
.dy1:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%if isput
; dy == 1, w == 2 (put only): with a one-row vertical step, the 8-tap y
; filter is constant for the whole block, so it is expanded once and the
; 8-row window simply slides by two rows per loop iteration (two output
; rows per iteration).
.dy1_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
  %define m11 [esp+0x00]
  %define m12 [esp+0x10]
  %define m13 [esp+0x20]
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
    mov                  r1, r1m
 %endif
    pxor                 m9, m9
    punpckldq            m9, m8
    paddd               m14, m9 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
; per-pixel subpel filter index for the two columns
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    pshufd              m15, m15, q0321
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_q]
    mova                 m6, [base+spel_s_shuf2]
; +2 loads 4 taps (bytes 2..5 of the 8-byte filter): narrow columns use
; 4-tap horizontal filtering
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m2, m2
    pcmpeqd              m8, m2     ; mask of zero-index (identity) lanes
    psrld               m14, 10    ; integer x -> source offsets
    paddd               m14, m14
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova              [stk], m14
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m15 m6
 %endif
; load the first 4 source rows
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpckldq           m15, m7
 %if ARCH_X86_64
    pshufb              m14, m5
    paddb               m14, m6
; substitute pd_0x4000 (identity filter) in zero-index lanes
    pand                 m9, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m9
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
; select the y filter once (identity 64<<24 when the y fraction is zero)
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
 %else
    pand                 m7, m5, [base+pd_0x4000]
    pandn                m5, m15
    por                  m5, m7
  %define m15 m5
    mov                 myd, mym
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                 myd, 6
    lea                  r5, [r5+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    mov          [stk+0x20], r3
    mov                  r3, r3m
 %endif
    punpcklbw           m15, m15
    psraw               m15, 8
; horizontally filter the 7 loaded rows
    REPX    {pshufb x, m14}, m0, m1, m2, m3
    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
 %if ARCH_X86_64
    REPX    {pshufb x, m14}, m4, m5, m6
    REPX   {pmaddwd x, m15}, m4, m5, m6
    phaddd               m0, m1
    phaddd               m2, m3
    phaddd               m4, m5
    phaddd               m6, m6
    REPX     {paddd x, m11}, m0, m2, m4, m6
    REPX     {psrad x, m12}, m0, m2, m4, m6
    packssdw             m0, m2 ; 0 1 2 3
    packssdw             m4, m6 ; 4 5 6
    SWAP                 m1, m4
    movq                m10, r4
 %else
    mova         [stk+0x10], m15
    phaddd               m0, m1
    phaddd               m2, m3
    movu                 m1, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    REPX    {pshufb x, m14}, m1, m7, m6
    REPX   {pmaddwd x, m15}, m1, m7, m6
  %define m14 [stk+0x00]
  %define m15 [stk+0x10]
    phaddd               m1, m7
    phaddd               m6, m6
    REPX     {paddd x, m11}, m0, m2, m1, m6
    REPX     {psrad x, m12}, m0, m2, m1, m6
    packssdw             m0, m2
    packssdw             m1, m6
  %define m8  m6
  %define m9  m4
  %define m10 m5
    movd                m10, r4
    movd                 m9, [stk+0x20]
    punpckldq           m10, m9
 %endif
; expand the constant y filter into four 2-tap word pairs
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
 %if ARCH_X86_32
    mova         [stk+0x50], m7
    mova         [stk+0x60], m8
    mova         [stk+0x70], m9
    mova         [stk+0x80], m10
  %define m7  [stk+0x50]
  %define m8  [stk+0x60]
  %define m9  [stk+0x70]
  %define m10 [stk+0x80]
 %endif
; build the sliding row-pair window (two columns per word pair)
    palignr              m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd            m3, m0, m2    ; 01 12
    punpckhwd            m0, m2        ; 23 34
    pshufd               m4, m1, q2121 ; 5 6 5 6
    punpcklwd            m2, m1, m4    ; 45 56
 %if ARCH_X86_32
    mov                  r0, r0m
 %endif
.dy1_w2_loop:
; two output rows per iteration: filter two new source rows horizontally,
; shift them into the 01..78 pair window, and run the constant y filter
    movu                 m1, [srcq+ssq*0]
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m3, m7
    mova                 m3, m0
    pmaddwd              m0, m8
    pshufb               m1, m14
    pshufb               m6, m14
    pmaddwd              m1, m15
    pmaddwd              m6, m15
    phaddd               m1, m6
    paddd                m1, m11
    psrad                m1, m12
    packssdw             m1, m1
    paddd                m5, m0
    mova                 m0, m2
    pmaddwd              m2, m9
    paddd                m5, m2
    palignr              m2, m1, m4, 12
    punpcklwd            m2, m1        ; 67 78
    pmaddwd              m4, m2, m10
    paddd                m5, m13
    paddd                m5, m4
    pxor                 m6, m6
    mova                 m4, m1
; the vertical shift amount is kept in the high dword of m12
    pshufd               m1, m12, q1032
    psrad                m5, m1
    packssdw             m5, m5
; clamp to [0, pixel_max] and store one 2-pixel row per dword
    pmaxsw               m5, m6
    pminsw               m5, pxmaxm
    movd       [dstq+dsq*0], m5
    pshuflw              m5, m5, q1032
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
; dy == 1, w == 4: 4-tap horizontal filter on four columns. The columns are
; gathered as two load groups (base srcq and srcq+r4, with shuffle patterns
; m12/m14), and the constant y filter is expanded once since the vertical
; step is exactly one row.
.dy1_w4:
%if ARCH_X86_64
    mov                 myd, mym
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %if isput
    mova         [rsp+0x30], m13
  %define vrnd_mem [rsp+0x30]
  %define stk rsp+0x40
 %else
  %define vrnd_mem [base+pd_m524256]
  %define stk rsp+0x30
 %endif
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m9  [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq r3
 %endif
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
%endif
; m8/m14 carry dx/mx state from the common setup before the dy1 dispatch
; -- TODO confirm against the macro prologue
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
; per-pixel subpel filter indices for the four columns
    pand                 m0, m14, m10
    psrld                m0, 6
    paddd               m15, m0
    pshufd               m7, m15, q1032
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                r6d, m15
    movd               r13d, m7
    mova                m10, [base+bdct_lb_q+ 0]
    mova                m11, [base+bdct_lb_q+16]
; +2 => middle 4 taps of each 8-tap filter (4-tap horizontal filtering)
    movd                m13, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+ r6*8+2]
    movd                m15, [base+subpel_filters+r11*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
%else
    movd                 r0, m15
    movd                 r4, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                 rX, m15
    movd                 r5, m7
    mova                 m5, [base+bdct_lb_q+ 0]
    mova                 m6, [base+bdct_lb_q+16]
    movd                 m1, [base+subpel_filters+r0*8+2]
    movd                 m2, [base+subpel_filters+rX*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m7, [base+subpel_filters+r5*8+2]
    SWAP                 m4, m7
 %if isprep
    mov                  r3, r3m
 %endif
 %define m10 m5
 %define m11 m6
 %define m12 m1
 %define m13 m1
%endif
    psrld               m14, 10    ; integer x -> source offsets
    paddd               m14, m14
; pack the four 4-tap filters into one register
    punpckldq           m13, m2
    punpckldq           m15, m4
    punpcklqdq          m13, m15
    pxor                 m2, m2
    pcmpeqd              m0, m2    ; zero-index (identity filter) lane mask
%if ARCH_X86_64
    pand                 m9, m0
%else
    pand                 m2, m9, m0
 %define m9 m2
    SWAP                 m7, m4
%endif
    pandn                m0, m13
%if ARCH_X86_64
    SWAP                m13, m0
%else
 %define m13 m0
%endif
    por                 m13, m9    ; identity (pd_0x4000) substituted where index == 0
; sign-extend filter bytes to words: m13 = low half, m15 = high half
    punpckhbw           m15, m13, m13
    punpcklbw           m13, m13
    psraw               m15, 8
    psraw               m13, 8
; build the two pshufb gather patterns; byte 3 of m14 presumably holds the
; byte offset of the second column group, extracted into r4 -- TODO confirm
    pshufb              m12, m14, m10
    pshufb              m14, m11
    mova                m10, [base+spel_s_shuf2]
    movd                r4d, m14
    shr                 r4d, 24
%if ARCH_X86_32
    mova         [stk+0x40], m13
    mova         [stk+0x50], m15
    pxor                 m2, m2
%endif
    pshufb               m7, m14, m2
    psubb               m14, m7    ; make the second pattern relative to srcq+r4
    paddb               m12, m10
    paddb               m14, m10
%if ARCH_X86_64
    lea                  r6, [r4+ssq*1]
    lea                 r11, [r4+ssq*2]
    lea                 r13, [r4+ss3q ]
; load and horizontally filter rows 0-6 (each row = two load groups)
    movu                 m7, [srcq+ssq*0]
    movu                 m9, [srcq+ssq*1]
    movu                 m8, [srcq+ssq*2]
    movu                m10, [srcq+ss3q ]
    movu                 m1, [srcq+r4   ]
    movu                 m3, [srcq+r6   ]
    movu                 m2, [srcq+r11  ]
    movu                 m4, [srcq+r13  ]
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m12}, m7, m9, m8, m10
    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
    REPX    {pshufb x, m14}, m1, m3, m2, m4
    REPX   {pmaddwd x, m15}, m1, m3, m2, m4
    mova                 m5, [rsp+0x10]     ; horizontal rounding constant
    movd                xm6, [rsp+0x20]     ; horizontal shift
    phaddd               m7, m1
    phaddd               m9, m3
    phaddd               m8, m2
    phaddd              m10, m4
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    REPX      {paddd x, m5}, m7, m9, m8, m10
    REPX     {psrad x, xm6}, m7, m9, m8, m10
    packssdw             m7, m9  ; 0 1
    packssdw             m8, m10 ; 2 3
    movu                 m0, [srcq+r4   ]
    movu                 m9, [srcq+r6   ]
    movu                m10, [srcq+r11  ]
    add                srcq, ss3q
    REPX    {pshufb x, m12}, m1, m2, m3
    REPX   {pmaddwd x, m13}, m1, m2, m3
    REPX    {pshufb x, m14}, m0, m9, m10
    REPX   {pmaddwd x, m15}, m0, m9, m10
    phaddd               m1, m0
    phaddd               m2, m9
    phaddd               m3, m10
; select the constant y filter (identity 64<<24 when the fraction is zero)
    shr                 myd, 6
    mov                r13d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz             r13q, [base+subpel_filters+myq*8]
    REPX      {paddd x, m5}, m1, m2, m3
    REPX     {psrad x, xm6}, m1, m2, m3
    packssdw             m1, m2 ; 4 5
    packssdw             m3, m3 ; 6 6
    SWAP                 m9, m1
; interleave adjacent rows into word pairs for the vertical pmaddwd pass
    shufps               m4, m7, m8, q1032  ; 1 2
    shufps               m5, m8, m9, q1032  ; 3 4
    shufps               m6, m9, m3, q1032  ; 5 6
    punpcklwd            m0, m7, m4 ; 01
    punpckhwd            m7, m4     ; 12
    punpcklwd            m1, m8, m5 ; 23
    punpckhwd            m8, m5     ; 34
    punpcklwd            m2, m9, m6 ; 45
    punpckhwd            m9, m6     ; 56
    movq                m10, r13
    mova         [stk+0x00], m1
    mova         [stk+0x10], m8
    mova         [stk+0x20], m2
    mova         [stk+0x30], m9
    mova         [stk+0x40], m3
 %define hrnd_mem [rsp+0x10]
 %define hsh_mem  [rsp+0x20]
 %define vsh_mem  [rsp+0x28]
 %if isput
  %define vrnd_mem [rsp+0x30]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
%else
; 32-bit: rows are filtered two at a time by MC_4TAP_SCALED_H, with the
; second-column-group pointer kept in r4
    mova         [stk+0x20], m12
    mova         [stk+0x30], m14
    add                  r4, srcq
    MC_4TAP_SCALED_H   0x60 ; 0 1
    MC_4TAP_SCALED_H   0x70 ; 2 3
    MC_4TAP_SCALED_H   0x80 ; 4 5
    movu                 m7, [srcq]
    movu                 m2, [r4]
    add                srcq, ssq
    add                  r4, ssq
    mov          [stk+0xb0], r4
    pshufb               m7, m12
    pshufb               m2, m14
    pmaddwd              m7, m13
    pmaddwd              m2, m15
    phaddd               m7, m2
    paddd                m7, [esp+0x00]
    psrad                m7, [esp+0x10]
    packssdw             m7, m7 ; 6 6
    mova                 m4, [stk+0x60]
    mova                 m5, [stk+0x70]
    mova                 m6, [stk+0x80]
; select the constant y filter (two 4-byte halves on 32-bit)
    mov                 myd, mym
    mov                  rX, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  rX, [rX+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+rX*8+0]
    cmovnz               r5, [base+subpel_filters+rX*8+4]
    mov                  r3, r3m
    shufps               m1, m4, m5, q1032 ; 1 2
    shufps               m2, m5, m6, q1032 ; 3 4
    shufps               m3, m6, m7, q1032 ; 5 6
    mova         [stk+0xa0], m7
    punpcklwd            m0, m4, m1         ; 01
    punpckhwd            m4, m1             ; 12
    punpcklwd            m1, m5, m2         ; 23
    punpckhwd            m5, m2             ; 34
    punpcklwd            m2, m6, m3         ; 45
    punpckhwd            m6, m3             ; 56
    movd                 m7, r4
    movd                 m3, r5
    mov                  r0, r0m
 %if isput
    mov                  r1, r1m
 %endif
    mov                  r4, [stk+0xb0]
    mova         [stk+0xc0], m4 ; 12
    mova         [stk+0x60], m1 ; 23
    mova         [stk+0x70], m2 ; 45
    mova         [stk+0x80], m5 ; 34
    mova         [stk+0x90], m6 ; 56
 %define m12 [stk+0x20]
 %define m14 [stk+0x30]
 %define m13 [stk+0x40]
 %define m15 [stk+0x50]
 %define hrnd_mem [esp+0x00]
 %define hsh_mem  [esp+0x10]
 %define vsh_mem  [esp+0x18]
 %if isput
  %define vrnd_mem [esp+0x20]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
 %define m10 m7
    punpckldq           m10, m3
%endif
; expand the y filter into four 2-tap word pairs (constant for the block)
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m3, m10, q0000
    pshufd               m4, m10, q1111
    pshufd               m5, m10, q2222
    pshufd              m10, m10, q3333
%if ARCH_X86_32
 %xdefine m8  m3
 %xdefine m9  m6
 %xdefine m11 m5
 %xdefine m6  m4
    mova         [stk+0x100], m3
    mova         [stk+0x110], m4
    mova         [stk+0x120], m5
    mova         [stk+0x130], m10
 %define m3  [stk+0x100]
 %define m4  [stk+0x110]
 %define m5  [stk+0x120]
 %define m10 [stk+0x130]
    mova                 m7, [stk+0xc0]
    mova                 m8, [stk+0x80]
%endif
; Vertical loop for w4, dy=1: each iteration produces 2 output rows.
; m0/m1/m2 (+ m7/m8/m9) hold the rolling interleaved row pairs; m3/m4/m5/m10
; are the 4 vertical filter coefficient pairs set up above.
.dy1_w4_loop:
    movu                m11, [srcq+ssq*0]
    movu                 m6, [srcq+ssq*1]
    pmaddwd              m0, m3
    pmaddwd              m7, m3
    pmaddwd              m1, m4
    pmaddwd              m8, m4
    pmaddwd              m2, m5
    pmaddwd              m9, m5
    paddd                m1, m0
    paddd                m8, m7
%if ARCH_X86_64
    movu                 m0, [srcq+r4]
    movu                 m7, [srcq+r6]
%else
    ; x86-32 keeps the second-column pointer in r4 and advances it manually
    movu                 m0, [r4+ssq*0]
    movu                 m7, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
%endif
    lea                srcq, [srcq+ssq*2]
    paddd                m1, m2
    paddd                m8, m9
    ; horizontal 4-tap filter of the two new rows (shuffle, madd, pairwise add)
    pshufb              m11, m12
    pshufb               m6, m12
    pmaddwd             m11, m13
    pmaddwd              m6, m13
    pshufb               m0, m14
    pshufb               m7, m14
    pmaddwd              m0, m15
    pmaddwd              m7, m15
    phaddd              m11, m0
    phaddd               m6, m7
    paddd               m11, hrnd_mem
    paddd                m6, hrnd_mem
    psrad               m11, hsh_mem
    psrad                m6, hsh_mem
    packssdw            m11, m6                     ; 7 8
%if ARCH_X86_64
    shufps               m9, [stk+0x40], m11, q1032 ; 6 7
    mova                 m0, [stk+0x00]
    mova         [stk+0x40], m11
%else
    shufps               m9, [stk+0xa0], m11, q1032 ; 6 7
    mova                 m0, [stk+0x60]
    mova         [stk+0xa0], m11
%endif
    punpcklwd            m2, m9, m11 ; 67
    punpckhwd            m9, m11     ; 78
    pmaddwd              m6, m2, m10
    pmaddwd              m7, m9, m10
%if isput
    movd                m11, vsh_mem ; variable vertical shift for put
%endif
    paddd                m1, vrnd_mem
    paddd                m8, vrnd_mem
    paddd                m1, m6
    paddd                m8, m7
%if ARCH_X86_64
    mova                 m7, [stk+0x10]
%else
    mova                 m7, [stk+0x80]
%endif
%if isput
    psrad                m1, m11
    psrad                m8, m11
%else
    psrad                m1, 6 ; prep always shifts by 6
    psrad                m8, 6
%endif
    packssdw             m1, m8
%if ARCH_X86_64
    mova                 m8, [stk+0x30]
%else
    mova                 m8, [stk+0x90]
%endif
%if isput
    ; clamp to [0, pixel_max] and store two 4-pixel rows
    pxor                 m6, m6
    pmaxsw               m1, m6
    pminsw               m1, pxmaxm
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], m1
    add                tmpq, 16
%endif
    ; rotate the row-pair history for the next iteration
%if ARCH_X86_64
    mova                 m1, [stk+0x20]
    mova         [stk+0x10], m8
    mova         [stk+0x00], m1
    mova         [stk+0x20], m2
    mova         [stk+0x30], m9
%else
    mova                 m1, [stk+0x70]
    mova         [stk+0x80], m8
    mova         [stk+0x60], m1
    mova         [stk+0x70], m2
    mova         [stk+0x90], m9
%endif
    sub                  hd, 2
    jg .dy1_w4_loop
    MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
; For w >= 8 the same column-oriented code is used; [stk+0xf0] holds the
; number of 8-pixel column passes (w/8) and tmp_stridem the prep output
; stride in bytes (w*2), consumed by .dy1_w_start / .dy1_hloop_prep.
.dy1_w8:
    mov    dword [stk+0xf0], 1
    movifprep   tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov    dword [stk+0xf0], 2
    movifprep   tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov    dword [stk+0xf0], 4
    movifprep   tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov    dword [stk+0xf0], 8
    movifprep   tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov    dword [stk+0xf0], 16
    movifprep   tmp_stridem, 256
; Shared setup for all w >= 8, dy=1 paths: selects the vertical 8-tap filter
; from my>>6, broadcasts its coefficient pairs, and saves per-pass state
; (dx*4 step, mx accumulator, srcq, dst/tmp pointer) for .dy1_hloop_prep.
.dy1_w_start:
    mov                 myd, mym
%if ARCH_X86_64
 %ifidn %1, put
    movifnidn           dsm, dsq
 %endif
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %define hround m11
 %if isput
    mova         [rsp+0x30], m13
 %else
    mova                m13, [base+pd_m524256] ; prep vertical rounding
 %endif
    shr                 t0d, 16
    shr                 myd, 6
    mov                 r4d, 64 << 24 ; identity filter if my>>6 == 0
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define hround [esp+0x00]
 %define m12    [esp+0x10]
 %define m10    [base+pd_0x3ff]
 %define m8  m0
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    sub                srcq, 6 ; center the 8-tap horizontal window
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    SWAP                 m3, m5
%endif
    ; sign-extend s8 coefficients to s16, then broadcast the 4 tap pairs
    punpcklbw            m3, m3
    psraw                m3, 8
    mova        [stk+0x100], m7
    mova        [stk+0x120], m15
    mov         [stk+0x0f8], srcq
    mov         [stk+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
%if ARCH_X86_64
    mova        [stk+0x140], m0
    mova        [stk+0x150], m1
    mova        [stk+0x160], m2
    mova        [stk+0x170], m3
 %if UNIX64
    mov                  hm, hd
 %endif
%else
    mova        [stk+0x180], m0
    mova        [stk+0x190], m1
    mova        [stk+0x1a0], m2
    mova        [stk+0x1b0], m3
    SWAP                 m5, m3
    mov                  r5, hm
    mov         [stk+0x134], r5
%endif
    jmp .dy1_hloop
; Advance to the next 8-pixel column pass: decrement the pass counter,
; bump the output pointer by 16 bytes (8 pixels), restore h, srcq and the
; mx accumulator, and step mx by dx*4 before re-entering .dy1_hloop.
.dy1_hloop_prep:
    dec   dword [stk+0x0f0] ; remaining column passes (w/8)
    jz .ret
%if ARCH_X86_64
    add   qword [stk+0x130], 16
    mov                  hd, hm
%else
    add   dword [stk+0x130], 16
    mov                  r5, [stk+0x134]
    mov                  r0, [stk+0x130]
%endif
    mova                 m7, [stk+0x100] ; dx*4
    mova                m14, [stk+0x110] ; mx+dx*[0-3] of previous pass
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
    mova                m11, [rsp+0x10]
%endif
    mova                m15, [stk+0x120]
    mov                srcq, [stk+0x0f8]
%if ARCH_X86_64
    mov                 r0q, [stk+0x130] ; dstq / tmpq
%else
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7 ; advance mx to this pass's first 4 columns
; Per-column-pass setup: derive each of the 8 output columns' source offset
; (mx>>10) and horizontal 8-tap filter ((mx>>6)&0x3f), filter the first
; 8 rows horizontally, and interleave them into adjacent-row word pairs for
; the vertical loop. pq_0x40000000 masks select a wide (split) filter
; variant when the subpel index is 0 (pcmpeqd results in m6/m5).
.dy1_hloop:
%if ARCH_X86_64
    mova                 m9, [base+pq_0x40000000]
%else
 %define m9 [base+pq_0x40000000]
%endif
    pxor                 m1, m1
    psrld                m2, m14, 10 ; source pixel offsets for columns 0-3
    mova              [stk], m2
    pand                 m6, m14, m10 ; subpel filter index bits
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m1 ; mask: subpel index == 0
    pshufd               m2, m5, q1032
%if ARCH_X86_64
    ; gather the 4 per-column horizontal filters for columns 0-3
    movd                r4d, m5
    movd                r6d, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pxor                 m2, m2
    pcmpeqd              m5, m2 ; mask: subpel index == 0 (columns 4-7)
    mova        [stk+0x110], m14
    pshufd               m4, m15, q1032
%if ARCH_X86_64
    ; gather the 4 per-column horizontal filters for columns 4-7
    movd               r10d, m15
    movd               r11d, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    ; extract the 8 per-column source offsets into GPRs r4-r13/rX
    psrld               m14, 10
    movq                r11, m14
    punpckhqdq          m14, m14
    movq                 rX, m14
    mov                r10d, r11d
    shr                 r11, 32
    mov                r13d, rXd
    shr                  rX, 32
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
    ; blend real filters with the 0x40000000 identity pattern where the
    ; subpel index was zero, then sign-extend s8 coefficients to s16
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd              m14, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m7, m9, m4
    pand                 m8, m9, m6
    pand                m15, m9, m14
    pand                 m9, m9, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn               m14, m2
    pandn                m5, m3
    por                  m7, m4
    por                  m8, m6
    por                 m15, m14
    por                  m9, m5
    punpcklbw            m0, m7, m7
    punpckhbw            m7, m7
    punpcklbw            m1, m8, m8
    punpckhbw            m8, m8
    psraw                m0, 8
    psraw                m7, 8
    psraw                m1, 8
    psraw                m8, 8
    punpcklbw            m2, m15, m15
    punpckhbw           m15, m15
    punpcklbw            m3, m9, m9
    punpckhbw            m9, m9
    psraw                m2, 8
    psraw               m15, 8
    psraw                m3, 8
    psraw                m9, 8
    ; per-column filter coefficients, 2 columns per slot
    mova         [stk+0x10], m0
    mova         [stk+0x20], m7
    mova         [stk+0x30], m1
    mova         [stk+0x40], m8
    mova         [stk+0x50], m2
    mova         [stk+0x60], m15
    mova         [stk+0x70], m3
    mova         [stk+0x80], m9
    ; horizontally filter the first 8 source rows
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova         [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova         [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova         [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova         [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova         [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    ; interleave rows into adjacent-row word pairs (a = low 4 cols, b = high)
    mova                 m5, [stk+0xd0]
    mova                 m1, [stk+0x90]
    mova                 m2, [stk+0xa0]
    mova                 m3, [stk+0xb0]
    mova                 m9, [stk+0xc0]
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m9 ; 23a
    punpckhwd            m3, m9     ; 23b
    mova                m10, [stk+0x140] ; vertical filter coefficient pairs
    mova                m11, [stk+0x150]
    mova                m14, [stk+0x160]
    mova                m15, [stk+0x170]
    mova         [stk+0x90], m4
    mova         [stk+0xa0], m5
    mova         [stk+0xb0], m6
    mova         [stk+0xc0], m7
 %define hround [rsp+0x10]
 %define shift  [rsp+0x20]
 %if isput
  %define vround [rsp+0x30]
 %else
  %define vround [base+pd_m524256]
 %endif
; Vertical 8-tap filtering loop head. Note .dy1_vloop is defined twice: once
; here in the x86-64 branch and once below in the x86-32 %else branch — only
; one copy is assembled per build, so the duplicate local label is legal.
.dy1_vloop:
    pmaddwd              m4, m0, m10 ; rows 0-1 (a/b halves)
    pmaddwd              m5, m1, m10
    pmaddwd              m6, m2, m11 ; rows 2-3
    pmaddwd              m7, m3, m11
    paddd                m4, m13 ; vertical rounding
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
    pmaddwd              m6, [stk+0x90], m14 ; rows 4-5
    pmaddwd              m7, [stk+0xa0], m14
    pmaddwd              m8, [stk+0xb0], m15 ; rows 6-7
    pmaddwd              m9, [stk+0xc0], m15
    paddd                m4, m6
    paddd                m5, m7
 %if isput
    pshufd               m6, m12, q1032 ; vertical shift amount
 %endif
    paddd                m4, m8
    paddd                m5, m9
; x86-32: per-column filter setup + first 8 rows, mirroring the 64-bit code
; in the %if branch above (the matching %if opens at .dy1_hloop).
%else
    movd                 r0, m15
    movd                 rX, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [stk+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10 ; source pixel offsets for columns 4-7
    mova           [stk+16], m14
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    ; blend real filters with the identity pattern for zero subpel indices
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m9, m4
    pand                 m1, m9, m6
    pand                 m2, m9, m7
    pand                 m3, m9, m5
    pandn                m4, [stk+0x20]
    pandn                m6, [stk+0x30]
    pandn                m7, [stk+0x40]
    pandn                m5, [stk+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    ; sign-extend s8 coefficients to s16
    punpcklbw            m4, m0, m0
    punpckhbw            m0, m0
    punpcklbw            m5, m1, m1
    punpckhbw            m1, m1
    psraw                m4, 8
    psraw                m0, 8
    psraw                m5, 8
    psraw                m1, 8
    punpcklbw            m6, m2, m2
    punpckhbw            m2, m2
    punpcklbw            m7, m3, m3
    punpckhbw            m3, m3
    psraw                m6, 8
    psraw                m2, 8
    psraw                m7, 8
    psraw                m3, 8
    mova        [stk+0x0a0], m4
    mova        [stk+0x0b0], m0
    mova        [stk+0x0c0], m5
    mova        [stk+0x0d0], m1
    mova        [stk+0x140], m6
    mova        [stk+0x150], m2
    mova        [stk+0x160], m7
    mova        [stk+0x170], m3
    ; horizontally filter the first 8 source rows
    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
    ; interleave rows into adjacent-row word pairs
    mova                 m5, [stk+0x60]
    mova                 m6, [stk+0x70]
    mova                 m7, [stk+0x80]
    mova                 m0, [stk+0x90]
    mov                  r0, r0m
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova                 m1, [stk+0x20]
    mova                 m2, [stk+0x30]
    mova                 m3, [stk+0x40]
    mova                 m4, [stk+0x50]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova                 m4, [stk+0x180] ; vertical filter coefficient pairs
    mova                 m5, [stk+0x190]
    mova                 m6, [stk+0x1a0]
    mova                 m7, [stk+0x1b0]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
; x86-32 copy of the vertical filtering loop head
.dy1_vloop:
    pmaddwd              m0, m4 ; rows 0-1 (a/b halves)
    pmaddwd              m1, m4
    pmaddwd              m2, m5 ; rows 2-3
    pmaddwd              m3, m5
    paddd                m0, m2
    paddd                m1, m3
    pmaddwd              m2, [stk+0x60], m6 ; rows 4-5
    pmaddwd              m3, [stk+0x70], m6
    pmaddwd              m4, [stk+0x80], m7 ; rows 6-7
    pmaddwd              m5, [stk+0x90], m7
 %if isput
    movd                 m6, [esp+0x18] ; vertical shift amount
 %endif
    paddd                m0, m2
    paddd                m1, m3
    paddd                m0, vrnd_mem
    paddd                m1, vrnd_mem
    paddd                m4, m0
    paddd                m5, m1
%endif
; Store one 8-pixel output row, then (unless the column pass is done) load
; and horizontally filter the next source row and rotate the row-pair
; history before looping back to .dy1_vloop.
%ifidn %1, put
    psrad                m4, m6 ; variable vertical shift for put
    psrad                m5, m6
    packssdw             m4, m5
    pxor                 m7, m7
    pmaxsw               m4, m7 ; clamp to [0, pixel_max]
    pminsw               m4, pxmaxm
    mova             [dstq], m4
    add                dstq, dsm
%else
    psrad                m4, 6 ; prep always shifts by 6
    psrad                m5, 6
    packssdw             m4, m5
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy1_hloop_prep
%if ARCH_X86_64
    ; load the next row: one unaligned load per column at its own offset,
    ; filter with the per-column coefficients saved in [stk+0x10..0x80]
    movu                 m8, [srcq+r10*2]
    movu                 m9, [srcq+r11*2]
    movu                m12, [srcq+r13*2]
    movu                m13, [srcq+ rX*2]
    movu                 m4, [srcq+ r4*2]
    movu                 m5, [srcq+ r6*2]
    movu                 m6, [srcq+ r7*2]
    movu                 m7, [srcq+ r9*2]
    add                srcq, ssq
    pmaddwd              m8, [stk+0x50]
    pmaddwd              m9, [stk+0x60]
    pmaddwd             m12, [stk+0x70]
    pmaddwd             m13, [stk+0x80]
    pmaddwd              m4, [stk+0x10]
    pmaddwd              m5, [stk+0x20]
    pmaddwd              m6, [stk+0x30]
    pmaddwd              m7, [stk+0x40]
    phaddd               m8, m9
    phaddd              m12, m13
    mova                 m9, [base+unpckw]
    mova                m13, hround
    phaddd               m4, m5
    phaddd               m6, m7
    phaddd               m8, m12
    phaddd               m4, m6
    ; shift the interleaved row pairs down by one row (01->12, 23->34, ...)
    pshufd               m5, m9, q1032
    pshufb               m0, m9             ; 0a 1a
    pshufb               m1, m9             ; 0b 1b
    pshufb               m2, m5             ; 3a 2a
    pshufb               m3, m5             ; 3b 2b
    mova                m12, shift
    paddd                m4, m13
    paddd                m8, m13
    psrad                m4, m12
    psrad                m8, m12
    packssdw             m4, m8 ; new row 8
    pshufb               m6, [stk+0x90], m9 ; 4a 5a
    pshufb               m7, [stk+0xa0], m9 ; 4b 5b
    pshufb               m8, [stk+0xb0], m5 ; 7a 6a
    pshufb              m13, [stk+0xc0], m5 ; 7b 6b
    punpckhwd            m0, m2  ; 12a
    punpckhwd            m1, m3  ; 12b
    punpcklwd            m2, m6  ; 34a
    punpcklwd            m3, m7  ; 34b
    punpckhwd            m6, m8  ; 56a
    punpckhwd            m7, m13 ; 56b
    punpcklwd            m8, m4  ; 78a
    punpckhqdq           m4, m4
    punpcklwd           m13, m4  ; 78b
    mova         [stk+0x90], m6
    mova         [stk+0xa0], m7
    mova         [stk+0xb0], m8
    mova         [stk+0xc0], m13
    mova                m13, vround
%else
    ; x86-32: filter the next row via the macro, then do the same row-pair
    ; rotation entirely through stack slots
    mov                 r0m, r0
    mov                  r3, r3m
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    mova                 m7, [base+unpckw]
    pshufd               m4, m7, q1032
    pshufb               m0, [stk+0x20], m7 ; 0a 1a
    pshufb               m1, [stk+0x30], m7 ; 0b 1b
    pshufb               m2, [stk+0x40], m4 ; 3a 2a
    pshufb               m3, [stk+0x50], m4 ; 3b 2b
    pshufb               m5, [stk+0x60], m7 ; 4a 5a
    pshufb               m6, [stk+0x70], m7 ; 4b 5b
    pshufb               m7, [stk+0x80], m4 ; 7a 6a
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    punpckhwd            m5, m7 ; 56a
    mova         [stk+0x60], m5
    pshufb               m5, [stk+0x90], m4 ; 7b 6b
    punpcklwd            m7, [stk+0xe0] ; 78a
    mova                 m4, [stk+0x180]
    punpckhwd            m6, m5 ; 56b
    mova         [stk+0x70], m6
    movq                 m6, [stk+0xe8]
    mova         [stk+0x80], m7
    mova                 m7, [stk+0x1b0]
    punpcklwd            m5, m6 ; 78b
    mova                 m6, [stk+0x1a0]
    mova         [stk+0x90], m5
    mova                 m5, [stk+0x190]
    mov                  r0, r0m
%endif
    jmp .dy1_vloop
INIT_XMM ssse3
%if ARCH_X86_64
 %define stk rsp+0x20
%endif
; dy=2 entry: jump to the width-specific handler via the per-width offset
; table (offsets are relative to base_reg).
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
; w2, dy=2 (put only — prep never has w2). dy=2 means every other source row
; contributes, so loads step by 2*ssq and each loop iteration consumes 4 new
; rows to emit 2 output rows.
%if isput
.dy2_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    mova         [rsp+0x10], m13
  %define vrnd_mem [rsp+0x10]
    movzx               t0d, t0b
    sub                srcq, 2 ; center the 4-tap horizontal window
    movd                m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
  %define m11 [esp+0x00]
  %define m12 [esp+0x10]
  %define vrnd_mem [esp+0x20]
    mov                  r1, r1m
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
 %endif
    pxor                 m9, m9
    punpckldq            m9, m8
    paddd               m14, m9 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    ; per-column horizontal subpel index = (mx >> 6) & 0x3f
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    pshufd              m15, m15, q0321
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_q]
    mova                 m6, [base+spel_s_shuf2]
    ; 4-tap filters: middle 4 coefficients of the 8-tap entries (+2 offset)
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m2, m2
    pcmpeqd              m8, m2 ; mask: subpel index == 0
    psrld               m14, 10 ; source pixel offsets
    paddd               m14, m14 ; *2 for 16-bit pixels
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova              [stk], m14
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m15 m6
 %endif
    ; load even source rows 0/2/4 (dy=2 -> stride*2 between taps)
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*2]
    movu                 m2, [srcq+ssq*4]
    punpckldq           m15, m7
 %if ARCH_X86_64
    pshufb              m14, m5
    paddb               m14, m6
    ; substitute the pd_0x4000 identity pattern where subpel index was 0
    pand                 m9, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m9
    ; odd rows 1/3/5
    movu                 m4, [srcq+ssq*1]
    movu                 m5, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    ; select vertical 8-tap filter from my>>6 (identity 64<<24 if zero)
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
 %else
    pand                 m7, m5, [base+pd_0x4000]
    pandn                m5, m15
    por                  m5, m7
  %define m15 m5
    mov                 myd, mym
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                 myd, 6
    lea                  r5, [r5+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    mov          [stk+0x20], r3
    mov                  r3, r3m
 %endif
    ; sign-extend horizontal s8 coefficients to s16, then filter the 6 rows
    punpcklbw           m15, m15
    psraw               m15, 8
    REPX    {pshufb x, m14}, m0, m1, m2
    REPX   {pmaddwd x, m15}, m0, m1, m2
 %if ARCH_X86_64
    REPX    {pshufb x, m14}, m4, m5, m6
    REPX   {pmaddwd x, m15}, m4, m5, m6
    phaddd               m0, m1
    phaddd               m1, m2
    phaddd               m4, m5
    phaddd               m5, m6
    REPX     {paddd x, m11}, m0, m1, m4, m5 ; horizontal rounding
    REPX     {psrad x, m12}, m0, m1, m4, m5 ; horizontal shift
    packssdw             m0, m1 ; 0 2 2 4
    packssdw             m4, m5 ; 1 3 3 5
    SWAP                 m2, m4
    movq                m10, r4
 %else
    mova         [stk+0x10], m15
    phaddd               m0, m1
    phaddd               m1, m2
    ; x86-32 loads the odd rows only now, after freeing registers
    movu                 m2, [srcq+ssq*1]
    movu                 m7, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    REPX    {pshufb x, m14}, m2, m7, m6
    REPX   {pmaddwd x, m15}, m2, m7, m6
  %define m14 [stk+0x00]
  %define m15 [stk+0x10]
    phaddd               m2, m7
    phaddd               m7, m6
    REPX     {paddd x, m11}, m0, m1, m2, m7
    REPX     {psrad x, m12}, m0, m1, m2, m7
    packssdw             m0, m1
    packssdw             m2, m7
  %define m8  m6
  %define m9  m4
  %define m10 m5
    movd                m10, r4
    movd                 m9, [stk+0x20]
    punpckldq           m10, m9
 %endif
    ; sign-extend vertical s8 coefficients and broadcast the 4 tap pairs
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
 %if ARCH_X86_32
    mova         [stk+0x50], m7
    mova         [stk+0x60], m8
    mova         [stk+0x70], m9
    mova         [stk+0x80], m10
  %xdefine m13 m7
  %define m7  [stk+0x50]
  %define m8  [stk+0x60]
  %define m9  [stk+0x70]
  %define m10 [stk+0x80]
 %endif
    punpcklwd            m1, m0, m2    ; 01 23
    punpckhwd            m3, m0, m2    ; 23 45
 %if ARCH_X86_32
    mov                  r4, r0m
  %define dstq r4
    mova         [stk+0x20], m3
    mova         [stk+0x30], m0
 %endif
; Loop: 4 new source rows in, 2 output pixels-rows out.
.dy2_w2_loop:
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    movu                m13, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m3, m8
    REPX    {pshufb x, m14}, m4, m5, m6, m13
    REPX   {pmaddwd x, m15}, m4, m5, m6, m13
    phaddd               m4, m5
    phaddd               m6, m13
    pmaddwd              m5, m1, m7
    paddd                m4, m11
    paddd                m6, m11
    psrad                m4, m12
    psrad                m6, m12
    packssdw             m4, m6 ; 6 7 8 9
    paddd                m5, m3
    ; slide the 4-row history window forward by 4 rows
    pshufd               m3, m4, q2200
    pshufd               m4, m4, q3311
    palignr              m3, m0, 12 ; 4 6 6 8
    palignr              m4, m2, 12 ; 5 7 7 9
    mova                 m0, m3
    mova                 m2, m4
    punpcklwd            m1, m3, m4
    punpckhwd            m3, m4
    pmaddwd              m6, m1, m9
    pmaddwd              m4, m3, m10
    paddd                m5, vrnd_mem
    paddd                m6, m4
    paddd                m5, m6
    pshufd               m4, m12, q1032 ; vertical shift amount
    pxor                 m6, m6
    psrad                m5, m4
    packssdw             m5, m5
    pmaxsw               m5, m6 ; clamp to [0, pixel_max]
    pminsw               m5, pxmaxm
    movd       [dstq+dsq*0], m5
    pshuflw              m5, m5, q1032
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
6984INIT_XMM ssse3
6985.dy2_w4:
6986%if ARCH_X86_64
6987    mov                 myd, mym
6988    mova         [rsp+0x10], m11
6989    mova         [rsp+0x20], m12
6990 %if isput
6991    mova         [rsp+0x30], m13
6992  %define vrnd_mem [rsp+0x30]
6993  %define stk rsp+0x40
6994 %else
6995  %define vrnd_mem [base+pd_m524256]
6996  %define stk rsp+0x30
6997 %endif
6998    movzx               t0d, t0b
6999    sub                srcq, 2
7000    movd                m15, t0d
7001%else
7002 %define m10 [base+pd_0x3ff]
7003 %define m9  [base+pd_0x4000]
7004 %define m8  m0
7005 %xdefine m14 m4
7006 %define m15 m3
7007 %if isprep
7008  %define ssq r3
7009 %endif
7010    movzx                r5, byte [esp+0x1f0]
7011    sub                srcq, 2
7012    movd                m15, r5
7013%endif
7014    pmaddwd              m8, [base+rescale_mul]
7015%if ARCH_X86_64
7016    mova                 m9, [base+pd_0x4000]
7017%endif
7018    pshufd              m15, m15, q0000
7019    paddd               m14, m8 ; mx+dx*[0-3]
7020    pand                 m0, m14, m10
7021    psrld                m0, 6
7022    paddd               m15, m0
7023    pshufd               m7, m15, q1032
7024%if ARCH_X86_64
7025    movd                r4d, m15
7026    movd               r11d, m7
7027    pshufd              m15, m15, q0321
7028    pshufd               m7, m7, q0321
7029    movd                r6d, m15
7030    movd               r13d, m7
7031    mova                m10, [base+bdct_lb_q+ 0]
7032    mova                m11, [base+bdct_lb_q+16]
7033    movd                m13, [base+subpel_filters+ r4*8+2]
7034    movd                 m2, [base+subpel_filters+ r6*8+2]
7035    movd                m15, [base+subpel_filters+r11*8+2]
7036    movd                 m4, [base+subpel_filters+r13*8+2]
7037%else
7038    movd                 r1, m15
7039    movd                 r4, m7
7040    pshufd              m15, m15, q0321
7041    pshufd               m7, m7, q0321
7042    movd                 r3, m15
7043    movd                 r5, m7
7044    mova                 m5, [base+bdct_lb_q+ 0]
7045    mova                 m6, [base+bdct_lb_q+16]
7046    movd                 m1, [base+subpel_filters+r1*8+2]
7047    movd                 m2, [base+subpel_filters+r3*8+2]
7048    movd                 m3, [base+subpel_filters+r4*8+2]
7049    movd                 m7, [base+subpel_filters+r5*8+2]
7050    SWAP                 m4, m7
7051    mov                  r3, r3m
7052 %if isprep
7053    lea                ss3q, [ssq*3]
7054 %endif
7055 %define m10 m5
7056 %define m11 m6
7057 %define m12 m1
7058 %define m13 m1
7059%endif
7060    psrld               m14, 10
7061    paddd               m14, m14
7062    punpckldq           m13, m2
7063    punpckldq           m15, m4
7064    punpcklqdq          m13, m15
7065    pxor                 m2, m2
7066    pcmpeqd              m0, m2
7067%if ARCH_X86_64
7068    pand                 m9, m0
7069%else
7070    pand                 m2, m9, m0
7071 %define m9 m2
7072    SWAP                 m7, m4
7073%endif
7074    pandn                m0, m13
7075%if ARCH_X86_64
7076    SWAP                m13, m0
7077%else
7078 %define m13 m0
7079%endif
7080    por                 m13, m9
7081    punpckhbw           m15, m13, m13
7082    punpcklbw           m13, m13
7083    psraw               m15, 8
7084    psraw               m13, 8
7085    pshufb              m12, m14, m10
7086    pshufb              m14, m11
7087    mova                m10, [base+spel_s_shuf2]
7088    movd                r4d, m14
7089    shr                 r4d, 24
7090%if ARCH_X86_32
7091    mova         [stk+0x40], m13
7092    mova         [stk+0x50], m15
7093    pxor                 m2, m2
7094%endif
7095    pshufb               m7, m14, m2
7096    psubb               m14, m7
7097    paddb               m12, m10
7098    paddb               m14, m10
7099%if ARCH_X86_64
7100    lea                  r6, [r4+ssq*1]
7101    lea                 r11, [r4+ssq*2]
7102    lea                 r13, [r4+ss3q ]
7103    movu                 m1, [srcq+ssq*0]
7104    movu                 m8, [srcq+ssq*2]
7105    movu                 m9, [srcq+ssq*1]
7106    movu                m10, [srcq+ss3q ]
7107    movu                 m7, [srcq+r4   ]
7108    movu                 m2, [srcq+r11  ]
7109    movu                 m3, [srcq+r6   ]
7110    movu                 m4, [srcq+r13  ]
7111    lea                srcq, [srcq+ssq*4]
7112    REPX    {pshufb x, m12}, m1, m9, m8, m10
7113    REPX   {pmaddwd x, m13}, m1, m9, m8, m10
7114    REPX    {pshufb x, m14}, m7, m3, m2, m4
7115    REPX   {pmaddwd x, m15}, m7, m3, m2, m4
7116    mova                 m5, [rsp+0x10]
7117    movd                xm6, [rsp+0x20]
7118    phaddd               m1, m7
7119    phaddd               m8, m2
7120    phaddd               m9, m3
7121    phaddd              m10, m4
7122    movu                 m2, [srcq+ssq*0]
7123    movu                 m3, [srcq+ssq*1]
7124    REPX      {paddd x, m5}, m1, m9, m8, m10
7125    REPX     {psrad x, xm6}, m1, m9, m8, m10
7126    packssdw             m1, m8     ; 0 2
7127    packssdw             m9, m10    ; 1 3
7128    movu                 m0, [srcq+r4   ]
7129    movu                 m8, [srcq+r6   ]
7130    lea                srcq, [srcq+ssq*2]
7131    REPX    {pshufb x, m12}, m2, m3
7132    REPX   {pmaddwd x, m13}, m2, m3
7133    REPX    {pshufb x, m14}, m0, m8
7134    REPX   {pmaddwd x, m15}, m0, m8
7135    phaddd               m2, m0
7136    phaddd               m3, m8
7137    shr                 myd, 6
7138    mov                 r9d, 64 << 24
7139    lea                 myd, [t1+myq]
7140    cmovnz              r9q, [base+subpel_filters+myq*8]
7141    REPX      {paddd x, m5}, m2, m3
7142    REPX     {psrad x, xm6}, m2, m3
7143    packssdw             m2, m3        ; 4 5
7144    pshufd               m3, m2, q1032 ; 5 _
7145    punpcklwd            m0, m1, m9    ; 01
7146    punpckhwd            m1, m9        ; 23
7147    punpcklwd            m2, m3        ; 45
7148    movq                m10, r9
7149 %define hrnd_mem [rsp+0x10]
7150 %define hsh_mem  [rsp+0x20]
7151 %define vsh_mem  [rsp+0x28]
7152 %if isput
7153  %define vrnd_mem [rsp+0x30]
7154 %else
7155  %define vrnd_mem [base+pd_m524256]
7156 %endif
7157%else
7158    mova         [stk+0x20], m12
7159    mova         [stk+0x30], m14
7160    add                  r4, srcq
7161    MC_4TAP_SCALED_H   0x60 ; 0 1
7162    MC_4TAP_SCALED_H   0x70 ; 2 3
7163    MC_4TAP_SCALED_H   0x80 ; 4 5
7164    mov          [stk+0xe0], r4
7165    mova                 m3, [base+spel_s_shuf8]
7166    mova                 m0, [stk+0x60]
7167    mova                 m1, [stk+0x70]
7168    mova                 m2, [stk+0x80]
7169    mov                 myd, mym
7170    mov                  rX, [esp+0x1f4]
7171    xor                  r5, r5
7172    shr                 myd, 6
7173    lea                  rX, [rX+myd]
7174    mov                  r4, 64 << 24
7175    cmovnz               r4, [base+subpel_filters+rX*8+0]
7176    cmovnz               r5, [base+subpel_filters+rX*8+4]
7177    mov                  r3, r3m
7178    pshufb               m0, m3 ; 01
7179    pshufb               m1, m3 ; 23
7180    pshufb               m2, m3 ; 45
7181    movd                 m7, r4
7182    movd                 m4, r5
7183    mov                  r5, r0m
7184 %if isput
7185    mov                  r1, r1m
7186 %endif
7187    mov                  r4, [stk+0xe0]
7188 %define dstq r5
7189 %define tmpq r5
7190 %define m12 [stk+0x20]
7191 %define m14 [stk+0x30]
7192 %define m13 [stk+0x40]
7193 %define m15 [stk+0x50]
7194 %define hrnd_mem [esp+0x00]
7195 %define hsh_mem  [esp+0x10]
7196 %define vsh_mem  [esp+0x18]
7197 %if isput
7198  %define vrnd_mem [esp+0x20]
7199 %else
7200  %define vrnd_mem [base+pd_m524256]
7201 %endif
7202 %define m10 m7
7203    punpckldq           m10, m4
7204%endif
7205    punpcklbw           m10, m10
7206    psraw               m10, 8
7207    pshufd               m3, m10, q0000
7208    pshufd               m4, m10, q1111
7209    pshufd               m5, m10, q2222
7210    pshufd              m10, m10, q3333
7211%if ARCH_X86_32
7212 %xdefine m8  m3
7213 %xdefine m9  m6
7214 %xdefine m11 m5
7215 %xdefine m6  m4
7216    mova         [stk+0x100], m3
7217    mova         [stk+0x110], m4
7218    mova         [stk+0x120], m5
7219    mova         [stk+0x130], m10
7220 %define m3  [stk+0x100]
7221 %define m4  [stk+0x110]
7222 %define m5  [stk+0x120]
7223 %define m10 [stk+0x130]
7224%endif
7225.dy2_w4_loop:
7226    pmaddwd              m8, m0, m3
7227    pmaddwd              m9, m1, m3
7228    mova                 m0, m2
7229    pmaddwd              m1, m4
7230    pmaddwd             m11, m2, m4
7231    paddd                m8, vrnd_mem
7232    paddd                m9, vrnd_mem
7233    pmaddwd              m2, m5
7234    paddd                m8, m1
7235    paddd                m9, m11
7236    paddd                m8, m2
7237    movu                 m6, [srcq+ssq*0]
7238    movu                 m1, [srcq+ssq*2]
7239%if ARCH_X86_64
7240    movu                m11, [srcq+r4 ]
7241    movu                 m2, [srcq+r11]
7242%else
7243    movu                m11, [r4+ssq*0]
7244    movu                 m2, [r4+ssq*2]
7245%endif
7246    pshufb               m6, m12
7247    pshufb               m1, m12
7248    pmaddwd              m6, m13
7249    pmaddwd              m1, m13
7250    pshufb              m11, m14
7251    pshufb               m2, m14
7252    pmaddwd             m11, m15
7253    pmaddwd              m2, m15
7254    phaddd               m6, m11
7255    phaddd               m1, m2
7256    paddd                m6, hrnd_mem
7257    paddd                m1, hrnd_mem
7258    psrad                m6, hsh_mem
7259    psrad                m1, hsh_mem
7260    movu                 m7, [srcq+ssq*1]
7261    movu                m11, [srcq+ss3q ]
7262    packssdw             m6, m1 ; 6 8
7263%if ARCH_X86_64
7264    movu                 m2, [srcq+r6 ]
7265    movu                 m1, [srcq+r13]
7266%else
7267    movu                 m2, [r4+ssq*1]
7268    movu                 m1, [r4+ss3q ]
7269%endif
7270    pshufb               m7, m12
7271    pshufb              m11, m12
7272    pmaddwd              m7, m13
7273    pmaddwd             m11, m13
7274    pshufb               m2, m14
7275    pshufb               m1, m14
7276    pmaddwd              m2, m15
7277    pmaddwd              m1, m15
7278    phaddd               m7, m2
7279    phaddd              m11, m1
7280    paddd                m7, hrnd_mem
7281    paddd               m11, hrnd_mem
7282    psrad                m7, hsh_mem
7283    psrad               m11, hsh_mem
7284    packssdw             m7, m11 ; 7 9
7285%if ARCH_X86_32
7286    lea                  r4, [r4+ssq*4]
7287%endif
7288    lea                srcq, [srcq+ssq*4]
7289    punpcklwd            m1, m6, m7 ; 67
7290    punpckhwd            m6, m7     ; 89
7291    mova                 m2, m6
7292    pmaddwd             m11, m1, m5
7293    pmaddwd              m7, m1, m10
7294    pmaddwd              m6, m10
7295    paddd                m9, m11
7296%if isput
7297    movd                m11, vsh_mem
7298%endif
7299    paddd                m8, m7
7300    paddd                m9, m6
7301%if isput
7302    psrad                m8, m11
7303    psrad                m9, m11
7304    packssdw             m8, m9
7305    pxor                 m7, m7
7306    pmaxsw               m8, m7
7307    pminsw               m8, pxmaxm
7308    movq       [dstq+dsq*0], m8
7309    movhps     [dstq+dsq*1], m8
7310    lea                dstq, [dstq+dsq*2]
7311%else
7312    psrad                m8, 6
7313    psrad                m9, 6
7314    packssdw             m8, m9
7315    mova             [tmpq], m8
7316    add                tmpq, 16
7317%endif
7318    sub                  hd, 2
7319    jg .dy2_w4_loop
7320    MC_8TAP_SCALED_RET ; why not jz .ret?
7321INIT_XMM ssse3
7322.dy2_w8:
7323    mov    dword [stk+0xf0], 1
7324    movifprep   tmp_stridem, 16
7325    jmp .dy2_w_start
7326.dy2_w16:
7327    mov    dword [stk+0xf0], 2
7328    movifprep   tmp_stridem, 32
7329    jmp .dy2_w_start
7330.dy2_w32:
7331    mov    dword [stk+0xf0], 4
7332    movifprep   tmp_stridem, 64
7333    jmp .dy2_w_start
7334.dy2_w64:
7335    mov    dword [stk+0xf0], 8
7336    movifprep   tmp_stridem, 128
7337    jmp .dy2_w_start
7338.dy2_w128:
7339    mov    dword [stk+0xf0], 16
7340    movifprep   tmp_stridem, 256
7341.dy2_w_start:
7342    mov                 myd, mym
7343%if ARCH_X86_64
7344 %ifidn %1, put
7345    movifnidn           dsm, dsq
7346 %endif
7347    mova         [rsp+0x10], m11
7348    mova         [rsp+0x20], m12
7349 %define hround m11
7350 %if isput
7351    mova         [rsp+0x30], m13
7352 %else
7353    mova                m13, [base+pd_m524256]
7354 %endif
7355    shr                 t0d, 16
7356    shr                 myd, 6
7357    mov                 r4d, 64 << 24
7358    lea                 myd, [t1+myq]
7359    cmovnz              r4q, [base+subpel_filters+myq*8]
7360    movd                m15, t0d
7361%else
7362 %define hround [esp+0x00]
7363 %define m12    [esp+0x10]
7364 %define m10    [base+pd_0x3ff]
7365 %define m8  m0
7366 %xdefine m14 m4
7367 %xdefine m15 m3
7368 %if isput
7369  %define dstq r0
7370 %else
7371  %define tmpq r0
7372  %define ssq ssm
7373 %endif
7374    mov                  r5, [esp+0x1f0]
7375    mov                  r3, [esp+0x1f4]
7376    shr                  r5, 16
7377    movd                m15, r5
7378    xor                  r5, r5
7379    shr                 myd, 6
7380    lea                  r3, [r3+myd]
7381    mov                  r4, 64 << 24
7382    cmovnz               r4, [base+subpel_filters+r3*8+0]
7383    cmovnz               r5, [base+subpel_filters+r3*8+4]
7384    mov                  r0, r0m
7385    mov                  r3, r3m
7386%endif
7387    sub                srcq, 6
7388    pslld                m7, m8, 2 ; dx*4
7389    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
7390    pshufd              m15, m15, q0000
7391    paddd               m14, m8 ; mx+dx*[0-3]
7392%if ARCH_X86_64
7393    movq                 m3, r4q
7394%else
7395    movd                 m5, r4
7396    movd                 m6, r5
7397    punpckldq            m5, m6
7398    SWAP                 m3, m5
7399%endif
7400    punpcklbw            m3, m3
7401    psraw                m3, 8
7402    mova        [stk+0x100], m7
7403    mova        [stk+0x120], m15
7404    mov         [stk+0x0f8], srcq
7405    mov         [stk+0x130], r0q ; dstq / tmpq
7406    pshufd               m0, m3, q0000
7407    pshufd               m1, m3, q1111
7408    pshufd               m2, m3, q2222
7409    pshufd               m3, m3, q3333
7410%if ARCH_X86_64
7411    mova        [stk+0x140], m0
7412    mova        [stk+0x150], m1
7413    mova        [stk+0x160], m2
7414    mova        [stk+0x170], m3
7415 %if UNIX64
7416    mov                  hm, hd
7417 %endif
7418%else
7419    mova        [stk+0x180], m0
7420    mova        [stk+0x190], m1
7421    mova        [stk+0x1a0], m2
7422    mova        [stk+0x1b0], m3
7423    SWAP                 m5, m3
7424    mov                  r5, hm
7425    mov         [stk+0x134], r5
7426%endif
7427    jmp .dy2_hloop
7428.dy2_hloop_prep:
7429    dec   dword [stk+0x0f0]
7430    jz .ret
7431%if ARCH_X86_64
7432    add   qword [stk+0x130], 16
7433    mov                  hd, hm
7434%else
7435    add   dword [stk+0x130], 16
7436    mov                  r5, [stk+0x134]
7437    mov                  r0, [stk+0x130]
7438%endif
7439    mova                 m7, [stk+0x100]
7440    mova                m14, [stk+0x110]
7441%if ARCH_X86_64
7442    mova                m10, [base+pd_0x3ff]
7443    mova                m11, [rsp+0x10]
7444%endif
7445    mova                m15, [stk+0x120]
7446    mov                srcq, [stk+0x0f8]
7447%if ARCH_X86_64
7448    mov                 r0q, [stk+0x130] ; dstq / tmpq
7449%else
7450    mov                  hm, r5
7451    mov                 r0m, r0
7452    mov                  r3, r3m
7453%endif
7454    paddd               m14, m7
7455.dy2_hloop:
7456%if ARCH_X86_64
7457    mova                 m9, [base+pq_0x40000000]
7458%else
7459 %define m9 [base+pq_0x40000000]
7460%endif
7461    pxor                 m1, m1
7462    psrld                m2, m14, 10
7463    mova              [stk], m2
7464    pand                 m6, m14, m10
7465    psrld                m6, 6
7466    paddd                m5, m15, m6
7467    pcmpeqd              m6, m1
7468    pshufd               m2, m5, q1032
7469%if ARCH_X86_64
7470    movd                r4d, m5
7471    movd                r6d, m2
7472    pshufd               m5, m5, q0321
7473    pshufd               m2, m2, q0321
7474    movd                r7d, m5
7475    movd                r9d, m2
7476    movq                 m0, [base+subpel_filters+r4*8]
7477    movq                 m1, [base+subpel_filters+r6*8]
7478    movhps               m0, [base+subpel_filters+r7*8]
7479    movhps               m1, [base+subpel_filters+r9*8]
7480%else
7481    movd                 r0, m5
7482    movd                 rX, m2
7483    pshufd               m5, m5, q0321
7484    pshufd               m2, m2, q0321
7485    movd                 r4, m5
7486    movd                 r5, m2
7487    movq                 m0, [base+subpel_filters+r0*8]
7488    movq                 m1, [base+subpel_filters+rX*8]
7489    movhps               m0, [base+subpel_filters+r4*8]
7490    movhps               m1, [base+subpel_filters+r5*8]
7491%endif
7492    paddd               m14, m7 ; mx+dx*[4-7]
7493    pand                 m5, m14, m10
7494    psrld                m5, 6
7495    paddd               m15, m5
7496    pxor                 m2, m2
7497    pcmpeqd              m5, m2
7498    mova        [stk+0x110], m14
7499    pshufd               m4, m15, q1032
7500%if ARCH_X86_64
7501    movd               r10d, m15
7502    movd               r11d, m4
7503    pshufd              m15, m15, q0321
7504    pshufd               m4, m4, q0321
7505    movd               r13d, m15
7506    movd                rXd, m4
7507    movq                 m2, [base+subpel_filters+r10*8]
7508    movq                 m3, [base+subpel_filters+r11*8]
7509    movhps               m2, [base+subpel_filters+r13*8]
7510    movhps               m3, [base+subpel_filters+ rX*8]
7511    psrld               m14, 10
7512    movq                r11, m14
7513    punpckhqdq          m14, m14
7514    movq                 rX, m14
7515    mov                r10d, r11d
7516    shr                 r11, 32
7517    mov                r13d, rXd
7518    shr                  rX, 32
7519    mov                 r4d, [stk+ 0]
7520    mov                 r6d, [stk+ 4]
7521    mov                 r7d, [stk+ 8]
7522    mov                 r9d, [stk+12]
7523    pshufd               m4, m6, q1100
7524    pshufd               m6, m6, q3322
7525    pshufd              m14, m5, q1100
7526    pshufd               m5, m5, q3322
7527    pand                 m7, m9, m4
7528    pand                 m8, m9, m6
7529    pand                m15, m9, m14
7530    pand                 m9, m9, m5
7531    pandn                m4, m0
7532    pandn                m6, m1
7533    pandn               m14, m2
7534    pandn                m5, m3
7535    por                  m7, m4
7536    por                  m8, m6
7537    por                 m15, m14
7538    por                  m9, m5
7539    punpcklbw            m0, m7, m7
7540    punpckhbw            m7, m7
7541    punpcklbw            m1, m8, m8
7542    punpckhbw            m8, m8
7543    psraw                m0, 8
7544    psraw                m7, 8
7545    psraw                m1, 8
7546    psraw                m8, 8
7547    punpcklbw            m2, m15, m15
7548    punpckhbw           m15, m15
7549    punpcklbw            m3, m9, m9
7550    punpckhbw            m9, m9
7551    psraw                m2, 8
7552    psraw               m15, 8
7553    psraw                m3, 8
7554    psraw                m9, 8
7555    mova         [stk+0x10], m0
7556    mova         [stk+0x20], m7
7557    mova         [stk+0x30], m1
7558    mova         [stk+0x40], m8
7559    mova         [stk+0x50], m2
7560    mova         [stk+0x60], m15
7561    mova         [stk+0x70], m3
7562    mova         [stk+0x80], m9
7563    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
7564    mova         [stk+0x90], m1
7565    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
7566    mova         [stk+0xa0], m2
7567    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
7568    mova         [stk+0xb0], m3
7569    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
7570    mova         [stk+0xc0], m4
7571    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
7572    mova         [stk+0xd0], m5
7573    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
7574    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
7575    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
7576    mova                 m5, [stk+0xd0]
7577    mova                 m1, [stk+0x90]
7578    mova                 m2, [stk+0xa0]
7579    mova                 m3, [stk+0xb0]
7580    mova                 m9, [stk+0xc0]
7581    punpcklwd            m4, m5, m6 ; 45a
7582    punpckhwd            m5, m6     ; 45b
7583    punpcklwd            m6, m7, m8 ; 67a
7584    punpckhwd            m7, m8     ; 67b
7585    punpcklwd            m0, m1, m2 ; 01a
7586    punpckhwd            m1, m2     ; 01b
7587    punpcklwd            m2, m3, m9 ; 23a
7588    punpckhwd            m3, m9     ; 23b
7589    mova                m10, [stk+0x140]
7590    mova                m11, [stk+0x150]
7591    mova                m14, [stk+0x160]
7592    mova                m15, [stk+0x170]
7593    mova         [stk+0x90], m4
7594    mova         [stk+0xa0], m5
7595    mova         [stk+0xb0], m6
7596    mova         [stk+0xc0], m7
7597 %define hround [rsp+0x10]
7598 %define shift  [rsp+0x20]
7599 %if isput
7600  %define vround [rsp+0x30]
7601 %else
7602  %define vround [base+pd_m524256]
7603 %endif
7604.dy2_vloop:
7605    pmaddwd              m4, m0, m10
7606    pmaddwd              m5, m1, m10
7607    pmaddwd              m6, m2, m11
7608    pmaddwd              m7, m3, m11
7609    paddd                m4, m13
7610    paddd                m5, m13
7611    paddd                m4, m6
7612    paddd                m5, m7
7613    pmaddwd              m6, [stk+0x90], m14
7614    pmaddwd              m7, [stk+0xa0], m14
7615    pmaddwd              m8, [stk+0xb0], m15
7616    pmaddwd              m9, [stk+0xc0], m15
7617    paddd                m4, m6
7618    paddd                m5, m7
7619 %if isput
7620    pshufd               m6, m12, q1032
7621 %endif
7622    paddd                m4, m8
7623    paddd                m5, m9
7624%else
7625    movd                 r0, m15
7626    movd                 rX, m4
7627    pshufd              m15, m15, q0321
7628    pshufd               m4, m4, q0321
7629    movd                 r4, m15
7630    movd                 r5, m4
7631    mova                m14, [stk+0x110]
7632    movq                 m2, [base+subpel_filters+r0*8]
7633    movq                 m3, [base+subpel_filters+rX*8]
7634    movhps               m2, [base+subpel_filters+r4*8]
7635    movhps               m3, [base+subpel_filters+r5*8]
7636    psrld               m14, 10
7637    mova           [stk+16], m14
7638    mov                  r0, [stk+ 0]
7639    mov                  rX, [stk+ 4]
7640    mov                  r4, [stk+ 8]
7641    mov                  r5, [stk+12]
7642    mova         [stk+0x20], m0
7643    mova         [stk+0x30], m1
7644    mova         [stk+0x40], m2
7645    mova         [stk+0x50], m3
7646    pshufd               m4, m6, q1100
7647    pshufd               m6, m6, q3322
7648    pshufd               m7, m5, q1100
7649    pshufd               m5, m5, q3322
7650    pand                 m0, m9, m4
7651    pand                 m1, m9, m6
7652    pand                 m2, m9, m7
7653    pand                 m3, m9, m5
7654    pandn                m4, [stk+0x20]
7655    pandn                m6, [stk+0x30]
7656    pandn                m7, [stk+0x40]
7657    pandn                m5, [stk+0x50]
7658    por                  m0, m4
7659    por                  m1, m6
7660    por                  m2, m7
7661    por                  m3, m5
7662    punpcklbw            m4, m0, m0
7663    punpckhbw            m0, m0
7664    punpcklbw            m5, m1, m1
7665    punpckhbw            m1, m1
7666    psraw                m4, 8
7667    psraw                m0, 8
7668    psraw                m5, 8
7669    psraw                m1, 8
7670    punpcklbw            m6, m2, m2
7671    punpckhbw            m2, m2
7672    punpcklbw            m7, m3, m3
7673    punpckhbw            m3, m3
7674    psraw                m6, 8
7675    psraw                m2, 8
7676    psraw                m7, 8
7677    psraw                m3, 8
7678    mova        [stk+0x0a0], m4
7679    mova        [stk+0x0b0], m0
7680    mova        [stk+0x0c0], m5
7681    mova        [stk+0x0d0], m1
7682    mova        [stk+0x140], m6
7683    mova        [stk+0x150], m2
7684    mova        [stk+0x160], m7
7685    mova        [stk+0x170], m3
7686    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
7687    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
7688    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
7689    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
7690    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
7691    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
7692    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
7693    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
7694    mova                 m5, [stk+0x60]
7695    mova                 m6, [stk+0x70]
7696    mova                 m7, [stk+0x80]
7697    mova                 m0, [stk+0x90]
7698    mov                  r0, r0m
7699    punpcklwd            m4, m5, m6      ; 45a
7700    punpckhwd            m5, m6          ; 45b
7701    punpcklwd            m6, m7, m0      ; 67a
7702    punpckhwd            m7, m0          ; 67b
7703    mova         [stk+0x60], m4
7704    mova         [stk+0x70], m5
7705    mova         [stk+0x80], m6
7706    mova         [stk+0x90], m7
7707    mova                 m1, [stk+0x20]
7708    mova                 m2, [stk+0x30]
7709    mova                 m3, [stk+0x40]
7710    mova                 m4, [stk+0x50]
7711    punpcklwd            m0, m1, m2      ; 01a
7712    punpckhwd            m1, m2          ; 01b
7713    punpcklwd            m2, m3, m4      ; 23a
7714    punpckhwd            m3, m4          ; 23b
7715    mova                 m4, [stk+0x180]
7716    mova                 m5, [stk+0x190]
7717    mova                 m6, [stk+0x1a0]
7718    mova                 m7, [stk+0x1b0]
7719    mova         [stk+0x40], m2
7720    mova         [stk+0x50], m3
7721.dy2_vloop:
7722    pmaddwd              m0, m4
7723    pmaddwd              m1, m4
7724    pmaddwd              m2, m5
7725    pmaddwd              m3, m5
7726    paddd                m0, m2
7727    paddd                m1, m3
7728    pmaddwd              m2, [stk+0x60], m6
7729    pmaddwd              m3, [stk+0x70], m6
7730    pmaddwd              m4, [stk+0x80], m7
7731    pmaddwd              m5, [stk+0x90], m7
7732 %if isput
7733    movd                 m6, [esp+0x18]
7734 %endif
7735    paddd                m0, m2
7736    paddd                m1, m3
7737    paddd                m0, vrnd_mem
7738    paddd                m1, vrnd_mem
7739    paddd                m4, m0
7740    paddd                m5, m1
7741%endif
7742%ifidn %1, put
7743    psrad                m4, m6
7744    psrad                m5, m6
7745    packssdw             m4, m5
7746    pxor                 m7, m7
7747    pmaxsw               m4, m7
7748    pminsw               m4, pxmaxm
7749    mova             [dstq], m4
7750    add                dstq, dsm
7751%else
7752    psrad                m4, 6
7753    psrad                m5, 6
7754    packssdw             m4, m5
7755    mova             [tmpq], m4
7756    add                tmpq, tmp_stridem
7757%endif
7758    dec                  hd
7759    jz .dy2_hloop_prep
7760%if ARCH_X86_64
7761    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
7762    mova         [stk+0xd0], m4
7763    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
7764    mova                 m4, [stk+0xd0]
7765    mova                 m0, m2         ; 01a
7766    mova                 m1, m3         ; 01b
7767    mova                 m2, [stk+0x90] ; 23a
7768    mova                 m3, [stk+0xa0] ; 23b
7769    mova                 m5, [stk+0xb0] ; 45a
7770    mova                 m6, [stk+0xc0] ; 45b
7771    punpcklwd            m7, m4, m8     ; 67a
7772    punpckhwd            m4, m8         ; 67b
7773    mova         [stk+0x90], m5
7774    mova         [stk+0xa0], m6
7775    mova         [stk+0xb0], m7
7776    mova         [stk+0xc0], m4
7777%else
7778    mov                 r0m, r0
7779    mov                  r3, r3m
7780    MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
7781    MC_8TAP_SCALED_H 0xa0, 0    ; 9
7782    mova                 m7, [stk+0xe0]
7783    mova                 m2, [stk+0x60] ; 23a
7784    mova                 m3, [stk+0x70] ; 23b
7785    mova                 m4, [stk+0x80] ; 45a
7786    mova                 m5, [stk+0x90] ; 45b
7787    punpcklwd            m6, m7, m0     ; 67a
7788    punpckhwd            m7, m0         ; 67b
7789    mova                 m0, [stk+0x40] ; 01a
7790    mova                 m1, [stk+0x50] ; 01b
7791    mova         [stk+0x40], m2
7792    mova         [stk+0x50], m3
7793    mova         [stk+0x60], m4
7794    mova         [stk+0x70], m5
7795    mova                 m4, [stk+0x180]
7796    mova                 m5, [stk+0x190]
7797    mova         [stk+0x80], m6
7798    mova         [stk+0x90], m7
7799    mova                 m6, [stk+0x1a0]
7800    mova                 m7, [stk+0x1b0]
7801    mov                  r0, r0m
7802%endif
7803    jmp .dy2_vloop
7804INIT_XMM ssse3
7805.ret:
7806    MC_8TAP_SCALED_RET 0
7807%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
7808 %define r0m [rstk+stack_offset+ 4]
7809 %define r1m [rstk+stack_offset+ 8]
7810 %define r2m [rstk+stack_offset+12]
7811 %define r3m [rstk+stack_offset+16]
7812%endif
7813%undef isput
7814%undef isprep
7815%endmacro
7816
; Emit the put/prep bilinear-scaled entry point. Bilinear scaling reuses
; the 8-tap scaled implementation: both filter-selector registers (t0d/t1d)
; are loaded with the bilinear filter value, packed as the same index in
; each 16-bit half, before tail-calling into %1_8tap_scaled_16bpc.
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_16bpc
    mov                 t0d, (5*15 << 16) | 5*15
    mov                 t1d, t0d ; same packed value in both selectors
    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro
7823
; Scratch-register assignment for the put_*_scaled entry points below;
; chosen per ABI so t0/t1 don't collide with argument registers.
%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif
7831
; Instantiate the put-side scaled-MC entry points: one bilinear stub plus
; one thin entry per h/v filter-type combination. Each FN-generated stub
; sets the filter selectors and falls through / jumps into the shared body
; emitted by MC_8TAP_SCALED put at the end.
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
; regular/regular is the last stub, so it gets no jump target argument and
; falls through directly into the shared implementation below.
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put
7844
; Temporary registers for the prep_8tap_scaled dispatch stubs (see the put
; variants above; prep uses a different set per ABI).
%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif

; One entry point per h/v filter-type combination; the regular/regular
; variant falls through into the shared MC_8TAP_SCALED prep body.
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep
7865
; Temporary register used by the warp_affine functions below
; (t0 holds the bitdepth-derived table index during setup).
%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif
7871
%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that
; by allocating 16 bytes more stack space so that stack offsets match up.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
; warp_affine_8x8t: warp an 8x8 block into the intermediate int16_t buffer
; (no final clamp to pixel range). Shares .main/.main2/.main3/.h with
; warp_affine_8x8 below, which is why the stack layouts must match.
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
                                                 mx, tmp, alpha, beta, \
                                                 filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
; x86-32: everything that doesn't fit in registers lives on the stack;
; dst shares a register with tmp and the strides are read from memory.
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                 filter, mx, my
%define m8   [esp+16*13]
%define m9   [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq  0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm  [esp+4*66]
%else
%define dstm r0m
%define dsm  r1m
%endif
%endif
%define base filterq-$$
    mov                 t0d, r7m               ; bitdepth_max
    LEA             filterq, $$
    shr                 t0d, 11                ; 0 for 10bpc, 1 for 12bpc
%if ARCH_X86_64
    movddup              m8, [base+warp8x8t_rnd]
%else
    movddup              m1, [base+warp8x8t_rnd]
    mov                  r1, r1m
    add                  r1, r1
    mova                 m8, m1
    mov                 r1m, r1 ; ds *= 2
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
    jmp .start
.loop:
%if ARCH_X86_64
    lea                dstq, [dstq+dsq*4]      ; advance two rows (dsq*2 bytes per int16 row)
%else
    add                dstq, dsm
    mov                dstm, dstq
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
    mov                dstq, dstm
%endif
    paddd                m1, m8                ; add rounding bias
    paddd                m2, m8
    psrad                m1, 15
    psrad                m2, 15
    packssdw             m1, m2                ; 8 intermediate coefficients
    mova       [dstq+dsq*0], m1
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
    mov                dstq, dstm
    add                dstq, dsm
%endif
    paddd                m1, m8
    paddd                m2, m8
    psrad                m1, 15
    psrad                m2, 15
    packssdw             m1, m2
    mova       [dstq+dsq*2], m1                ; second row of the pair
    dec                cntd
    jg .loop                                   ; 4 iterations x 2 rows = 8 rows
    RET
7947
; warp_affine_8x8: warp an 8x8 block directly into the pixel buffer,
; with final rounding and clamping to [0, pixel_max].
%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
                                                 mx, tmp, alpha, beta, \
                                                 filter, my, gamma, cnt
; warp8x8t jumps into .main/.main2/.main3 below, so the stack layout of the
; two functions must be identical.
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                filter, mx, my
%endif
    mov                 t0d, r7m               ; pixel_max
    LEA             filterq, $$
    shr                 t0d, 11                ; 0 for 10bpc, 1 for 12bpc
%if ARCH_X86_64
    movddup              m8, [base+warp8x8_rnd2+t0*8]
    movd                 m9, r7m ; pixel_max
    pshufb               m9, [base+pw_256]     ; broadcast pixel_max to all words
%else
    movddup              m1, [base+warp8x8_rnd2+t0*8]
    movd                 m2, r7m ; pixel_max
    pshufb               m2, [base+pw_256]
    mova                 m8, m1
    mova                 m9, m2
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_64
    lea                dstq, [dstq+dsq*2]      ; advance two rows
%else
    add                dstq, dsm
    mov                dstm, dstq
%endif
    call .main2
.start:
%if ARCH_X86_32
    mov                dstq, dstm
%endif
    psrad                m1, 16
    psrad                m2, 16
    packssdw             m1, m2
    pmaxsw               m1, m6                ; clamp low at 0 (m6 is zeroed in .main)
    pmulhrsw             m1, m8                ; final per-bitdepth rounding shift
    pminsw               m1, m9                ; clamp high at pixel_max
    mova       [dstq+dsq*0], m1
    call .main3
%if ARCH_X86_32
    mov                dstq, dstm
    add                dstq, dsm
%endif
    psrad                m1, 16
    psrad                m2, 16
    packssdw             m1, m2
    pmaxsw               m1, m6
    pmulhrsw             m1, m8
    pminsw               m1, m9
    mova       [dstq+dsq*1], m1
    dec                cntd
    jg .loop                                   ; 4 iterations x 2 rows = 8 rows
    RET
ALIGN function_align
;-----------------------------------------------------------------------------
; .main: load the affine parameters (alpha, beta, gamma, delta), then
; horizontally pre-filter the first seven source rows with .h and store
; them, interleaved in adjacent-row pairs, in a ring buffer on the stack
; for the vertical pass. Ring-buffer layout: even pairs (01/23/45) in
; slots 1-3 (low halves) and 4-6 (high halves); odd pairs (12/34/56) in
; slots 7-9 and 10-12. Also used by warp_affine_8x8t via mangled calls.
;-----------------------------------------------------------------------------
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov              deltaq, r5m
    mov                 mxd, r6m
%endif
    movd                 m0, [base+warp8x8_shift+t0*4]
    movddup              m7, [base+warp8x8_rnd1+t0*8]
    add             filterq, mc_warp_filter-$$
%if ARCH_X86_64
    movsx            alphad, word [deltaq+2*0]
    movsx             betad, word [deltaq+2*1]
    movsx            gammad, word [deltaq+2*2]
    movsx            deltad, word [deltaq+2*3]
    lea                tmpq, [ssq*3]
    add                 mxd, 512+(64<<10)
    sub                srcq, tmpq             ; src -= ss*3
    imul               tmpd, alphad, -7
    mov                 myd, r7m
    add               betad, tmpd             ; beta -= alpha*7
    imul               tmpd, gammad, -7
    add                 myd, 512+(64<<10)
    mov                cntd, 4
    add              deltad, tmpd             ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset - gprsize
%endif
    mov                 r3d, r5m              ; abcd
%if STACK_ALIGNMENT < 16
    mov                  r0, r1m              ; dst
    mov                  r1, r2m              ; ds
    mov  [esp+gprsize+4*65], r0
    mov  [esp+gprsize+4*66], r1
%endif
    movsx            alphad, word [r3+2*0]
    movsx               r2d, word [r3+2*1]
    movsx            gammad, word [r3+2*2]
    movsx               r3d, word [r3+2*3]
    imul                r5d, alphad, -7
    add                 r2d, r5d              ; beta -= alpha*7
    imul                r5d, gammad, -7
    mov  [esp+gprsize+4*60], r2d
    add                 r3d, r5d              ; delta -= gamma*7
    mov  [esp+gprsize+4*61], r3d
    mov                 r3d, r4m              ; ss
    mov                srcq, r3m
    mov                 mxd, r6m
    mov                 myd, r7m
    mov dword [esp+gprsize+4*63], 4           ; cnt
    mov  [esp+gprsize+4*62], r3
    lea                  r3, [r3*3]
    add                 mxd, 512+(64<<10)
    add                 myd, 512+(64<<10)
    sub                srcq, r3               ; src -= ss*3
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset + gprsize
%endif
%endif
    mova      [rsp+gprsize], m0               ; per-bitdepth shift, read by .h
    pxor                 m6, m6               ; zero register for the unpacks
    call .h
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 01
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 1], m1
    mova [rsp+gprsize+16* 4], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 12
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 7], m1
    mova [rsp+gprsize+16*10], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 23
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 2], m1
    mova [rsp+gprsize+16* 5], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 34
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 8], m1
    mova [rsp+gprsize+16*11], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 45
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 3], m1
    mova [rsp+gprsize+16* 6], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 56
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 9], m1
    mova [rsp+gprsize+16*12], m5
    mova                 m5, m0
.main2:
    call .h
; WARP_V: 8-tap vertical filter for one row of 8 pixels. Each column uses
; its own filter (my advances by gamma per column, by delta per row pair).
; The six arguments name the ring-buffer slots holding the 01/23/45 row
; pairs (low and high halves); each slot is rotated forward one pair so the
; next invocation sees the newest rows. Results are left as dwords in
; m1 (columns 0-3) and m2 (columns 4-7); m6 is zero again on exit.
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10               ; top bits index the filter table
    movq                 m4, [filterq+myq*8]  ; a
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m2, [filterq+tmpq*8] ; b
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m3, [filterq+myq*8]  ; c
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m1, [filterq+tmpq*8] ; d
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    punpcklwd            m4, m2
    punpcklwd            m3, m1
    punpckldq            m2, m4, m3
    punpckhdq            m4, m3
    punpcklbw            m1, m6, m2           ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    pmaddwd              m1, [rsp+gprsize+16*%1]
    punpckhbw            m3, m6, m2           ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    mova                 m2, [rsp+gprsize+16*%2]
    pmaddwd              m3, m2
    mova [rsp+gprsize+16*%1], m2              ; rotate: 23 -> 01 slot
    paddd                m1, m3
    punpcklbw            m3, m6, m4           ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    mova                 m2, [rsp+gprsize+16*%3]
    pmaddwd              m3, m2
    mova [rsp+gprsize+16*%2], m2              ; rotate: 45 -> 23 slot
    paddd                m1, m3
    punpcklwd            m3, m5, m0           ; 67
    punpckhbw            m2, m6, m4           ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd              m2, m3
    mova [rsp+gprsize+16*%3], m3              ; rotate: 67 -> 45 slot
    paddd                m1, m2
    movq                 m4, [filterq+myq*8]  ; e
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8] ; f
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m2, [filterq+myq*8]  ; g
%if ARCH_X86_64
    lea                 myd, [tmpq+deltaq]    ; my += delta
%else
    mov                 myd, [esp+gprsize+4*61]
    add                 myd, tmpd
%endif
    shr                tmpd, 10
    punpcklwd            m4, m3
    movq                 m3, [filterq+tmpq*8] ; h
    punpcklwd            m2, m3
    punpckldq            m3, m4, m2
    punpckhdq            m4, m2
    punpcklbw            m2, m6, m3           ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
    pmaddwd              m2, [rsp+gprsize+16*%4]
    punpckhbw            m6, m3               ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
    mova                 m3, [rsp+gprsize+16*%5]
    pmaddwd              m6, m3
    mova [rsp+gprsize+16*%4], m3
    pxor                 m3, m3
    paddd                m2, m6
    punpcklbw            m3, m4               ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
    mova                 m6, [rsp+gprsize+16*%6]
    pmaddwd              m3, m6
    mova [rsp+gprsize+16*%5], m6
    punpckhwd            m5, m0               ; 67 high half
    pxor                 m6, m6               ; restore the zero register
    paddd                m2, m3
    punpckhbw            m3, m6, m4           ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
    pmaddwd              m3, m5
    mova [rsp+gprsize+16*%6], m5
    mova                 m5, m0               ; keep newest row for next pair
    paddd                m2, m3
%endmacro
    WARP_V                1,  2,  3,  4,  5,  6
    ret
; .main3: same as .main2 but operating on the odd-pair ring-buffer slots.
.main3:
    call .h
    WARP_V                7,  8,  9, 10, 11, 12
    ret
ALIGN function_align
;-----------------------------------------------------------------------------
; .h: horizontal 8-tap filter for one source row of 8 output pixels.
; mx advances by alpha per column and by beta per row; the top bits of mx
; index the warp filter table. Returns the 8 filtered values packed as
; words in m0 and advances srcq one line. Stack offsets use gprsize*2 here
; because of the extra return address pushed by the nested call.
;-----------------------------------------------------------------------------
.h:
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    punpcklbw            m0, m6, m3           ; expand s8 coefs to s16 (<< 8)
    movu                 m3, [srcq-6]
    pmaddwd              m0, m3               ; 0
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq-4]
    pmaddwd              m2, m3               ; 1
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m0, m2               ; 0 1
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq-2]
    pmaddwd              m2, m3               ; 2
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m1, m6, m3
    movu                 m3, [srcq+0]
    pmaddwd              m1, m3               ; 3
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m2, m1               ; 2 3
    punpcklbw            m1, m6, m3
    movu                 m3, [srcq+2]
    pmaddwd              m1, m3               ; 4
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    phaddd               m0, m2               ; 0 1 2 3
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq+4]
    pmaddwd              m2, m3               ; 5
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m1, m2               ; 4 5
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq+6]
    pmaddwd              m2, m3               ; 6
%if ARCH_X86_64
    lea                 mxd, [tmpq+betaq]     ; mx += beta
%else
    mov                 mxd, [esp+gprsize*2+4*60]
    add                 mxd, tmpd
%endif
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m4, m6, m3
    movu                 m3, [srcq+8]
%if ARCH_X86_64
    add                srcq, ssq
%else
    add                srcq, [esp+gprsize*2+4*62]
%endif
    pmaddwd              m3, m4               ; 7
    phaddd               m2, m3               ; 6 7
    phaddd               m1, m2               ; 4 5 6 7
    paddd                m0, m7               ; intermediate rounding
    paddd                m1, m7
    psrad                m0, [rsp+gprsize*2]  ; per-bitdepth shift stored by .main
    psrad                m1, [rsp+gprsize*2]
    packssdw             m0, m1
    ret
8263
; BIDIR_FN: shared store loops for the bi-directional averaging functions
; (avg/w_avg/mask). The enclosing function provides a .main that produces
; 16 output pixels per call in m0/m1, plus a jump table whose entry is
; already loaded into wq. Widths >= 16 process one row per iteration with
; multiple .main calls; widths 4/8 process several rows per call.
%macro BIDIR_FN 0
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4:
    ; 4 rows of 4 pixels from m0/m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.ret:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8:
    ; 2 rows of 8 pixels
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jne .w8_loop
    RET
.w16_loop:
    call .main
    add                dstq, strideq
.w16:
    ; 1 row of 16 pixels
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    dec                  hd
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    ; 1 row of 32 pixels (2 calls)
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    ; 1 row of 64 pixels (4 calls)
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    ; 1 row of 128 pixels (8 calls)
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
%endmacro
8359
; Temporary register for reading stack arguments in the bidir functions
; below (avoids the argument registers in each ABI).
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
8365
; avg: plain average of two intermediate (int16) buffers, rescaled and
; clamped back to pixel range. Dispatch is via a per-width jump table.
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
    LEA                  r6, avg_ssse3_table
    tzcnt                wd, wm                ; log2(width) indexes the table
    mov                 t0d, r6m ; pixel_max
    movsxd               wq, [r6+wq*4]
    shr                 t0d, 11                ; 0 for 10bpc, 1 for 12bpc
    movddup              m2, [base+bidir_rnd+t0*8]
    movddup              m3, [base+bidir_mul+t0*8]
    movifnidn            hd, hm
    add                  wq, r6                ; absolute jump target
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 16 output pixels in m0/m1 from the next 16 entries of each
    ; intermediate buffer.
    mova                 m0, [tmp1q+16*0]
    paddsw               m0, [tmp2q+16*0]      ; saturating sum of the two preds
    mova                 m1, [tmp1q+16*1]
    paddsw               m1, [tmp2q+16*1]
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    pmaxsw               m0, m2                ; clamp low against the bias
    pmaxsw               m1, m2
    psubsw               m0, m2                ; remove the intermediate bias
    psubsw               m1, m2
    pmulhw               m0, m3                ; scale down to pixel range
    pmulhw               m1, m3
    ret
8393
; w_avg: weighted average of two intermediate buffers,
; dst = (tmp1*weight + tmp2*(16-weight) + rnd) >> sh, clamped to
; [0, pixel_max].
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
    LEA                  r6, w_avg_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r6m ; weight
    movd                 m6, r7m ; pixel_max
    movddup              m5, [base+pd_65538]
    movsxd               wq, [r6+wq*4]
    pshufb               m6, [base+pw_256]     ; broadcast pixel_max to all words
    add                  wq, r6
    lea                 r6d, [t0-16]
    shl                 t0d, 16
    sub                 t0d, r6d ; 16-weight, weight
    paddw                m5, m6                ; fold pixel_max into the rounding bias
    mov                 r6d, t0d
    shl                 t0d, 2                 ; weights scaled by 4
    test          dword r7m, 0x800             ; 12bpc? (bit 11 of pixel_max)
    cmovnz              r6d, t0d
    movifnidn            hd, hm
    movd                 m4, r6d
    pslld                m5, 7
    pxor                 m7, m7                ; zero for the low clamp
    pshufd               m4, m4, q0000         ; broadcast packed weight pair
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 16 weighted-average pixels in m0/m1.
    mova                 m2, [tmp1q+16*0]
    mova                 m0, [tmp2q+16*0]
    punpckhwd            m3, m0, m2            ; interleave tmp2/tmp1 words
    punpcklwd            m0, m2
    mova                 m2, [tmp1q+16*1]
    mova                 m1, [tmp2q+16*1]
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    pmaddwd              m3, m4                ; tmp1*weight + tmp2*(16-weight)
    pmaddwd              m0, m4
    paddd                m3, m5                ; add rounding bias
    paddd                m0, m5
    psrad                m3, 8
    psrad                m0, 8
    packssdw             m0, m3
    punpckhwd            m3, m1, m2
    punpcklwd            m1, m2
    pmaddwd              m3, m4
    pmaddwd              m1, m4
    paddd                m3, m5
    paddd                m1, m5
    psrad                m3, 8
    psrad                m1, 8
    packssdw             m1, m3
    pminsw               m0, m6                ; clamp high at pixel_max
    pminsw               m1, m6
    pmaxsw               m0, m7                ; clamp low at 0
    pmaxsw               m1, m7
    ret
8449
; mask: per-pixel blend of two intermediate buffers using an 8-bit mask,
; dst = (tmp1*m + tmp2*(64-m)) >> 5, rescaled and clamped to pixel range.
%if ARCH_X86_64
cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%else
cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
%define hd dword r5m
%define m8 [base+pw_64]
%endif
%define base r6-mask_ssse3_table
    LEA                  r6, mask_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r7m ; pixel_max
    shr                 t0d, 11                ; 0 for 10bpc, 1 for 12bpc
    movsxd               wq, [r6+wq*4]
    movddup              m6, [base+bidir_rnd+t0*8]
    movddup              m7, [base+bidir_mul+t0*8]
%if ARCH_X86_64
    mova                 m8, [base+pw_64]
    movifnidn            hd, hm
%endif
    add                  wq, r6
    mov               maskq, r6mp
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 16 blended pixels in m0/m1, consuming 16 mask bytes.
    movq                 m3, [maskq+8*0]
    mova                 m0, [tmp1q+16*0]
    mova                 m4, [tmp2q+16*0]
    pxor                 m5, m5
    punpcklbw            m3, m5                ; widen mask bytes to words
    punpckhwd            m2, m0, m4            ; interleave tmp1/tmp2 words
    punpcklwd            m0, m4
    psubw                m1, m8, m3            ; 64-m
    punpckhwd            m4, m3, m1 ; m, 64-m
    punpcklwd            m3, m1
    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd              m0, m3
    movq                 m3, [maskq+8*1]
    mova                 m1, [tmp1q+16*1]
    mova                 m4, [tmp2q+16*1]
    add               maskq, 8*2
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    psrad                m2, 5
    psrad                m0, 5
    packssdw             m0, m2
    punpcklbw            m3, m5
    punpckhwd            m2, m1, m4
    punpcklwd            m1, m4
    psubw                m5, m8, m3
    punpckhwd            m4, m3, m5 ; m, 64-m
    punpcklwd            m3, m5
    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd              m1, m3
    psrad                m2, 5
    psrad                m1, 5
    packssdw             m1, m2
    pmaxsw               m0, m6                ; clamp low against the bias
    pmaxsw               m1, m6
    psubsw               m0, m6                ; remove the intermediate bias
    psubsw               m1, m6
    pmulhw               m0, m7                ; scale down to pixel range
    pmulhw               m1, m7
    ret
8513
8514cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
8515%define base t0-w_mask_420_ssse3_table
8516    LEA                  t0, w_mask_420_ssse3_table
8517    tzcnt                wd, wm
8518    mov                 r6d, r8m ; pixel_max
8519    movd                 m0, r7m ; sign
8520    shr                 r6d, 11
8521    movsxd               wq, [t0+wq*4]
8522%if ARCH_X86_64
8523    mova                 m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
8524    mova                 m9, [base+pw_64]
8525    movddup             m10, [base+bidir_rnd+r6*8]
8526    movddup             m11, [base+bidir_mul+r6*8]
8527%else
8528    mova                 m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
8529    mova                 m2, [base+pw_64]
8530    movddup              m3, [base+bidir_rnd+r6*8]
8531    movddup              m4, [base+bidir_mul+r6*8]
8532    ALLOC_STACK       -16*4
8533    mova         [rsp+16*0], m1
8534    mova         [rsp+16*1], m2
8535    mova         [rsp+16*2], m3
8536    mova         [rsp+16*3], m4
8537    %define              m8  [rsp+gprsize+16*0]
8538    %define              m9  [rsp+gprsize+16*1]
8539    %define             m10  [rsp+gprsize+16*2]
8540    %define             m11  [rsp+gprsize+16*3]
8541%endif
8542    movd                 m7, [base+pw_2]
8543    psubw                m7, m0
8544    pshufb               m7, [base+pw_256]
8545    add                  wq, t0
8546    movifnidn            hd, r5m
8547    mov               maskq, r6mp
8548    call .main
8549    jmp                  wq
8550.w4_loop:
8551    call .main
8552    lea                dstq, [dstq+strideq*2]
8553    add               maskq, 4
8554.w4:
8555    movq   [dstq+strideq*0], m0
8556    phaddw               m2, m3
8557    movhps [dstq+strideq*1], m0
8558    phaddd               m2, m2
8559    lea                dstq, [dstq+strideq*2]
8560    paddw                m2, m7
8561    movq   [dstq+strideq*0], m1
8562    psrlw                m2, 2
8563    movhps [dstq+strideq*1], m1
8564    packuswb             m2, m2
8565    movd            [maskq], m2
8566    sub                  hd, 4
8567    jg .w4_loop
8568    RET
8569.w8_loop:
8570    call .main
8571    lea                dstq, [dstq+strideq*2]
8572    add               maskq, 4
8573.w8:
8574    mova   [dstq+strideq*0], m0
8575    paddw                m2, m3
8576    phaddw               m2, m2
8577    mova   [dstq+strideq*1], m1
8578    paddw                m2, m7
8579    psrlw                m2, 2
8580    packuswb             m2, m2
8581    movd            [maskq], m2
8582    sub                  hd, 2
8583    jg .w8_loop
8584    RET
8585.w16_loop:
8586    call .main
8587    lea                dstq, [dstq+strideq*2]
8588    add               maskq, 8
8589.w16:
8590    mova [dstq+strideq*1+16*0], m2
8591    mova [dstq+strideq*0+16*0], m0
8592    mova [dstq+strideq*1+16*1], m3
8593    mova [dstq+strideq*0+16*1], m1
8594    call .main
8595    paddw                m2, [dstq+strideq*1+16*0]
8596    paddw                m3, [dstq+strideq*1+16*1]
8597    mova [dstq+strideq*1+16*0], m0
8598    phaddw               m2, m3
8599    mova [dstq+strideq*1+16*1], m1
8600    paddw                m2, m7
8601    psrlw                m2, 2
8602    packuswb             m2, m2
8603    movq            [maskq], m2
8604    sub                  hd, 2
8605    jg .w16_loop
8606    RET
8607.w32_loop:
8608    call .main
8609    lea                dstq, [dstq+strideq*2]
8610    add               maskq, 16
8611.w32:
8612    mova [dstq+strideq*1+16*0], m2
8613    mova [dstq+strideq*0+16*0], m0
8614    mova [dstq+strideq*1+16*1], m3
8615    mova [dstq+strideq*0+16*1], m1
8616    call .main
8617    mova [dstq+strideq*0+16*2], m0
8618    phaddw               m2, m3
8619    mova [dstq+strideq*1+16*3], m2
8620    mova [dstq+strideq*0+16*3], m1
8621    call .main
8622    paddw                m2, [dstq+strideq*1+16*0]
8623    paddw                m3, [dstq+strideq*1+16*1]
8624    mova [dstq+strideq*1+16*0], m0
8625    phaddw               m2, m3
8626    mova [dstq+strideq*1+16*2], m2
8627    mova [dstq+strideq*1+16*1], m1
8628    call .main
8629    phaddw               m2, m3
8630    paddw                m3, m7, [dstq+strideq*1+16*2]
8631    paddw                m2, [dstq+strideq*1+16*3]
8632    mova [dstq+strideq*1+16*2], m0
8633    paddw                m2, m7
8634    psrlw                m3, 2
8635    psrlw                m2, 2
8636    mova [dstq+strideq*1+16*3], m1
8637    packuswb             m3, m2
8638    mova            [maskq], m3
8639    sub                  hd, 2
8640    jg .w32_loop
8641    RET
8642.w64_loop:
8643    call .main
8644    lea                dstq, [dstq+strideq*2]
8645    add               maskq, 16*2
8646.w64:
8647    mova [dstq+strideq*1+16*1], m2
8648    mova [dstq+strideq*0+16*0], m0
8649    mova [dstq+strideq*1+16*2], m3
8650    mova [dstq+strideq*0+16*1], m1
8651    call .main
8652    mova [dstq+strideq*1+16*3], m2
8653    mova [dstq+strideq*0+16*2], m0
8654    mova [dstq+strideq*1+16*4], m3
8655    mova [dstq+strideq*0+16*3], m1
8656    call .main
8657    mova [dstq+strideq*1+16*5], m2
8658    mova [dstq+strideq*0+16*4], m0
8659    mova [dstq+strideq*1+16*6], m3
8660    mova [dstq+strideq*0+16*5], m1
8661    call .main
8662    mova [dstq+strideq*0+16*6], m0
8663    phaddw               m2, m3
8664    mova [dstq+strideq*1+16*7], m2
8665    mova [dstq+strideq*0+16*7], m1
8666    call .main
8667    paddw                m2, [dstq+strideq*1+16*1]
8668    paddw                m3, [dstq+strideq*1+16*2]
8669    mova [dstq+strideq*1+16*0], m0
8670    phaddw               m2, m3
8671    mova [dstq+strideq*1+16*2], m2
8672    mova [dstq+strideq*1+16*1], m1
8673    call .main
8674    paddw                m2, [dstq+strideq*1+16*3]
8675    paddw                m3, [dstq+strideq*1+16*4]
8676    phaddw               m2, m3
8677    paddw                m3, m7, [dstq+strideq*1+16*2]
8678    mova [dstq+strideq*1+16*2], m0
8679    paddw                m2, m7
8680    psrlw                m3, 2
8681    psrlw                m2, 2
8682    mova [dstq+strideq*1+16*3], m1
8683    packuswb             m3, m2
8684    mova       [maskq+16*0], m3
8685    call .main
8686    paddw                m2, [dstq+strideq*1+16*5]
8687    paddw                m3, [dstq+strideq*1+16*6]
8688    mova [dstq+strideq*1+16*4], m0
8689    phaddw               m2, m3
8690    mova [dstq+strideq*1+16*6], m2
8691    mova [dstq+strideq*1+16*5], m1
8692    call .main
8693    phaddw               m2, m3
8694    paddw                m3, m7, [dstq+strideq*1+16*6]
8695    paddw                m2, [dstq+strideq*1+16*7]
8696    mova [dstq+strideq*1+16*6], m0
8697    paddw                m2, m7
8698    psrlw                m3, 2
8699    psrlw                m2, 2
8700    mova [dstq+strideq*1+16*7], m1
8701    packuswb             m3, m2
8702    mova       [maskq+16*1], m3
8703    sub                  hd, 2
8704    jg .w64_loop
8705    RET
8706.w128_loop:
8707    call .main
8708    lea                dstq, [dstq+strideq*2]
8709    add               maskq, 16*4
8710.w128:
8711    mova [dstq+strideq*1+16* 1], m2
8712    mova [dstq+strideq*0+16* 0], m0
8713    mova [dstq+strideq*1+16* 2], m3
8714    mova [dstq+strideq*0+16* 1], m1
8715    call .main
8716    mova [dstq+strideq*1+16* 3], m2
8717    mova [dstq+strideq*0+16* 2], m0
8718    mova [dstq+strideq*1+16* 4], m3
8719    mova [dstq+strideq*0+16* 3], m1
8720    call .main
8721    mova [dstq+strideq*1+16* 5], m2
8722    mova [dstq+strideq*0+16* 4], m0
8723    mova [dstq+strideq*1+16* 6], m3
8724    mova [dstq+strideq*0+16* 5], m1
8725    call .main
8726    mova [dstq+strideq*1+16* 7], m2
8727    mova [dstq+strideq*0+16* 6], m0
8728    mova [dstq+strideq*1+16* 8], m3
8729    mova [dstq+strideq*0+16* 7], m1
8730    call .main
8731    mova [dstq+strideq*1+16* 9], m2
8732    mova [dstq+strideq*0+16* 8], m0
8733    mova [dstq+strideq*1+16*10], m3
8734    mova [dstq+strideq*0+16* 9], m1
8735    call .main
8736    mova [dstq+strideq*1+16*11], m2
8737    mova [dstq+strideq*0+16*10], m0
8738    mova [dstq+strideq*1+16*12], m3
8739    mova [dstq+strideq*0+16*11], m1
8740    call .main
8741    mova [dstq+strideq*1+16*13], m2
8742    mova [dstq+strideq*0+16*12], m0
8743    mova [dstq+strideq*1+16*14], m3
8744    mova [dstq+strideq*0+16*13], m1
8745    call .main
8746    mova [dstq+strideq*0+16*14], m0
8747    phaddw               m2, m3
8748    mova [dstq+strideq*1+16*15], m2
8749    mova [dstq+strideq*0+16*15], m1
8750    call .main
8751    paddw                m2, [dstq+strideq*1+16* 1]
8752    paddw                m3, [dstq+strideq*1+16* 2]
8753    mova [dstq+strideq*1+16* 0], m0
8754    phaddw               m2, m3
8755    mova [dstq+strideq*1+16* 2], m2
8756    mova [dstq+strideq*1+16* 1], m1
8757    call .main
8758    paddw                m2, [dstq+strideq*1+16* 3]
8759    paddw                m3, [dstq+strideq*1+16* 4]
8760    phaddw               m2, m3
8761    paddw                m3, m7, [dstq+strideq*1+16* 2]
8762    mova [dstq+strideq*1+16* 2], m0
8763    paddw                m2, m7
8764    psrlw                m3, 2
8765    psrlw                m2, 2
8766    mova [dstq+strideq*1+16* 3], m1
8767    packuswb             m3, m2
8768    mova       [maskq+16*0], m3
8769    call .main
8770    paddw                m2, [dstq+strideq*1+16* 5]
8771    paddw                m3, [dstq+strideq*1+16* 6]
8772    mova [dstq+strideq*1+16* 4], m0
8773    phaddw               m2, m3
8774    mova [dstq+strideq*1+16* 6], m2
8775    mova [dstq+strideq*1+16* 5], m1
8776    call .main
8777    paddw                m2, [dstq+strideq*1+16* 7]
8778    paddw                m3, [dstq+strideq*1+16* 8]
8779    phaddw               m2, m3
8780    paddw                m3, m7, [dstq+strideq*1+16* 6]
8781    mova [dstq+strideq*1+16* 6], m0
8782    paddw                m2, m7
8783    psrlw                m3, 2
8784    psrlw                m2, 2
8785    mova [dstq+strideq*1+16* 7], m1
8786    packuswb             m3, m2
8787    mova       [maskq+16*1], m3
8788    call .main
8789    paddw                m2, [dstq+strideq*1+16* 9]
8790    paddw                m3, [dstq+strideq*1+16*10]
8791    mova [dstq+strideq*1+16* 8], m0
8792    phaddw               m2, m3
8793    mova [dstq+strideq*1+16*10], m2
8794    mova [dstq+strideq*1+16* 9], m1
8795    call .main
8796    paddw                m2, [dstq+strideq*1+16*11]
8797    paddw                m3, [dstq+strideq*1+16*12]
8798    phaddw               m2, m3
8799    paddw                m3, m7, [dstq+strideq*1+16*10]
8800    mova [dstq+strideq*1+16*10], m0
8801    paddw                m2, m7
8802    psrlw                m3, 2
8803    psrlw                m2, 2
8804    mova [dstq+strideq*1+16*11], m1
8805    packuswb             m3, m2
8806    mova       [maskq+16*2], m3
8807    call .main
8808    paddw                m2, [dstq+strideq*1+16*13]
8809    paddw                m3, [dstq+strideq*1+16*14]
8810    mova [dstq+strideq*1+16*12], m0
8811    phaddw               m2, m3
8812    mova [dstq+strideq*1+16*14], m2
8813    mova [dstq+strideq*1+16*13], m1
8814    call .main
8815    phaddw               m2, m3
8816    paddw                m3, m7, [dstq+strideq*1+16*14]
8817    paddw                m2, [dstq+strideq*1+16*15]
8818    mova [dstq+strideq*1+16*14], m0
8819    paddw                m2, m7
8820    psrlw                m3, 2
8821    psrlw                m2, 2
8822    mova [dstq+strideq*1+16*15], m1
8823    packuswb             m3, m2
8824    mova       [maskq+16*3], m3
8825    sub                  hd, 2
8826    jg .w128_loop
8827    RET
8828ALIGN function_align
8829.main:
8830%macro W_MASK 2 ; dst/tmp_offset, mask
8831    mova                m%1, [tmp1q+16*%1]
8832    mova                m%2, [tmp2q+16*%1]
8833    punpcklwd            m4, m%2, m%1
8834    punpckhwd            m5, m%2, m%1
8835    psubsw              m%1, m%2
8836    pabsw               m%1, m%1
8837    psubusw              m6, m8, m%1
8838    psrlw                m6, 10      ; 64-m
8839    psubw               m%2, m9, m6  ; m
8840    punpcklwd           m%1, m6, m%2
8841    punpckhwd            m6, m%2
8842    pmaddwd             m%1, m4
8843    pmaddwd              m6, m5
8844    psrad               m%1, 5
8845    psrad                m6, 5
8846    packssdw            m%1, m6
8847    pmaxsw              m%1, m10
8848    psubsw              m%1, m10
8849    pmulhw              m%1, m11
8850%endmacro
8851    W_MASK                0, 2
8852    W_MASK                1, 3
8853    add               tmp1q, 16*2
8854    add               tmp2q, 16*2
8855    ret
8856
;-----------------------------------------------------------------------
; w_mask_422_16bpc
; Bidirectional diff-weighted blend: dst pixels are produced from
; tmp1/tmp2 with per-pixel 0..64 weights derived from |tmp1 - tmp2|
; (via the W_MASK macro defined in the 420 version above), and the
; weights are additionally written out as bytes, 2:1 horizontally
; subsampled (4:2:2 layout).
; In (stack args): r4m = w, r5m = h, r6m = mask ptr, r7m = sign,
;                  r8m = pixel_max
;-----------------------------------------------------------------------
cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA                  t0, w_mask_422_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movd                 m7, r7m ; sign
    shr                 r6d, 11  ; 1023 -> 0 (10bpc), 4095 -> 1 (12bpc)
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615]
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    ; x86-32 has only 8 xmm regs; the four constants live in stack
    ; slots instead (m8-m11 are presumably aliased to these slots in
    ; the 420 section above -- outside this view)
    mova                 m1, [base+pw_27615]
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m4, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*4
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    mova         [rsp+16*3], m4
%endif
    pxor                 m0, m0
    add                  wq, t0
    pshufb               m7, m0   ; broadcast sign byte to all lanes
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4: ; 4 pixels/row; m0/m1 each hold 2 rows -> 4 rows per iteration
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8: ; 8 pixels/row, 2 rows per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16: ; 16 pixels/row, 2 rows per iteration (one .main call per row)
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32: ; 32 pixels/row, 1 row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64: ; 64 pixels/row, 1 row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128: ; 128 pixels/row, 1 row per iteration
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Blend 16 pixels into m0/m1; W_MASK leaves the corresponding
    ; 0..64 mask words in m2/m3.
    W_MASK                0, 2
    W_MASK                1, 3
    phaddw               m2, m3   ; sum horizontally adjacent mask pairs
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    packuswb             m2, m2   ; pair sums fit in a byte (<= 128)
    pxor                 m3, m3
    psubb                m2, m7   ; - sign
    pavgb                m2, m3   ; (m0 + m1 - sign + 1) >> 1
    movq            [maskq], m2   ; 8 mask bytes = 16 source pixels
    add               maskq, 8
    ret
8997
;-----------------------------------------------------------------------
; w_mask_444_16bpc
; Same bidirectional diff-weighted blend as the 422 version above, but
; the 0..64 mask is stored at full resolution (4:4:4), one byte per
; pixel, with no sign adjustment.
; In (stack args): r4m = w, r5m = h, r6m = mask ptr, r8m = pixel_max
;-----------------------------------------------------------------------
cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA                  t0, w_mask_444_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    shr                 r6d, 11  ; 1023 -> 0 (10bpc), 4095 -> 1 (12bpc)
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615]
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    ; x86-32: only three constants are spilled; bidir_mul stays in m7,
    ; which is aliased to m11 below
    mova                 m1, [base+pw_27615]
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m7, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*3
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    %define             m11  m7
%endif
    add                  wq, t0
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4: ; 4 pixels/row; m0/m1 each hold 2 rows -> 4 rows per iteration
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8: ; 8 pixels/row, 2 rows per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16: ; 16 pixels/row, 2 rows per iteration (one .main call per row)
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32: ; 32 pixels/row, 1 row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64: ; 64 pixels/row, 1 row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128: ; 128 pixels/row, 1 row per iteration
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Blend 16 pixels into m0/m1 and store their 16 mask bytes verbatim
    ; (full resolution, no subsampling).
    W_MASK                0, 2
    W_MASK                1, 3
    packuswb             m2, m3
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    mova            [maskq], m2
    add               maskq, 16
    ret
9131
9132; (a * (64 - m) + b * m + 32) >> 6
9133; = (((b - a) * m + 32) >> 6) + a
9134; = (((b - a) * (m << 9) + 16384) >> 15) + a
9135;   except m << 9 overflows int16_t when m == 64 (which is possible),
9136;   but if we negate m it works out (-64 << 9 == -32768).
9137; = (((a - b) * (m * -512) + 16384) >> 15) + a
;-----------------------------------------------------------------------
; blend_16bpc
; Masked blend of tmp into dst with a per-pixel 0..64 byte mask, using
; the pmulhrsw formulation derived in the comment above (mask negated
; and pre-scaled by 512 so m == 64 does not overflow int16_t).
; Register roles: m6 = zero (for mask byte unpacking), m7 = pw_m512.
;-----------------------------------------------------------------------
cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
%define base r6-blend_ssse3_table
    LEA                  r6, blend_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]
    movifnidn         maskq, maskmp
    mova                 m7, [base+pw_m512]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    pxor                 m6, m6
    jmp                  wq
.w4: ; 4 pixels/row, 4 rows per iteration
    mova                 m5, [maskq]
    movq                 m0, [dstq+strideq*0]
    movhps               m0, [dstq+strideq*1]
    movq                 m1, [dstq+strideq*2]
    movhps               m1, [dstq+stride3q ]
    psubw                m2, m0, [tmpq+16*0]  ; a - b
    psubw                m3, m1, [tmpq+16*1]
    add               maskq, 16
    add                tmpq, 32
    punpcklbw            m4, m5, m6           ; zero-extend mask bytes
    punpckhbw            m5, m6
    pmullw               m4, m7               ; m * -512
    pmullw               m5, m7
    pmulhrsw             m2, m4               ; ((a-b)*(m*-512)+16384)>>15
    pmulhrsw             m3, m5
    paddw                m0, m2               ; + a
    paddw                m1, m3
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+stride3q ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8: ; 8 pixels/row, 2 rows per iteration
    mova                 m5, [maskq]
    mova                 m0, [dstq+strideq*0]
    mova                 m1, [dstq+strideq*1]
    psubw                m2, m0, [tmpq+16*0]  ; a - b
    psubw                m3, m1, [tmpq+16*1]
    add               maskq, 16
    add                tmpq, 32
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8
    RET
.w16: ; 16 pixels/row, 1 row per iteration
    mova                 m5, [maskq]
    mova                 m0, [dstq+16*0]
    mova                 m1, [dstq+16*1]
    psubw                m2, m0, [tmpq+16*0]  ; a - b
    psubw                m3, m1, [tmpq+16*1]
    add               maskq, 16
    add                tmpq, 32
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w16
    RET
.w32: ; 32 pixels/row, 1 row per iteration (two 16-pixel halves)
    mova                 m5, [maskq+16*0]
    mova                 m0, [dstq+16*0]
    mova                 m1, [dstq+16*1]
    psubw                m2, m0, [tmpq+16*0]  ; a - b
    psubw                m3, m1, [tmpq+16*1]
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    ; second 16-pixel half of the row
    mova                 m5, [maskq+16*1]
    mova                 m0, [dstq+16*2]
    mova                 m1, [dstq+16*3]
    psubw                m2, m0, [tmpq+16*2]
    psubw                m3, m1, [tmpq+16*3]
    add               maskq, 32
    add                tmpq, 64
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
9257
;-----------------------------------------------------------------------
; blend_v_16bpc
; OBMC vertical-edge blend: dst += ((tmp - dst) * obmc_mask) via
; pmulhrsw; the obmc_masks table (top of file) is pre-shifted << 9 so
; pmulhrsw yields the rounded >> 6 weighting. Mask weights vary per
; column and are constant down each column.
;-----------------------------------------------------------------------
cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
%define base r5-blend_v_ssse3_table
    LEA                  r5, blend_v_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  wq
.w2:
    movd                 m4, [base+obmc_masks+2*2]  ; 2 mask words for w=2
.w2_loop:
    movd                 m0, [dstq+strideq*0]
    movd                 m2, [tmpq+4*0]
    movd                 m1, [dstq+strideq*1]
    movd                 m3, [tmpq+4*1]
    add                tmpq, 4*2
    psubw                m2, m0   ; b - a
    psubw                m3, m1
    pmulhrsw             m2, m4   ; * (mask << 9), rounded
    pmulhrsw             m3, m4
    paddw                m0, m2   ; + a
    paddw                m1, m3
    movd   [dstq+strideq*0], m0
    movd   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    movddup              m2, [base+obmc_masks+4*2]  ; 4 mask words, both halves
.w4_loop:
    movq                 m0, [dstq+strideq*0]
    movhps               m0, [dstq+strideq*1]
    mova                 m1, [tmpq]
    add                tmpq, 8*2
    psubw                m1, m0   ; b - a
    pmulhrsw             m1, m2
    paddw                m0, m1   ; + a
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
.w8:
    mova                 m4, [base+obmc_masks+8*2]  ; 8 mask words for w=8
.w8_loop:
    mova                 m0, [dstq+strideq*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+strideq*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    psubw                m2, m0   ; b - a
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2   ; + a
    paddw                m1, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    ; m5 is loaded with movq, so only its low 4 mask words are nonzero;
    ; the remaining pixels get mask 0 and are stored back unchanged
    mova                 m4, [base+obmc_masks+16*2]
    movq                 m5, [base+obmc_masks+16*3]
.w16_loop:
    mova                 m0, [dstq+16*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+16*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    psubw                m2, m0   ; b - a
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2   ; + a
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w16_loop
    RET
.w32:
; only the leftmost 24 of 32 columns are touched (3 xmm stores per row)
%if WIN64
    ; m6 exceeds the 6 xmm regs declared in cglobal, and xmm6 is
    ; callee-saved on Win64, so save/restore it manually
    movaps          [rsp+8], m6
%endif
    mova                 m4, [base+obmc_masks+16*4]
    mova                 m5, [base+obmc_masks+16*5]
    mova                 m6, [base+obmc_masks+16*6]
.w32_loop:
    mova                 m0, [dstq+16*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+16*1]
    mova                 m3, [tmpq+16*1]
    psubw                m2, m0   ; b - a
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2   ; + a
    mova                 m2, [dstq+16*2]
    paddw                m1, m3
    mova                 m3, [tmpq+16*2]
    add                tmpq, 16*4
    psubw                m3, m2
    pmulhrsw             m3, m6
    paddw                m2, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
%if WIN64
    movaps               m6, [rsp+8]
%endif
    RET
9377
; Blend one 16-pixel row segment for blend_h: dst[16*%1 .. +31] is
; blended against tmp[16*%2 .. +31] with the row weight in m5 (already
; broadcast and pre-shifted for pmulhrsw).
;   %1 = dst offset, %2 = tmp offset (both in 16-byte units)
;   %3 = optional tmpq advance (16-byte units), applied after the loads
; Clobbers m0-m3.
%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
    mova                 m0, [dstq+16*(%1+0)]
    mova                 m2, [tmpq+16*(%2+0)]
    mova                 m1, [dstq+16*(%1+1)]
    mova                 m3, [tmpq+16*(%2+1)]
%if %3
    add                tmpq, 16*%3
%endif
    psubw                m2, m0           ; b - a
    psubw                m3, m1
    pmulhrsw             m2, m5           ; * (mask << 9), rounded
    pmulhrsw             m3, m5
    paddw                m0, m2           ; + a
    paddw                m1, m3
    mova   [dstq+16*(%1+0)], m0
    mova   [dstq+16*(%1+1)], m1
%endmacro
9395
;-----------------------------------------------------------------------
; blend_h_16bpc
; OBMC horizontal-edge blend: each row is blended with a single weight
; taken from obmc_masks, and only the top h*3/4 rows are processed.
; maskq is pointed past the last mask entry and hq counts upward from
; -(h*3/4) to 0, so [maskq+hq*2] walks the per-row mask words.
; m4 = blend_shuf broadcast pattern for duplicating a mask word.
;-----------------------------------------------------------------------
cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_ssse3_table
    LEA                  r6, blend_h_ssse3_table
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m4, [base+blend_shuf]
    lea               maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    add                  wq, r6
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]  ; end of this height's masks
    neg                  hq                ; negative row counter
    jmp                  wq
.w2: ; 2 pixels/row, 2 rows per iteration
    movd                 m0, [dstq+dsq*0]
    movd                 m2, [dstq+dsq*1]
    movd                 m3, [maskq+hq*2]  ; 2 row masks
    movq                 m1, [tmpq]
    add                tmpq, 4*2
    punpckldq            m0, m2            ; pack both rows into m0
    punpcklwd            m3, m3            ; duplicate each row mask
    psubw                m1, m0            ; b - a
    pmulhrsw             m1, m3
    paddw                m0, m1            ; + a
    movd       [dstq+dsq*0], m0
    psrlq                m0, 32
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
.w4:
    mova                 m3, [base+blend_shuf]
.w4_loop: ; 4 pixels/row, 2 rows per iteration
    movq                 m0, [dstq+dsq*0]
    movhps               m0, [dstq+dsq*1]
    movd                 m2, [maskq+hq*2]
    mova                 m1, [tmpq]
    add                tmpq, 8*2
    psubw                m1, m0            ; b - a
    pshufb               m2, m3            ; broadcast row masks per half
    pmulhrsw             m1, m2
    paddw                m0, m1            ; + a
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
.w8:
    movddup              m5, [base+blend_shuf+8]  ; shuffle for row 2's mask
%if WIN64
    ; m6/m7 exceed the 6 xmm regs declared in cglobal; xmm6/xmm7 are
    ; callee-saved on Win64, so save/restore them manually
    movaps         [rsp+ 8], m6
    movaps         [rsp+24], m7
%endif
.w8_loop: ; 8 pixels/row, 2 rows per iteration
    movd                 m7, [maskq+hq*2]  ; masks for both rows
    mova                 m0, [dstq+dsq*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+dsq*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    pshufb               m6, m7, m4        ; broadcast row 0's mask
    psubw                m2, m0            ; b - a
    pshufb               m7, m5            ; broadcast row 1's mask
    psubw                m3, m1
    pmulhrsw             m2, m6
    pmulhrsw             m3, m7
    paddw                m0, m2            ; + a
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
%if WIN64
    movaps               m6, [rsp+ 8]
    movaps               m7, [rsp+24]
%endif
    RET
.w16: ; 16 pixels/row, 1 row per iteration
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4            ; broadcast this row's mask
    BLEND_H_ROW           0, 0, 2
    add                dstq, dsq
    inc                  hq
    jl .w16
    RET
.w32: ; 32 pixels/row, 1 row per iteration
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2, 4
    add                dstq, dsq
    inc                  hq
    jl .w32
    RET
.w64: ; 64 pixels/row, 1 row per iteration
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2
    BLEND_H_ROW           4, 4
    BLEND_H_ROW           6, 6, 8
    add                dstq, dsq
    inc                  hq
    jl .w64
    RET
.w128: ; 128 pixels/row; tmpq is advanced mid-row, so the last four
       ; segments use negative tmp offsets
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0,  0
    BLEND_H_ROW           2,  2
    BLEND_H_ROW           4,  4
    BLEND_H_ROW           6,  6, 16
    BLEND_H_ROW           8, -8
    BLEND_H_ROW          10, -6
    BLEND_H_ROW          12, -4
    BLEND_H_ROW          14, -2
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET
9520
9521; emu_edge args:
9522; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
9523; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
9524; const pixel *ref, const ptrdiff_t ref_stride
9525;
9526; bw, bh total filled size
9527; iw, ih, copied block -> fill bottom, right
9528; x, y, offset in bw/bh -> fill top, left
9529cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
9530                             y, dst, dstride, src, sstride, \
9531                             bottomext, rightext, blk
9532    ; we assume that the buffer (stride) is larger than width, so we can
9533    ; safely overwrite by a few bytes
9534
9535%if ARCH_X86_64
9536 %define reg_zero       r12q
9537 %define reg_tmp        r10
9538 %define reg_src        srcq
9539 %define reg_bottomext  bottomextq
9540 %define reg_rightext   rightextq
9541 %define reg_blkm       r9m
9542%else
9543 %define reg_zero       r6
9544 %define reg_tmp        r0
9545 %define reg_src        r1
9546 %define reg_bottomext  r0
9547 %define reg_rightext   r1
9548 %define reg_blkm       r2m
9549%endif
9550    ;
9551    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
9552    xor            reg_zero, reg_zero
9553    lea             reg_tmp, [ihq-1]
9554    cmp                  yq, ihq
9555    cmovs           reg_tmp, yq
9556    test                 yq, yq
9557    cmovs           reg_tmp, reg_zero
9558%if ARCH_X86_64
9559    imul            reg_tmp, sstrideq
9560    add                srcq, reg_tmp
9561%else
9562    imul            reg_tmp, sstridem
9563    mov             reg_src, srcm
9564    add             reg_src, reg_tmp
9565%endif
9566    ;
9567    ; ref += iclip(x, 0, iw - 1)
9568    lea             reg_tmp, [iwq-1]
9569    cmp                  xq, iwq
9570    cmovs           reg_tmp, xq
9571    test                 xq, xq
9572    cmovs           reg_tmp, reg_zero
9573    lea             reg_src, [reg_src+reg_tmp*2]
9574%if ARCH_X86_32
9575    mov                srcm, reg_src
9576%endif
9577    ;
9578    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
9579%if ARCH_X86_32
9580    mov                  r1, r1m ; restore bh
9581%endif
9582    lea       reg_bottomext, [yq+bhq]
9583    sub       reg_bottomext, ihq
9584    lea                  r3, [bhq-1]
9585    cmovs     reg_bottomext, reg_zero
9586    ;
9587
9588    DEFINE_ARGS bw, bh, iw, ih, x, \
9589                topext, dst, dstride, src, sstride, \
9590                bottomext, rightext, blk
9591
9592    ; top_ext = iclip(-y, 0, bh - 1)
9593    neg             topextq
9594    cmovs           topextq, reg_zero
9595    cmp       reg_bottomext, bhq
9596    cmovns    reg_bottomext, r3
9597    cmp             topextq, bhq
9598    cmovg           topextq, r3
9599 %if ARCH_X86_32
9600    mov                 r4m, reg_bottomext
9601    ;
9602    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
9603    mov                  r0, r0m ; restore bw
9604 %endif
9605    lea        reg_rightext, [xq+bwq]
9606    sub        reg_rightext, iwq
9607    lea                  r2, [bwq-1]
9608    cmovs      reg_rightext, reg_zero
9609
9610    DEFINE_ARGS bw, bh, iw, ih, leftext, \
9611                topext, dst, dstride, src, sstride, \
9612                bottomext, rightext, blk
9613
9614    ; left_ext = iclip(-x, 0, bw - 1)
9615    neg            leftextq
9616    cmovs          leftextq, reg_zero
9617    cmp        reg_rightext, bwq
9618    cmovns     reg_rightext, r2
9619 %if ARCH_X86_32
9620    mov                 r3m, r1
9621 %endif
9622    cmp            leftextq, bwq
9623    cmovns         leftextq, r2
9624
9625%undef reg_zero
9626%undef reg_tmp
9627%undef reg_src
9628%undef reg_bottomext
9629%undef reg_rightext
9630
9631    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
9632                topext, dst, dstride, src, sstride, \
9633                bottomext, rightext, blk
9634
9635    ; center_h = bh - top_ext - bottom_ext
9636%if ARCH_X86_64
9637    lea                  r3, [bottomextq+topextq]
9638    sub            centerhq, r3
9639%else
9640    mov                   r1, centerhm ; restore r1
9641    sub             centerhq, topextq
9642    sub             centerhq, r4m
9643    mov                  r1m, centerhq
9644%endif
9645    ;
9646    ; blk += top_ext * PXSTRIDE(dst_stride)
9647    mov                  r2, topextq
9648%if ARCH_X86_64
9649    imul                 r2, dstrideq
9650%else
9651    mov                  r6, r6m ; restore dstq
9652    imul                 r2, dstridem
9653%endif
9654    add                dstq, r2
9655    mov            reg_blkm, dstq ; save pointer for ext
9656    ;
9657    ; center_w = bw - left_ext - right_ext
9658    mov            centerwq, bwq
9659%if ARCH_X86_64
9660    lea                  r3, [rightextq+leftextq]
9661    sub            centerwq, r3
9662%else
9663    sub            centerwq, r3m
9664    sub            centerwq, leftextq
9665%endif
9666
; v_loop macro: emit one vertical loop over the center rows, with optional
; left/right edge replication (see parameters below)
9668%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
9669  %if ARCH_X86_64
9670    %define reg_tmp        r12
9671  %else
9672    %define reg_tmp        r0
9673  %endif
9674.v_loop_%3:
9675  %if ARCH_X86_32
9676    mov                  r0, r0m
9677    mov                  r1, r1m
9678  %endif
9679%if %1
9680    ; left extension
9681  %if ARCH_X86_64
9682    movd                 m0, [srcq]
9683  %else
9684    mov                  r3, srcm
9685    movd                 m0, [r3]
9686  %endif
9687    pshuflw              m0, m0, q0000
9688    punpcklqdq           m0, m0
9689    xor                  r3, r3
9690.left_loop_%3:
9691    mova        [dstq+r3*2], m0
9692    add                  r3, mmsize/2
9693    cmp                  r3, leftextq
9694    jl .left_loop_%3
9695    ; body
9696    lea             reg_tmp, [dstq+leftextq*2]
9697%endif
9698    xor                  r3, r3
9699.body_loop_%3:
9700  %if ARCH_X86_64
9701    movu                 m0, [srcq+r3*2]
9702  %else
9703    mov                  r1, srcm
9704    movu                 m0, [r1+r3*2]
9705  %endif
9706%if %1
9707    movu     [reg_tmp+r3*2], m0
9708%else
9709    movu        [dstq+r3*2], m0
9710%endif
9711    add                  r3, mmsize/2
9712    cmp                  r3, centerwq
9713    jl .body_loop_%3
9714%if %2
9715    ; right extension
9716%if %1
9717    lea             reg_tmp, [reg_tmp+centerwq*2]
9718%else
9719    lea             reg_tmp, [dstq+centerwq*2]
9720%endif
9721  %if ARCH_X86_64
9722    movd                 m0, [srcq+centerwq*2-2]
9723  %else
9724    mov                  r3, srcm
9725    movd                 m0, [r3+centerwq*2-2]
9726  %endif
9727    pshuflw              m0, m0, q0000
9728    punpcklqdq           m0, m0
9729    xor                  r3, r3
9730.right_loop_%3:
9731    movu     [reg_tmp+r3*2], m0
9732    add                  r3, mmsize/2
9733  %if ARCH_X86_64
9734    cmp                  r3, rightextq
9735  %else
9736    cmp                  r3, r3m
9737  %endif
9738    jl .right_loop_%3
9739%endif
9740  %if ARCH_X86_64
9741    add                dstq, dstrideq
9742    add                srcq, sstrideq
9743    dec            centerhq
9744    jg .v_loop_%3
9745  %else
9746    add                dstq, dstridem
9747    mov                  r0, sstridem
9748    add                srcm, r0
9749    sub       dword centerhm, 1
9750    jg .v_loop_%3
9751    mov                  r0, r0m ; restore r0
9752  %endif
9753%endmacro ; vloop MACRO
9754
9755    test           leftextq, leftextq
9756    jnz .need_left_ext
9757 %if ARCH_X86_64
9758    test          rightextq, rightextq
9759    jnz .need_right_ext
9760 %else
9761    cmp            leftextq, r3m ; leftextq == 0
9762    jne .need_right_ext
9763 %endif
9764    v_loop                0, 0, 0
9765    jmp .body_done
9766
9767    ;left right extensions
9768.need_left_ext:
9769 %if ARCH_X86_64
9770    test          rightextq, rightextq
9771 %else
9772    mov                  r3, r3m
9773    test                 r3, r3
9774 %endif
9775    jnz .need_left_right_ext
9776    v_loop                1, 0, 1
9777    jmp .body_done
9778
9779.need_left_right_ext:
9780    v_loop                1, 1, 2
9781    jmp .body_done
9782
9783.need_right_ext:
9784    v_loop                0, 1, 3
9785
9786.body_done:
; Register roles for the bottom/top edge-extension tail:
;   r0 = bw             r1 = x loop counter   r4 = y loop counter
;   r5 = topextq        r6 = dstq             r7 = dstrideq
;   r8 = srcq
9794%if ARCH_X86_64
9795 %define reg_dstride    dstrideq
9796%else
9797 %define reg_dstride    r2
9798%endif
9799    ;
9800    ; bottom edge extension
9801 %if ARCH_X86_64
9802    test         bottomextq, bottomextq
9803    jz .top
9804 %else
9805    xor                  r1, r1
9806    cmp                  r1, r4m
9807    je .top
9808 %endif
9809    ;
9810 %if ARCH_X86_64
9811    mov                srcq, dstq
9812    sub                srcq, dstrideq
9813    xor                  r1, r1
9814 %else
9815    mov                  r3, dstq
9816    mov         reg_dstride, dstridem
9817    sub                  r3, reg_dstride
9818    mov                srcm, r3
9819 %endif
9820    ;
9821.bottom_x_loop:
9822 %if ARCH_X86_64
9823    mova                 m0, [srcq+r1*2]
9824    lea                  r3, [dstq+r1*2]
9825    mov                  r4, bottomextq
9826 %else
9827    mov                  r3, srcm
9828    mova                 m0, [r3+r1*2]
9829    lea                  r3, [dstq+r1*2]
9830    mov                  r4, r4m
9831 %endif
9832    ;
9833.bottom_y_loop:
9834    mova               [r3], m0
9835    add                  r3, reg_dstride
9836    dec                  r4
9837    jg .bottom_y_loop
9838    add                  r1, mmsize/2
9839    cmp                  r1, bwq
9840    jl .bottom_x_loop
9841
9842.top:
9843    ; top edge extension
9844    test            topextq, topextq
9845    jz .end
9846%if ARCH_X86_64
9847    mov                srcq, reg_blkm
9848%else
9849    mov                  r3, reg_blkm
9850    mov         reg_dstride, dstridem
9851%endif
9852    mov                dstq, dstm
9853    xor                  r1, r1
9854    ;
9855.top_x_loop:
9856%if ARCH_X86_64
9857    mova                 m0, [srcq+r1*2]
9858%else
9859    mov                  r3, reg_blkm
9860    mova                 m0, [r3+r1*2]
9861%endif
9862    lea                  r3, [dstq+r1*2]
9863    mov                  r4, topextq
9864    ;
9865.top_y_loop:
9866    mova               [r3], m0
9867    add                  r3, reg_dstride
9868    dec                  r4
9869    jg .top_y_loop
9870    add                  r1, mmsize/2
9871    cmp                  r1, bwq
9872    jl .top_x_loop
9873
9874.end:
9875    RET
9876
9877%undef reg_dstride
9878%undef reg_blkm
9879%undef reg_tmp
9880
9881%macro SCRATCH 3
9882%if ARCH_X86_32
9883    mova [rsp+%3*mmsize], m%1
9884%define m%2 [rsp+%3*mmsize]
9885%else
9886    SWAP             %1, %2
9887%endif
9888%endmacro
9889
9890%if ARCH_X86_64
9891cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
9892                                       dst_w, h, src_w, dx, mx0, pxmax
9893%elif STACK_ALIGNMENT >= 16
9894cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
9895                                     dst_w, h, src_w, dx, mx0, pxmax
9896%else
9897cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
9898                                     dst_w, h, src_w, dx, mx0, pxmax
9899%endif
9900    movifnidn         dstq, dstmp
9901    movifnidn         srcq, srcmp
9902%if STACK_ALIGNMENT >= 16
9903    movifnidn       dst_wd, dst_wm
9904%endif
9905%if ARCH_X86_64
9906    movifnidn           hd, hm
9907%endif
9908    sub         dword mx0m, 4<<14
9909    sub       dword src_wm, 8
9910    movd                m4, pxmaxm
9911    movd                m7, dxm
9912    movd                m6, mx0m
9913    movd                m5, src_wm
9914    punpcklwd           m4, m4
9915    pshufd              m4, m4, q0000
9916    pshufd              m7, m7, q0000
9917    pshufd              m6, m6, q0000
9918    pshufd              m5, m5, q0000
9919    mova [rsp+16*3*ARCH_X86_32], m4
9920%if ARCH_X86_64
9921 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
9922    LEA                 r7, $$
9923 %define base r7-$$
9924%else
9925 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
9926 %define hd dword r5m
9927 %if STACK_ALIGNMENT >= 16
9928    LEA                 r6, $$
9929  %define base r6-$$
9930 %else
9931    LEA                 r4, $$
9932  %define base r4-$$
9933 %endif
9934%endif
9935%if ARCH_X86_64
9936    mova               m12, [base+pd_64]
9937    mova               m11, [base+pd_63]
9938%else
9939 %define m12 [base+pd_64]
9940 %define m11 [base+pd_63]
9941%endif
9942    pmaddwd             m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
9943    pslld               m7, 2                      ; dx*4
9944    pslld               m5, 14
9945    paddd               m6, m4                     ; mx+[0..3]*dx
9946    SCRATCH              7, 15, 0
9947    SCRATCH              6, 14, 1
9948    SCRATCH              5, 13, 2
9949    pxor                m1, m1
9950.loop_y:
9951    xor                 xd, xd
9952    mova                m0, m14            ; per-line working version of mx
9953.loop_x:
9954    pcmpgtd             m1, m0
9955    pandn               m1, m0
9956    psrad               m2, m0, 8          ; filter offset (unmasked)
9957    pcmpgtd             m3, m13, m1
9958    pand                m1, m3
9959    pandn               m3, m13
9960    por                 m1, m3
9961    psubd               m3, m0, m1         ; pshufb offset
9962    psrad               m1, 14             ; clipped src_x offset
9963    psrad               m3, 14             ; pshufb edge_emu offset
9964    pand                m2, m11            ; filter offset (masked)
9965    ; load source pixels
9966%if ARCH_X86_64
9967    movd               r8d, m1
9968    pshuflw             m1, m1, q3232
9969    movd               r9d, m1
9970    punpckhqdq          m1, m1
9971    movd              r10d, m1
9972    psrlq               m1, 32
9973    movd              r11d, m1
9974    movu                m4, [srcq+r8*2]
9975    movu                m5, [srcq+r9*2]
9976    movu                m6, [srcq+r10*2]
9977    movu                m7, [srcq+r11*2]
9978    ; if no emulation is required, we don't need to shuffle or emulate edges
9979    packssdw            m3, m3
9980    movq               r11, m3
9981    test               r11, r11
9982    jz .filter
9983    movsx               r8, r11w
9984    sar                r11, 16
9985    movsx               r9, r11w
9986    sar                r11, 16
9987    movsx              r10, r11w
9988    sar                r11, 16
9989    movu                m1, [base+resize_shuf+8+r8*2]
9990    movu                m3, [base+resize_shuf+8+r9*2]
9991    movu                m8, [base+resize_shuf+8+r10*2]
9992    movu                m9, [base+resize_shuf+8+r11*2]
9993    pshufb              m4, m1
9994    pshufb              m5, m3
9995    pshufb              m6, m8
9996    pshufb              m7, m9
9997.filter:
9998    movd               r8d, m2
9999    pshuflw             m2, m2, q3232
10000    movd               r9d, m2
10001    punpckhqdq          m2, m2
10002    movd              r10d, m2
10003    psrlq               m2, 32
10004    movd              r11d, m2
10005    movq                m8, [base+resize_filter+r8*8]
10006    movq                m2, [base+resize_filter+r9*8]
10007    pxor                m9, m9
10008    punpcklbw           m1, m9, m8
10009    punpcklbw           m3, m9, m2
10010    psraw               m1, 8
10011    psraw               m3, 8
10012    movq               m10, [base+resize_filter+r10*8]
10013    movq                m2, [base+resize_filter+r11*8]
10014    punpcklbw           m8, m9, m10
10015    punpcklbw           m9, m2
10016    psraw               m8, 8
10017    psraw               m9, 8
10018    pmaddwd             m4, m1
10019    pmaddwd             m5, m3
10020    pmaddwd             m6, m8
10021    pmaddwd             m7, m9
10022    phaddd              m4, m5
10023%else
10024    movd                r3, m1
10025    pshuflw             m1, m1, q3232
10026    movd                r1, m1
10027    punpckhqdq          m1, m1
10028    movu                m4, [srcq+r3*2]
10029    movu                m5, [srcq+r1*2]
10030    movd                r3, m1
10031    psrlq               m1, 32
10032    movd                r1, m1
10033    movu                m6, [srcq+r3*2]
10034    movu                m7, [srcq+r1*2]
10035    ; if no emulation is required, we don't need to shuffle or emulate edges
10036    pxor                m1, m1
10037    pcmpeqb             m1, m3
10038    pmovmskb           r3d, m1
10039    cmp                r3d, 0xffff
10040    je .filter
10041    movd                r3, m3
10042    movu                m1, [base+resize_shuf+8+r3*2]
10043    pshuflw             m3, m3, q3232
10044    movd                r1, m3
10045    pshufb              m4, m1
10046    movu                m1, [base+resize_shuf+8+r1*2]
10047    punpckhqdq          m3, m3
10048    movd                r3, m3
10049    pshufb              m5, m1
10050    movu                m1, [base+resize_shuf+8+r3*2]
10051    psrlq               m3, 32
10052    movd                r1, m3
10053    pshufb              m6, m1
10054    movu                m1, [base+resize_shuf+8+r1*2]
10055    pshufb              m7, m1
10056.filter:
10057    mova        [esp+4*16], m6
10058    mova        [esp+5*16], m7
10059    movd                r3, m2
10060    pshuflw             m2, m2, q3232
10061    movd                r1, m2
10062    movq                m6, [base+resize_filter+r3*8]
10063    movq                m7, [base+resize_filter+r1*8]
10064    pxor                m3, m3
10065    punpcklbw           m1, m3, m6
10066    punpcklbw           m3, m7
10067    psraw               m1, 8
10068    psraw               m3, 8
10069    pmaddwd             m4, m1
10070    pmaddwd             m5, m3
10071    punpckhqdq          m2, m2
10072    movd                r3, m2
10073    psrlq               m2, 32
10074    movd                r1, m2
10075    phaddd              m4, m5
10076    movq                m2, [base+resize_filter+r3*8]
10077    movq                m5, [base+resize_filter+r1*8]
10078    mova                m6, [esp+4*16]
10079    mova                m7, [esp+5*16]
10080    pxor                m3, m3
10081    punpcklbw           m1, m3, m2
10082    punpcklbw           m3, m5
10083    psraw               m1, 8
10084    psraw               m3, 8
10085    pmaddwd             m6, m1
10086    pmaddwd             m7, m3
10087%endif
10088    phaddd              m6, m7
10089    phaddd              m4, m6
10090    pxor                m1, m1
10091    psubd               m2, m12, m4
10092    psrad               m2, 7
10093    packssdw            m2, m2
10094    pmaxsw              m2, m1
10095    pminsw              m2, [rsp+16*3*ARCH_X86_32]
10096    movq       [dstq+xq*2], m2
10097    paddd               m0, m15
10098    add                 xd, 4
10099%if STACK_ALIGNMENT >= 16
10100    cmp                 xd, dst_wd
10101%else
10102    cmp                 xd, dst_wm
10103%endif
10104    jl .loop_x
10105    add               dstq, dst_stridemp
10106    add               srcq, src_stridemp
10107    dec                 hd
10108    jg .loop_y
10109    RET
10110