; xref: /aosp_15_r20/external/libdav1d/src/x86/cdef_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
; JMP_TABLE name, label1 [, label2, ...]
; Emit a table of 32-bit, table-relative offsets to local labels inside
; the avx2-mangled function <name>: entry i holds &name_avx2.label_i
; minus the table base. Relative dd entries (instead of absolute dq
; pointers) keep the table position-independent and half the size; the
; consumer adds the table base back before jumping/calling.
%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table ; offset of local label .%2 from table start
  %rotate 1
 %endrep
%endmacro
40
; CDEF_FILTER_JMP_TABLE wxh
; Instantiate the per-direction jump table for one cdef_filter block
; size. The 24 entries cover directions d0..d7 with two taps (k0/k1)
; each; the d6/d7 pair is prepended and the d0/d1 pair appended so that
; lookups at dir-2 and dir+2 stay inside the table without masking the
; direction index (same trick as the repeated tail of tap_table below).
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro
48
SECTION_RODATA 32

pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6 ; dword lane permutation (order 4,7,1,3,0,2,5,6); consumer not in this chunk
; blend_* masks: a byte with its high bit set (0x80) makes vpblendvb take
; that byte from the second source (the saved left/top/bottom border pixels)
blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
               dd 0x80, 0x00, 0x00
blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
               dd 0x00, 0x00
blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000
blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000, 0x0000
blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 ; 840/n for n=1..8, then 420/n for n=1..4 (direction search; users outside this chunk)
shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15 ; byte shuffle = word order {6,5,4,3,2,1,0,7}
shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 ; interleave bytes of the low and high 8-byte halves
pw_128:        times 2 dw 128
pw_2048:       times 2 dw 2048                 ; pmulhrsw x, 2048 == (x + 8) >> 4
tap_table:     ; masks for 8 bit shifts
               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
               ; weights
               db  4,  2,  3,  3,  2,  1
               ; per-direction signed byte offsets, encoded as y*16 + x;
               ; doubled they index the border px buffer (32-byte word rows)
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
               db  1 * 16 + 0,  2 * 16 + 0
               db  1 * 16 + 0,  2 * 16 - 1
               ; the last 6 are repeats of the first 6 so we don't need to & 7
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8
91
SECTION .text

; PREP_REGS w, h
; Read the filter direction (8th argument, r7m), point dirq at that
; direction's pair of entries in the w x h jump table, and rebind the
; argument registers for the main filter loop of that block size.
%macro PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m
    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea           dirq, [tableq+dirq*2*4]       ; 2 entries of 4 bytes per direction
%if %1 == 4
 %if %2 == 4
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, stride3, k
 %else
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, dst4, stride3, k
    lea          dst4q, [dstq+strideq*4]        ; lower half of the 4x8 block
 %endif
%else
  DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
              table, dir, dirjmp, top2, stride3, k
    mov             hq, -8                      ; 8x8 row counter: -8, stepped by +4 up to 0
    lea          top1q, [top1q+strideq*0]       ; top1 = first top row (no-op lea, kept for symmetry)
    lea          top2q, [top1q+strideq*1]       ; top2 = second top row
%endif
%if %1 == 4
    lea       stride3q, [strideq*3]
%endif
%endmacro
119
; LOAD_BLOCK w, h [, init_min_max=0]
; Gather the w x h block of destination pixels into m4 (rows packed into
; dword/qword lanes in the layout the per-direction handlers expect),
; set the tap counter k and clear the word sum accumulator(s). With
; init_min_max=1, m7/m8 are seeded with the source pixels as the running
; max/min used by the final clip.
; (NOTE: the previous ; min / ; max comments on m7/m8 were swapped --
; ACCUMULATE_TAP_BYTE grows m7 with pmaxub and shrinks m8 with pminub.)
%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
    pxor           m15, m15                     ; sum
%if %2 == 8
    pxor           m12, m12                     ; second sum accumulator (rows 4-7)
 %if %1 == 4
    movd           xm4, [dstq +strideq*0]
    movd           xm6, [dstq +strideq*1]
    movd           xm5, [dstq +strideq*2]
    movd           xm7, [dstq +stride3q ]
    vinserti128     m4, [dst4q+strideq*0], 1
    vinserti128     m6, [dst4q+strideq*1], 1
    vinserti128     m5, [dst4q+strideq*2], 1
    vinserti128     m7, [dst4q+stride3q ], 1
    punpckldq       m4, m6
    punpckldq       m5, m7
 %else
    movq           xm4, [dstq+strideq*0]
    movq           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq      m4, m5
%else
    movd           xm4, [dstq+strideq*0]
    movd           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
    punpckldq       m4, m5
%endif
%if %3 == 1
    mova            m7, m4                      ; max (updated with pmaxub)
    mova            m8, m4                      ; min (updated with pminub)
%endif
%endmacro
155
; ACCUMULATE_TAP_BYTE tap_offset, shift, mask, strength, mul_tap, w, h [, clip=0]
; Load the two neighbors (p0 -> m5, p1 -> m6) for the current
; (direction, tap) pair by calling the per-direction handler through the
; jump table, then add mul_tap * constrain(p - px, strength, shift) for
; both pixels into the word sums (m15, plus m12 for the upper rows when
; h == 8). constrain() is evaluated in the byte domain: |diff| via two
; saturating subtracts, the per-byte ">> shift" emulated with psrlw plus
; a mask, and the sign of diff folded into the taps with psignb before
; the pmaddubsw multiply-accumulate. With clip=1, p0/p1 also update the
; running max (m7) / min (m8).
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]      ; table-relative handler offset
    add        dirjmpq, tableq
    call       dirjmpq

%if %8 == 1
    pmaxub          m7, m5                      ; fold p0/p1 into running max/min
    pminub          m8, m5
    pmaxub          m7, m6
    pminub          m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    punpcklbw       m5, m6                      ; interleave p0/p1 bytes
    punpcklbw       m6, m4, m4                  ; px duplicated to match
    psubusb         m9, m5, m6
    psubusb         m5, m6, m5
    por             m9, m5     ; abs_diff_p01(p01 - px)
    pcmpeqb         m5, m9     ; sign mask: 0xff where p <= px
    por             m5, %5     ; negative byte where p <= px, +tap otherwise
    psignb          m6, %5, m5 ; taps signed like (p - px)
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5 ; strength - (|diff| >> shift), floored at 0
    pminub          m5, m9     ; constrain() = min(above, |diff|)
    pmaddubsw       m5, m6     ; tap * constrained diff, byte pairs summed to words
    paddw          m15, m5
%else
    psubusb         m9, m5, m4
    psubusb         m5, m4, m5
    psubusb        m11, m6, m4
    psubusb         m6, m4, m6
    por             m9, m5      ; abs_diff_p0(p0 - px)
    por            m11, m6      ; abs_diff_p1(p1 - px)
    pcmpeqb         m5, m9      ; sign masks: 0xff where p <= px
    pcmpeqb         m6, m11
    punpckhbw      m10, m9, m11
    punpcklbw       m9, m11
    por             m5, %5
    por            m11, m6, %5
    punpckhbw       m6, m5, m11
    punpcklbw       m5, m11
    psignb         m11, %5, m6
    psrlw           m6, m10, %2 ; emulate 8-bit shift
    pand            m6, %3
    psubusb         m6, %4, m6
    pminub          m6, m10
    pmaddubsw       m6, m11
    paddw          m12, m6      ; upper-row sum
    psignb         m11, %5, m5
    psrlw           m5, m9, %2  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m11
    paddw          m15, m5      ; lower-row sum
%endif
%endmacro
217
; ADJUST_PIXEL w, h, zero, pw_2048 [, clip=0]
; Round the accumulated word sums to pixel deltas and store the filtered
; w x h block. pmulhrsw with 2048 computes (x + 8) >> 4; subtracting the
; sign bit first (pcmpgtw/paddw) makes the result
; (sum - (sum < 0) + 8) >> 4, i.e. rounding toward zero. With clip=1 the
; new pixel is clamped to the [min (m8), max (m7)] range gathered during
; accumulation; with clip=0 the delta (saturated to int8 by packsswb) is
; simply added to the pixels. Note: the zero register (%3) is clobbered
; (it becomes the sign mask).
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw       m4, %3                      ; widen px to words while %3 is still zero
 %endif
    pcmpgtw         %3, m15                     ; 0xffff where sum < 0
    paddw          m15, %3                      ; sum -= (sum < 0)
    pmulhrsw       m15, %4                      ; (sum + 8) >> 4
 %if %5 == 0
    packsswb       m15, m15
    paddb           m4, m15
 %else
    paddw           m4, m15
    packuswb        m4, m4 ; clip px in [0x0,0xff]
    pminub          m4, m7                      ; cap at running max
    pmaxub          m4, m8                      ; floor at running min
 %endif
    vextracti128   xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    pcmpgtw         m6, %3, m12                 ; sign masks for both accumulators
    pcmpgtw         m5, %3, m15
    paddw          m12, m6
    paddw          m15, m5
 %if %5 == 1
    punpckhbw       m5, m4, %3                  ; widen px (high/low halves) to words
    punpcklbw       m4, %3
 %endif
    pmulhrsw       m12, %4
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m12
    paddb           m4, m15
 %else
    paddw           m5, m12
    paddw           m4, m15
    packuswb        m4, m5 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
 %if %1 == 4
    movd   [dstq +strideq*0], xm4
    movd   [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro
279
; BORDER_PREP_REGS w, h
; Edge-block variant of PREP_REGS. dirq is pointed at the signed byte
; offset pairs for this direction: tableq is expected to hold tap_table
; here (set by the caller -- confirm against the border entry path), and
; the +14 bias skips its 8 shift-mask bytes and 6 weight bytes. stkq is
; set to px, the on-stack padded word copy of the block with a 32-byte
; row stride (filled outside this macro), and m11 is zeroed for the
; rounding in BORDER_ADJUST_PIXEL.
%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m
    lea           dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov             hd, %1*%2*2/mmsize          ; number of register-sized row groups
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea           stkq, [px]
    pxor           m11, m11                     ; constant zero for the rounding step
%endmacro
297
; BORDER_LOAD_BLOCK w, h [, init_min_max=0]
; Load the current rows of the padded word buffer px (32-byte row
; stride) into m4, set the tap counter and clear the sum accumulator.
; With init_min_max=1, seed the running max (m7) / min (m8) used for the
; final clip (see the signed/unsigned note in ACCUMULATE_TAP_WORD).
%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
%if %1 == 4
    movq           xm4, [stkq+32*0]             ; 4 rows of 4 words each
    movhps         xm4, [stkq+32*1]
    movq           xm5, [stkq+32*2]
    movhps         xm5, [stkq+32*3]
    vinserti128     m4, xm5, 1
%else
    mova           xm4, [stkq+32*0]             ; px
    vinserti128     m4, [stkq+32*1], 1
%endif
    pxor           m15, m15                     ; sum
%if %3 == 1
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro
316
; ACCUMULATE_TAP_WORD tap_offset, shift, mask, strength, mul_tap, w [, clip=0]
; Border-block tap accumulation. The signed byte offset for this
; (direction, tap) pair is read through dirq; p0 sits at +off and p1 at
; -off in the padded word buffer (off*2 converts the y*16+x encoding to
; the buffer's 32-byte word rows). Differences are taken in 16 bits,
; packed to bytes with signed saturation, then constrained and
; accumulated with pmaddubsw just like the non-border path. m12 is
; expected to hold the pairing shuffle pattern (set by the caller).
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1]       ; off1
%if %6 == 4
    movq           xm5, [stkq+offq*2+32*0]      ; p0
    movq           xm6, [stkq+offq*2+32*2]
    movhps         xm5, [stkq+offq*2+32*1]
    movhps         xm6, [stkq+offq*2+32*3]
    vinserti128     m5, xm6, 1
%else
    movu           xm5, [stkq+offq*2+32*0]      ; p0
    vinserti128     m5, [stkq+offq*2+32*1], 1
%endif
    neg           offq                          ; -off1
%if %6 == 4
    movq           xm6, [stkq+offq*2+32*0]      ; p1
    movq           xm9, [stkq+offq*2+32*2]
    movhps         xm6, [stkq+offq*2+32*1]
    movhps         xm9, [stkq+offq*2+32*3]
    vinserti128     m6, xm9, 1
%else
    movu           xm6, [stkq+offq*2+32*0]      ; p1
    vinserti128     m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is a both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5                      ; max after p0
    pminuw          m8, m5                      ; min after p0
    pmaxsw          m7, m6                      ; max after p1
    pminuw          m8, m6                      ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw           m5, m4                      ; diff_p0(p0 - px)
    psubw           m6, m4                      ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb        m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb          m5, m12
    pabsb           m9, m5
    psignb         m10, %5, m5                  ; taps signed like the diffs
    psrlw           m5, m9, %2                  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5                  ; strength - (|diff| >> shift), floored at 0

    ; use unsigned min since abs diff can equal 0x80
    pminub          m5, m9
    pmaddubsw       m5, m10
    paddw          m15, m5
%endmacro
374
; BORDER_ADJUST_PIXEL w, pw_2048 [, clip=0]
; Round the sum toward zero ((sum - (sum < 0) + 8) >> 4 via pmulhrsw
; with 2048), add the delta to the pixels in m4, optionally clamp to the
; running [min (m8), max (m7)] (signed min/max here, matching the
; padding trick in ACCUMULATE_TAP_WORD), then pack to bytes and store.
%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw         m9, m11, m15                ; m11 = 0: 0xffff where sum < 0
    paddw          m15, m9                      ; sum -= (sum < 0)
    pmulhrsw       m15, %2                      ; (sum + 8) >> 4
    paddw           m4, m15
%if %3 == 1
    pminsw          m4, m7                      ; cap at running max
    pmaxsw          m4, m8                      ; floor at running min
%endif
    packuswb        m4, m4
    vextracti128   xm5, m4, 1
%if %1 == 4
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+stride3q ], xm5, 1
%else
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*1], xm5
%endif
%endmacro
396
397%macro CDEF_FILTER 2 ; w, h
398INIT_YMM avx2
399cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
400                                          pri, sec, dir, damping, edge
401    mov          edged, edgem
402    cmp          edged, 0xf
403    jne .border_block
404
405    PUSH           r11
406    PUSH           r12
407%if %2 == 4
408%assign regs_used 13
409    ALLOC_STACK   0x60, 16
410    pmovzxbw       xm0, [leftq+1]
411    vpermq          m0, m0, q0110
412    psrldq          m1, m0, 4
413    vpalignr        m2, m0, m0, 12
414    movu    [rsp+0x10], m0
415    movu    [rsp+0x28], m1
416    movu    [rsp+0x40], m2
417%elif %1 == 4
418%assign regs_used 14
419    PUSH           r13
420    ALLOC_STACK 8*2+%1*%2*1, 16
421    pmovzxwd        m0, [leftq]
422    mova    [rsp+0x10], m0
423%else
424%assign regs_used 15
425    PUSH           r13
426    PUSH           r14
427    ALLOC_STACK 8*4+%1*%2*2+32, 16
428    lea            r11, [strideq*3]
429    movu           xm4, [dstq+strideq*2]
430    pmovzxwq        m0, [leftq+0]
431    pmovzxwq        m1, [leftq+8]
432    vinserti128     m4, [dstq+r11], 1
433    pmovzxbd        m2, [leftq+1]
434    pmovzxbd        m3, [leftq+9]
435    mov       [rsp+16], botq
436    mova    [rsp+0x20], m0
437    mova    [rsp+0x40], m1
438    mova    [rsp+0x60], m2
439    mova    [rsp+0x80], m3
440    mova    [rsp+0xa0], m4
441    lea           botq, [dstq+strideq*4]
442%endif
443
444 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
445    mov       dampingd, r8m
446    xor          zerod, zerod
447    movifnidn     prid, prim
448    sub       dampingd, 31
449    movifnidn  secdmpd, secdmpm
450    test          prid, prid
451    jz .sec_only
452    movd           xm0, prid
453    lzcnt      pridmpd, prid
454    add        pridmpd, dampingd
455    cmovs      pridmpd, zerod
456    mov        [rsp+0], pridmpq                 ; pri_shift
457    test       secdmpd, secdmpd
458    jz .pri_only
459    movd           xm1, secdmpd
460    lzcnt      secdmpd, secdmpd
461    add        secdmpd, dampingd
462    mov        [rsp+8], secdmpq                 ; sec_shift
463
464 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
465    lea         tableq, [tap_table]
466    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
467    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
468
469    ; pri/sec_taps[k] [4 total]
470 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
471    vpbroadcastb    m0, xm0                     ; pri_strength
472    vpbroadcastb    m1, xm1                     ; sec_strength
473    and           prid, 1
474    lea           priq, [tableq+priq*2+8]       ; pri_taps
475    lea           secq, [tableq+12]             ; sec_taps
476
477    PREP_REGS       %1, %2
478%if %1*%2 > mmsize
479.v_loop:
480%endif
481    LOAD_BLOCK      %1, %2, 1
482.k_loop:
483    vpbroadcastb    m2, [priq+kq]                          ; pri_taps
484    vpbroadcastb    m3, [secq+kq]                          ; sec_taps
485    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
486    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
487    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
488    dec             kq
489    jge .k_loop
490
491    vpbroadcastd   m10, [pw_2048]
492    pxor            m9, m9
493    ADJUST_PIXEL    %1, %2, m9, m10, 1
494%if %1*%2 > mmsize
495    lea           dstq, [dstq+strideq*4]
496    lea          top1q, [rsp+0xa0]
497    lea          top2q, [rsp+0xb0]
498    mov           botq, [rsp+16]
499    add             hq, 4
500    jl .v_loop
501%endif
502    RET
503
504.pri_only:
505 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
506    lea         tableq, [tap_table]
507    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
508    ; pri/sec_taps[k] [4 total]
509 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
510    vpbroadcastb    m0, xm0                     ; pri_strength
511    and           prid, 1
512    lea           priq, [tableq+priq*2+8]       ; pri_taps
513    PREP_REGS       %1, %2
514    vpbroadcastd    m3, [pw_2048]
515    pxor            m1, m1
516%if %1*%2 > mmsize
517.pri_v_loop:
518%endif
519    LOAD_BLOCK      %1, %2
520.pri_k_loop:
521    vpbroadcastb    m2, [priq+kq]                       ; pri_taps
522    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
523    dec             kq
524    jge .pri_k_loop
525    ADJUST_PIXEL    %1, %2, m1, m3
526%if %1*%2 > mmsize
527    lea           dstq, [dstq+strideq*4]
528    lea          top1q, [rsp+0xa0]
529    lea          top2q, [rsp+0xb0]
530    mov           botq, [rsp+16]
531    add             hq, 4
532    jl .pri_v_loop
533%endif
534    RET
535
536.sec_only:
537 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
538    movd           xm1, secdmpd
539    lzcnt      secdmpd, secdmpd
540    add        secdmpd, dampingd
541    mov        [rsp+8], secdmpq                 ; sec_shift
542 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
543    lea         tableq, [tap_table]
544    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
545    ; pri/sec_taps[k] [4 total]
546 DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
547    vpbroadcastb    m1, xm1                     ; sec_strength
548    lea           secq, [tableq+12]             ; sec_taps
549    PREP_REGS       %1, %2
550    vpbroadcastd    m2, [pw_2048]
551    pxor            m0, m0
552%if %1*%2 > mmsize
553.sec_v_loop:
554%endif
555    LOAD_BLOCK      %1, %2
556.sec_k_loop:
557    vpbroadcastb    m3, [secq+kq]                       ; sec_taps
558    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
559    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
560    dec             kq
561    jge .sec_k_loop
562    ADJUST_PIXEL    %1, %2, m0, m2
563%if %1*%2 > mmsize
564    lea           dstq, [dstq+strideq*4]
565    lea          top1q, [rsp+0xa0]
566    lea          top2q, [rsp+0xb0]
567    mov           botq, [rsp+16]
568    add             hq, 4
569    jl .sec_v_loop
570%endif
571    RET
572
573.d0k0:
574%if %1 == 4
575 %if %2 == 4
576    vpbroadcastq    m6, [dstq+strideq*1-1]
577    vpbroadcastq   m10, [dstq+strideq*2-1]
578    movd           xm5, [topq+strideq*1+1]
579    movd           xm9, [dstq+strideq*0+1]
580    psrldq         m11, m6, 2
581    psrldq         m12, m10, 2
582    vinserti128     m6, [dstq+stride3q -1], 1
583    vinserti128    m10, [botq          -1], 1
584    vpblendd        m5, m11, 0x10
585    vpblendd        m9, m12, 0x10
586    movu           m11, [blend_4x4+16]
587    punpckldq       m6, m10
588    punpckldq       m5, m9
589    vpblendvb       m6, [rsp+gprsize+0x28], m11
590 %else
591    movd           xm5, [topq +strideq*1+1]
592    movq           xm6, [dstq +strideq*1-1]
593    movq          xm10, [dstq +stride3q -1]
594    movq          xm11, [dst4q+strideq*1-1]
595    pinsrd         xm5, [dstq +strideq*0+1], 1
596    movhps         xm6, [dstq +strideq*2-1]
597    movhps        xm10, [dst4q+strideq*0-1]
598    movhps        xm11, [dst4q+strideq*2-1]
599    psrldq         xm9, xm6, 2
600    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
601    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
602    psrldq         xm9, xm11, 2
603    psrldq        xm10, 2
604    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
605    movd           xm9, [dst4q+stride3q -1]
606    pinsrd         xm9, [botq           -1], 1
607    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
608    pmovzxbw        m9, [leftq+3]
609    vinserti128     m6, xm11, 1
610    movu           m11, [blend_4x8_0+4]
611    vinserti128     m5, xm10, 1
612    vpblendvb       m6, m9, m11
613 %endif
614%else
615    lea            r13, [blend_8x8_0+16]
616    movq           xm5, [top2q         +1]
617    vbroadcasti128 m10, [dstq+strideq*1-1]
618    vbroadcasti128 m11, [dstq+strideq*2-1]
619    movhps         xm5, [dstq+strideq*0+1]
620    vinserti128     m6, m10, [dstq+stride3q-1], 1
621    vinserti128     m9, m11, [botq         -1], 1
622    psrldq         m10, 2
623    psrldq         m11, 2
624    punpcklqdq      m6, m9
625    movu            m9, [r13+hq*2*1+16*1]
626    punpcklqdq     m10, m11
627    vpblendd        m5, m10, 0xF0
628    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
629%endif
630    ret
631.d1k0:
632.d2k0:
633.d3k0:
634%if %1 == 4
635 %if %2 == 4
636    movq           xm6, [dstq+strideq*0-1]
637    movq           xm9, [dstq+strideq*1-1]
638    vinserti128     m6, [dstq+strideq*2-1], 1
639    vinserti128     m9, [dstq+stride3q -1], 1
640    movu           m11, [rsp+gprsize+0x10]
641    pcmpeqd        m12, m12
642    psrldq          m5, m6, 2
643    psrldq         m10, m9, 2
644    psrld          m12, 24
645    punpckldq       m6, m9
646    punpckldq       m5, m10
647    vpblendvb       m6, m11, m12
648 %else
649    movq           xm6, [dstq +strideq*0-1]
650    movq           xm9, [dstq +strideq*2-1]
651    movhps         xm6, [dstq +strideq*1-1]
652    movhps         xm9, [dstq +stride3q -1]
653    movq          xm10, [dst4q+strideq*0-1]
654    movhps        xm10, [dst4q+strideq*1-1]
655    psrldq         xm5, xm6, 2
656    psrldq        xm11, xm9, 2
657    shufps         xm5, xm11, q2020
658    movq          xm11, [dst4q+strideq*2-1]
659    movhps        xm11, [dst4q+stride3q -1]
660    shufps         xm6, xm9, q2020
661    shufps         xm9, xm10, xm11, q2020
662    vinserti128     m6, xm9, 1
663    pmovzxbw        m9, [leftq+1]
664    psrldq        xm10, 2
665    psrldq        xm11, 2
666    shufps        xm10, xm11, q2020
667    vpbroadcastd   m11, [blend_4x8_0+4]
668    vinserti128     m5, xm10, 1
669    vpblendvb       m6, m9, m11
670 %endif
671%else
672    movu           xm5, [dstq+strideq*0-1]
673    movu           xm9, [dstq+strideq*1-1]
674    vinserti128     m5, [dstq+strideq*2-1], 1
675    vinserti128     m9, [dstq+stride3q -1], 1
676    movu           m10, [blend_8x8_0+16]
677    punpcklqdq      m6, m5, m9
678    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64], m10
679    psrldq          m5, 2
680    psrldq          m9, 2
681    punpcklqdq      m5, m9
682%endif
683    ret
684.d4k0:
685%if %1 == 4
686 %if %2 == 4
687    vpbroadcastq   m10, [dstq+strideq*1-1]
688    vpbroadcastq   m11, [dstq+strideq*2-1]
689    movd           xm6, [topq+strideq*1-1]
690    movd           xm9, [dstq+strideq*0-1]
691    psrldq          m5, m10, 2
692    psrldq         m12, m11, 2
693    vpblendd        m6, m10, 0x10
694    vpblendd        m9, m11, 0x10
695    movu           m10, [blend_4x4]
696    vinserti128     m5, [dstq+stride3q +1], 1
697    vinserti128    m12, [botq          +1], 1
698    punpckldq       m6, m9
699    punpckldq       m5, m12
700    vpblendvb       m6, [rsp+gprsize+0x40], m10
701 %else
702    movd           xm6, [topq +strideq*1-1]
703    movq           xm9, [dstq +strideq*1-1]
704    movq          xm10, [dstq +stride3q -1]
705    movq          xm11, [dst4q+strideq*1-1]
706    pinsrd         xm6, [dstq +strideq*0-1], 1
707    movhps         xm9, [dstq +strideq*2-1]
708    movhps        xm10, [dst4q+strideq*0-1]
709    movhps        xm11, [dst4q+strideq*2-1]
710    psrldq         xm5, xm9, 2
711    shufps         xm6, xm9, q2010
712    psrldq         xm9, xm10, 2
713    shufps         xm5, xm9, q2020
714    shufps        xm10, xm11, q2020
715    movd           xm9, [dst4q+stride3q +1]
716    vinserti128     m6, xm10, 1
717    pinsrd         xm9, [botq           +1], 1
718    psrldq        xm11, 2
719    pmovzxbw       m10, [leftq-1]
720    shufps        xm11, xm9, q1020
721    movu            m9, [blend_4x8_0]
722    vinserti128     m5, xm11, 1
723    vpblendvb       m6, m10, m9
724 %endif
725%else
726    lea            r13, [blend_8x8_0+8]
727    movq           xm6, [top2q         -1]
728    vbroadcasti128  m5, [dstq+strideq*1-1]
729    vbroadcasti128  m9, [dstq+strideq*2-1]
730    movhps         xm6, [dstq+strideq*0-1]
731    movu           m11, [r13+hq*2*1+16*1]
732    punpcklqdq     m10, m5, m9
733    vinserti128     m5, [dstq+stride3q -1], 1
734    vinserti128     m9, [botq          -1], 1
735    vpblendd        m6, m10, 0xF0
736    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
737    psrldq          m5, 2
738    psrldq          m9, 2
739    punpcklqdq      m5, m9
740%endif
741    ret
742.d5k0:
743.d6k0:
744.d7k0:
745%if %1 == 4
746 %if %2 == 4
747    movd           xm6, [topq+strideq*1  ]
748    vpbroadcastd    m5, [dstq+strideq*1  ]
749    vpbroadcastd    m9, [dstq+strideq*2  ]
750    vpblendd       xm6, [dstq+strideq*0-4], 0x2
751    vpblendd        m5, m9, 0x22
752    vpblendd        m6, m5, 0x30
753    vinserti128     m5, [dstq+stride3q   ], 1
754    vpblendd        m5, [botq         -20], 0x20
755 %else
756    movd           xm6, [topq +strideq*1]
757    movd           xm5, [dstq +strideq*1]
758    movd           xm9, [dstq +stride3q ]
759    movd          xm10, [dst4q+strideq*1]
760    movd          xm11, [dst4q+stride3q ]
761    pinsrd         xm6, [dstq +strideq*0], 1
762    pinsrd         xm5, [dstq +strideq*2], 1
763    pinsrd         xm9, [dst4q+strideq*0], 1
764    pinsrd        xm10, [dst4q+strideq*2], 1
765    pinsrd        xm11, [botq           ], 1
766    punpcklqdq     xm6, xm5
767    punpcklqdq     xm5, xm9
768    punpcklqdq     xm9, xm10
769    punpcklqdq    xm10, xm11
770    vinserti128     m6, xm9, 1
771    vinserti128     m5, xm10, 1
772 %endif
773%else
774    movq           xm6, [top2q         ]
775    movq           xm5, [dstq+strideq*1]
776    movq           xm9, [dstq+stride3q ]
777    movhps         xm6, [dstq+strideq*0]
778    movhps         xm5, [dstq+strideq*2]
779    movhps         xm9, [botq          ]
780    vinserti128     m6, xm5, 1
781    vinserti128     m5, xm9, 1
782%endif
783    ret
784.d0k1:
785%if %1 == 4
786 %if %2 == 4
787    movd           xm6, [dstq+strideq*2-2]
788    movd           xm9, [dstq+stride3q -2]
789    movd           xm5, [topq+strideq*0+2]
790    movd          xm10, [topq+strideq*1+2]
791    pinsrw         xm6, [leftq+4], 0
792    pinsrw         xm9, [leftq+6], 0
793    vinserti128     m5, [dstq+strideq*0+2], 1
794    vinserti128    m10, [dstq+strideq*1+2], 1
795    vinserti128     m6, [botq+strideq*0-2], 1
796    vinserti128     m9, [botq+strideq*1-2], 1
797    punpckldq       m5, m10
798    punpckldq       m6, m9
799 %else
800    movq           xm6, [dstq +strideq*2-2]
801    movd          xm10, [dst4q+strideq*2-2]
802    movd           xm5, [topq +strideq*0+2]
803    movq           xm9, [dst4q+strideq*0-2]
804    movhps         xm6, [dstq +stride3q -2]
805    pinsrw        xm10, [dst4q+stride3q   ], 3
806    pinsrd         xm5, [topq +strideq*1+2], 1
807    movhps         xm9, [dst4q+strideq*1-2]
808    pinsrd        xm10, [botq +strideq*0-2], 2
809    pinsrd         xm5, [dstq +strideq*0+2], 2
810    pinsrd        xm10, [botq +strideq*1-2], 3
811    pinsrd         xm5, [dstq +strideq*1+2], 3
812    shufps        xm11, xm6, xm9, q3131
813    shufps         xm6, xm9, q2020
814    movu            m9, [blend_4x8_3+8]
815    vinserti128     m6, xm10, 1
816    vinserti128     m5, xm11, 1
817    vpblendvb       m6, [rsp+gprsize+0x10+8], m9
818 %endif
819%else
820    lea            r13, [blend_8x8_1+16]
821    movq           xm6, [dstq+strideq*2-2]
822    movq           xm9, [dstq+stride3q -2]
823    movq           xm5, [top1q         +2]
824    movq          xm10, [top2q         +2]
825    movu           m11, [r13+hq*2*2+16*2]
826    vinserti128     m6, [botq+strideq*0-2], 1
827    vinserti128     m9, [botq+strideq*1-2], 1
828    vinserti128     m5, [dstq+strideq*0+2], 1
829    vinserti128    m10, [dstq+strideq*1+2], 1
830    punpcklqdq      m6, m9
831    punpcklqdq      m5, m10
832    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
833%endif
834    ret
.d1k1:
; (dir=1, k=1) tap loader, reached via the cdef_filter jump table (see
; CDEF_FILTER_JMP_TABLE in the file header): gathers the source pixels at the
; direction-1 second-tap offsets into m5/m6, splicing in left-edge (leftq),
; top (topq) and bottom (botq) pixels where the offsets leave the block.
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    vpbroadcastq    m6, [dstq+strideq*1-2]
    vpbroadcastq    m9, [dstq+strideq*2-2]
    movd           xm5, [topq+strideq*1+2]
    movd          xm10, [dstq+strideq*0+2]
    psrldq         m11, m6, 4
    psrldq         m12, m9, 4
    vpblendd        m5, m11, 0x10
    movq          xm11, [leftq+2]
    vinserti128     m6, [dstq+stride3q-2], 1
    punpckldq     xm11, xm11
    vpblendd       m10, m12, 0x10
    pcmpeqd        m12, m12
    pmovzxwd       m11, xm11
    psrld          m12, 16                     ; 0x0000ffff: selects low word of each dword
    punpckldq       m6, m9
    vpbroadcastd    m9, [botq-2]
    vpblendvb       m6, m11, m12               ; splice left-edge pixels into out-of-block bytes
    punpckldq       m5, m10
    vpblendd        m6, m9, 0x20
 %else
    ; 4x8 block: two 4-row halves (dstq / dst4q), bottom row from botq
    movd           xm5, [topq +strideq*1+2]
    movq           xm6, [dstq +strideq*1-2]
    movq           xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q -2]
    pinsrd         xm5, [dstq +strideq*0+2], 1
    movhps         xm6, [dstq +strideq*2-2]
    movhps         xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           -2], 1
    shufps         xm5, xm6, q3110
    shufps         xm6, xm9, q2020
    shufps         xm9, xm10, q3131
    shufps        xm10, xm11, q1020
    movu           m11, [blend_4x8_2+4]        ; byte mask for splicing spilled left pixels
    vinserti128     m6, xm10, 1
    vinserti128     m5, xm9, 1
    vpblendvb       m6, [rsp+gprsize+0x10+4], m11
 %endif
%else
    ; 8x8 block; left-edge pixels were pre-spilled to the stack
    lea            r13, [blend_8x8_1+16]
    movq           xm5, [top2q         +2]
    vbroadcasti128  m6, [dstq+strideq*1-2]
    vbroadcasti128  m9, [dstq+strideq*2-2]
    movhps         xm5, [dstq+strideq*0+2]
    shufps         m10, m6, m9, q2121
    vinserti128     m6, [dstq+stride3q -2], 1
    vinserti128     m9, [botq          -2], 1
    movu           m11, [r13+hq*2*1+16*1]      ; row-dependent blend mask (indexed by hq)
    vpblendd        m5, m10, 0xF0
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
%endif
    ret
.d2k1:
; (dir=2, k=1) tap loader: both tap pixels lie on the block's own rows,
; shifted horizontally — the -2 loads give the left-shifted rows (m6) and a
; psrldq by 4 bytes produces the right-shifted rows (m5).
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    movq          xm11, [leftq]
    movq           xm6, [dstq+strideq*0-2]
    movq           xm9, [dstq+strideq*1-2]
    vinserti128     m6, [dstq+strideq*2-2], 1
    vinserti128     m9, [dstq+stride3q -2], 1
    punpckldq     xm11, xm11
    psrldq          m5, m6, 4
    psrldq         m10, m9, 4
    pmovzxwd       m11, xm11
    punpckldq       m6, m9
    punpckldq       m5, m10
    pblendw         m6, m11, 0x05              ; insert left-edge pixels into word lanes 0/2
 %else
    ; 4x8 block
    movq           xm5, [dstq +strideq*0-2]
    movq           xm9, [dstq +strideq*2-2]
    movq          xm10, [dst4q+strideq*0-2]
    movq          xm11, [dst4q+strideq*2-2]
    movhps         xm5, [dstq +strideq*1-2]
    movhps         xm9, [dstq +stride3q -2]
    movhps        xm10, [dst4q+strideq*1-2]
    movhps        xm11, [dst4q+stride3q -2]
    shufps         xm6, xm5, xm9, q2020
    shufps         xm5, xm9, q3131
    shufps         xm9, xm10, xm11, q2020
    shufps        xm10, xm11, q3131
    pmovzxwd       m11, [leftq]
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    pblendw         m6, m11, 0x55
 %endif
%else
    ; 8x8 block: left pixels come from the stack spill area
    mova           m11, [rsp+gprsize+0x20+hq*8+64]
    movu           xm5, [dstq+strideq*0-2]
    movu           xm9, [dstq+strideq*1-2]
    vinserti128     m5, [dstq+strideq*2-2], 1
    vinserti128     m9, [dstq+stride3q -2], 1
    shufps          m6, m5, m9, q1010
    shufps          m5, m9, q2121
    pblendw         m6, m11, 0x11
%endif
    ret
.d3k1:
; (dir=3, k=1) tap loader: diagonal offsets — one tap set loads from the
; up/left side (-2 addresses, topq), the other from the down/right side
; (+2 addresses, botq); left/spilled pixels merged in where needed.
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    vpbroadcastq   m11, [dstq+strideq*1-2]
    vpbroadcastq   m12, [dstq+strideq*2-2]
    movd           xm6, [topq+strideq*1-2]
    movd           xm9, [dstq+strideq*0-2]
    pblendw        m11, [leftq-16+2], 0x01
    pblendw        m12, [leftq-16+4], 0x01
    pinsrw         xm9, [leftq- 0+0], 0
    psrldq          m5, m11, 4
    psrldq         m10, m12, 4
    vinserti128     m5, [dstq+stride3q +2], 1
    vinserti128    m10, [botq          +2], 1
    vpblendd        m6, m11, 0x10
    vpblendd        m9, m12, 0x10
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    ; 4x8 block
    movd           xm6, [topq +strideq*1-2]
    movq           xm5, [dstq +strideq*1-2]
    movq           xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q +2]
    pinsrw         xm6, [dstq +strideq*0  ], 3
    movhps         xm5, [dstq +strideq*2-2]
    movhps         xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           +2], 1
    shufps         xm6, xm5, q2010
    shufps         xm5, xm9, q3131
    shufps         xm9, xm10, q2020
    shufps        xm10, xm11, q1031
    movu           m11, [blend_4x8_2]          ; byte mask for splicing spilled left pixels
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-4], m11
 %endif
%else
    ; 8x8 block
    lea            r13, [blend_8x8_1+8]
    movq           xm6, [top2q         -2]
    vbroadcasti128  m5, [dstq+strideq*1-2]
    vbroadcasti128 m10, [dstq+strideq*2-2]
    movhps         xm6, [dstq+strideq*0-2]
    punpcklqdq      m9, m5, m10
    vinserti128     m5, [dstq+stride3q -2], 1
    vinserti128    m10, [botq          -2], 1
    movu           m11, [r13+hq*2*1+16*1]      ; row-dependent blend mask (indexed by hq)
    vpblendd        m6, m9, 0xF0
    shufps          m5, m10, q2121
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
%endif
    ret
.d4k1:
; (dir=4, k=1) tap loader: steeper diagonal — one tap set two rows up with
; a -2 shift (top1q/top2q rows participate), the other two rows down with a
; +2 shift (both botq rows participate).
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    movd           xm5, [dstq+strideq*2+2]
    movd          xm10, [dstq+stride3q +2]
    pblendw         m6, [leftq-16+0], 0x01
    pblendw         m9, [leftq-16+2], 0x01
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128    m10, [botq+strideq*1+2], 1
    vpblendd        m6, [topq+strideq*0-2], 0x01
    vpblendd        m9, [topq+strideq*1-2], 0x01
    punpckldq       m5, m10
    punpckldq       m6, m9
 %else
    ; 4x8 block
    movd           xm6, [topq +strideq*0-2]
    movq           xm5, [dstq +strideq*2-2]
    movq           xm9, [dst4q+strideq*0-2]
    movd          xm10, [dst4q+strideq*2+2]
    pinsrd         xm6, [topq +strideq*1-2], 1
    movhps         xm5, [dstq +stride3q -2]
    movhps         xm9, [dst4q+strideq*1-2]
    pinsrd        xm10, [dst4q+stride3q +2], 1
    pinsrd         xm6, [dstq +strideq*0-2], 2
    pinsrd        xm10, [botq +strideq*0+2], 2
    pinsrd         xm6, [dstq +strideq*1-2], 3
    pinsrd        xm10, [botq +strideq*1+2], 3
    shufps        xm11, xm5, xm9, q2020
    shufps         xm5, xm9, q3131
    movu            m9, [blend_4x8_3]          ; byte mask for splicing spilled left pixels
    vinserti128     m6, xm11, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-8], m9
 %endif
%else
    ; 8x8 block
    lea            r13, [blend_8x8_1]
    movu           m11, [r13+hq*2*2+16*2]      ; row-dependent blend mask (indexed by hq)
    movq           xm6, [top1q         -2]
    movq           xm9, [top2q         -2]
    movq           xm5, [dstq+strideq*2+2]
    movq          xm10, [dstq+stride3q +2]
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128    m10, [botq+strideq*1+2], 1
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
    punpcklqdq      m5, m10
%endif
    ret
.d5k1:
; (dir=5, k=1) tap loader: like d4 but with ±1 horizontal offsets (taps two
; rows up at x-1, two rows down at x+1); top1q/top2q and both botq rows are
; read, left-edge bytes come from leftq.
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    movd           xm6, [topq+strideq*0-1]
    movd           xm9, [topq+strideq*1-1]
    movd           xm5, [dstq+strideq*2+1]
    movd          xm10, [dstq+stride3q +1]
    pcmpeqd        m12, m12
    pmovzxbw       m11, [leftq-8+1]
    psrld          m12, 24                     ; 0x000000ff: low-byte mask per dword
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128    m10, [botq+strideq*1+1], 1
    punpckldq       m6, m9
    pxor            m9, m9
    vpblendd       m12, m9, 0x0F               ; zero the mask in the low lane only
    punpckldq       m5, m10
    vpblendvb       m6, m11, m12               ; splice left-edge pixels in
 %else
    ; 4x8 block
    movd           xm6, [topq +strideq*0-1]
    movq           xm5, [dstq +strideq*2-1]
    movq           xm9, [dst4q+strideq*0-1]
    movd          xm10, [dst4q+strideq*2+1]
    pinsrd         xm6, [topq +strideq*1-1], 1
    movhps         xm5, [dstq +stride3q -1]
    movhps         xm9, [dst4q+strideq*1-1]
    pinsrd        xm10, [dst4q+stride3q +1], 1
    pinsrd         xm6, [dstq +strideq*0-1], 2
    pinsrd        xm10, [botq +strideq*0+1], 2
    pinsrd         xm6, [dstq +strideq*1-1], 3
    pinsrd        xm10, [botq +strideq*1+1], 3
    shufps        xm11, xm5, xm9, q2020
    vinserti128     m6, xm11, 1
    pmovzxbw       m11, [leftq-3]
    psrldq         xm5, 2
    psrldq         xm9, 2
    shufps         xm5, xm9, q2020
    movu            m9, [blend_4x8_1]          ; byte mask for splicing left pixels
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m11, m9
 %endif
%else
    ; 8x8 block
    lea            r13, [blend_8x8_0]
    movu           m11, [r13+hq*2*2+16*2]      ; row-dependent blend mask (indexed by hq)
    movq           xm6, [top1q         -1]
    movq           xm9, [top2q         -1]
    movq           xm5, [dstq+strideq*2+1]
    movq          xm10, [dstq+stride3q +1]
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128    m10, [botq+strideq*1+1], 1
    punpcklqdq      m6, m9
    punpcklqdq      m5, m10
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
%endif
    ret
.d6k1:
; (dir=6, k=1) tap loader: vertical offsets only (no horizontal shift) —
; taps two rows above (topq / top1q,top2q) and two rows below (botq) each
; pixel, so no left-edge splicing is needed.
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    movd           xm6, [topq+strideq*0]
    movd           xm9, [topq+strideq*1]
    movd           xm5, [dstq+strideq*2]
    movd          xm10, [dstq+stride3q ]
    vinserti128     m6, [dstq+strideq*0], 1
    vinserti128     m9, [dstq+strideq*1], 1
    vinserti128     m5, [botq+strideq*0], 1
    vinserti128    m10, [botq+strideq*1], 1
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    ; 4x8 block
    movd           xm5, [dstq +strideq*2]
    movd           xm6, [topq +strideq*0]
    movd           xm9, [dst4q+strideq*2]
    pinsrd         xm5, [dstq +stride3q ], 1
    pinsrd         xm6, [topq +strideq*1], 1
    pinsrd         xm9, [dst4q+stride3q ], 1
    pinsrd         xm5, [dst4q+strideq*0], 2
    pinsrd         xm6, [dstq +strideq*0], 2
    pinsrd         xm9, [botq +strideq*0], 2
    pinsrd         xm5, [dst4q+strideq*1], 3
    pinsrd         xm6, [dstq +strideq*1], 3
    pinsrd         xm9, [botq +strideq*1], 3
    vinserti128     m6, xm5, 1
    vinserti128     m5, xm9, 1
 %endif
%else
    ; 8x8 block
    movq           xm5, [dstq+strideq*2]
    movq           xm9, [botq+strideq*0]
    movq           xm6, [top1q         ]
    movq          xm10, [dstq+strideq*0]
    movhps         xm5, [dstq+stride3q ]
    movhps         xm9, [botq+strideq*1]
    movhps         xm6, [top2q         ]
    movhps        xm10, [dstq+strideq*1]
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
%endif
    ret
.d7k1:
; (dir=7, k=1) tap loader: mirror of d5 — taps two rows up with a +1 shift
; and two rows down with a -1 shift; left-edge bytes inserted from leftq.
%if %1 == 4
 %if %2 == 4
    ; 4x4 block
    movd           xm5, [dstq+strideq*2-1]
    movd           xm9, [dstq+stride3q -1]
    movd           xm6, [topq+strideq*0+1]
    movd          xm10, [topq+strideq*1+1]
    pinsrb         xm5, [leftq+ 5], 0          ; left-edge pixel replaces byte 0
    pinsrb         xm9, [leftq+ 7], 0
    vinserti128     m6, [dstq+strideq*0+1], 1
    vinserti128    m10, [dstq+strideq*1+1], 1
    vinserti128     m5, [botq+strideq*0-1], 1
    vinserti128     m9, [botq+strideq*1-1], 1
    punpckldq       m6, m10
    punpckldq       m5, m9
 %else
    ; 4x8 block
    movd           xm6, [topq +strideq*0+1]
    movq           xm9, [dstq +strideq*2-1]
    movq          xm10, [dst4q+strideq*0-1]
    movd          xm11, [dst4q+strideq*2-1]
    pinsrd         xm6, [topq +strideq*1+1], 1
    movhps         xm9, [dstq +stride3q -1]
    movhps        xm10, [dst4q+strideq*1-1]
    pinsrd        xm11, [dst4q+stride3q -1], 1
    pinsrd         xm6, [dstq +strideq*0+1], 2
    pinsrd        xm11, [botq +strideq*0-1], 2
    pinsrd         xm6, [dstq +strideq*1+1], 3
    pinsrd        xm11, [botq +strideq*1-1], 3
    shufps         xm5, xm9, xm10, q2020
    vinserti128     m5, xm11, 1
    pmovzxbw       m11, [leftq+5]
    psrldq         xm9, 2
    psrldq        xm10, 2
    shufps         xm9, xm10, q2020
    movu           m10, [blend_4x8_1+8]        ; byte mask for splicing left pixels
    vinserti128     m6, xm9, 1
    vpblendvb       m5, m11, m10
 %endif
%else
    ; 8x8 block
    lea            r13, [blend_8x8_0+16]
    movq           xm5, [dstq+strideq*2-1]
    movq           xm9, [botq+strideq*0-1]
    movq           xm6, [top1q         +1]
    movq          xm10, [dstq+strideq*0+1]
    movhps         xm5, [dstq+stride3q -1]
    movhps         xm9, [botq+strideq*1-1]
    movhps         xm6, [top2q         +1]
    movhps        xm10, [dstq+strideq*1+1]
    movu           m11, [r13+hq*2*2+16*2]      ; row-dependent blend mask (indexed by hq)
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
    vpblendvb       m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
%endif
    ret
1194
.border_block:
; Edge-aware path: instead of reading neighbor pixels directly, expand the
; block into a padded 16-bit pixel buffer (px) on the stack, writing 0x8000
; into samples that fall outside the available edges (per edgeb bit flags).
 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
    RESET_STACK_STATE
    %assign stack_offset stack_offset - (regs_used - 11) * gprsize
    %assign regs_used 11
    ALLOC_STACK 2*16+(%2+4)*32, 16           ; (%2+4) rows of 32B: 2 top, %2 body, 2 bottom
%define px rsp+2*16+2*32                     ; px points at body row 0

    pcmpeqw        m14, m14
    psllw          m14, 15                  ; 0x8000 = "unavailable pixel" sentinel

    ; prepare pixel buffers - body/right
%if %1 == 4
    INIT_XMM avx2                            ; 4-wide blocks only need xmm-sized rows
%endif
%if %2 == 8
    lea          dst4q, [dstq+strideq*4]
%endif
    lea       stride3q, [strideq*3]
    test         edgeb, 2                   ; have_right
    jz .no_right
    ; have_right: rows can be loaded with overread past the right edge
    pmovzxbw        m1, [dstq+strideq*0]
    pmovzxbw        m2, [dstq+strideq*1]
    pmovzxbw        m3, [dstq+strideq*2]
    pmovzxbw        m4, [dstq+stride3q]
    mova     [px+0*32], m1
    mova     [px+1*32], m2
    mova     [px+2*32], m3
    mova     [px+3*32], m4
%if %2 == 8
    pmovzxbw        m1, [dst4q+strideq*0]
    pmovzxbw        m2, [dst4q+strideq*1]
    pmovzxbw        m3, [dst4q+strideq*2]
    pmovzxbw        m4, [dst4q+stride3q]
    mova     [px+4*32], m1
    mova     [px+5*32], m2
    mova     [px+6*32], m3
    mova     [px+7*32], m4
%endif
    jmp .body_done
.no_right:
    ; no right edge: store exactly %1 pixels per row, then mark the samples
    ; just past the right edge with the sentinel
%if %1 == 4
    movd           xm1, [dstq+strideq*0]
    movd           xm2, [dstq+strideq*1]
    movd           xm3, [dstq+strideq*2]
    movd           xm4, [dstq+stride3q]
    pmovzxbw       xm1, xm1
    pmovzxbw       xm2, xm2
    pmovzxbw       xm3, xm3
    pmovzxbw       xm4, xm4
    movq     [px+0*32], xm1
    movq     [px+1*32], xm2
    movq     [px+2*32], xm3
    movq     [px+3*32], xm4
%else
    pmovzxbw       xm1, [dstq+strideq*0]
    pmovzxbw       xm2, [dstq+strideq*1]
    pmovzxbw       xm3, [dstq+strideq*2]
    pmovzxbw       xm4, [dstq+stride3q]
    mova     [px+0*32], xm1
    mova     [px+1*32], xm2
    mova     [px+2*32], xm3
    mova     [px+3*32], xm4
%endif
    movd [px+0*32+%1*2], xm14
    movd [px+1*32+%1*2], xm14
    movd [px+2*32+%1*2], xm14
    movd [px+3*32+%1*2], xm14
%if %2 == 8
 %if %1 == 4
    movd           xm1, [dst4q+strideq*0]
    movd           xm2, [dst4q+strideq*1]
    movd           xm3, [dst4q+strideq*2]
    movd           xm4, [dst4q+stride3q]
    pmovzxbw       xm1, xm1
    pmovzxbw       xm2, xm2
    pmovzxbw       xm3, xm3
    pmovzxbw       xm4, xm4
    movq     [px+4*32], xm1
    movq     [px+5*32], xm2
    movq     [px+6*32], xm3
    movq     [px+7*32], xm4
 %else
    pmovzxbw       xm1, [dst4q+strideq*0]
    pmovzxbw       xm2, [dst4q+strideq*1]
    pmovzxbw       xm3, [dst4q+strideq*2]
    pmovzxbw       xm4, [dst4q+stride3q]
    mova     [px+4*32], xm1
    mova     [px+5*32], xm2
    mova     [px+6*32], xm3
    mova     [px+7*32], xm4
 %endif
    movd [px+4*32+%1*2], xm14
    movd [px+5*32+%1*2], xm14
    movd [px+6*32+%1*2], xm14
    movd [px+7*32+%1*2], xm14
%endif
.body_done:

    ; top
    ; Fill the two px rows above the block from topq, honoring the
    ; have_top/have_left/have_right edge flags; missing corners/rows get the
    ; 0x8000 sentinel.
    test         edgeb, 4                    ; have_top
    jz .no_top
    test         edgeb, 1                    ; have_left
    jz .top_no_left
    test         edgeb, 2                    ; have_right
    jz .top_no_right
    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
    movu  [px-2*32-%1], m1
    movu  [px-1*32-%1], m2
    jmp .top_done
.top_no_right:
    pmovzxbw        m1, [topq+strideq*0-%1]
    pmovzxbw        m2, [topq+strideq*1-%1]
    movu [px-2*32-%1*2], m1
    movu [px-1*32-%1*2], m2
    movd [px-2*32+%1*2], xm14               ; sentinel past the right edge
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.top_no_left:
    test         edgeb, 2                   ; have_right
    jz .top_no_left_right
    pmovzxbw        m1, [topq+strideq*0]
    pmovzxbw        m2, [topq+strideq*1]
    mova   [px-2*32+0], m1
    mova   [px-1*32+0], m2
    movd   [px-2*32-4], xm14                ; sentinel in the left border
    movd   [px-1*32-4], xm14
    jmp .top_done
.top_no_left_right:
%if %1 == 4
    movd           xm1, [topq+strideq*0]
    pinsrd         xm1, [topq+strideq*1], 1
    pmovzxbw       xm1, xm1
    movq   [px-2*32+0], xm1
    movhps [px-1*32+0], xm1
%else
    pmovzxbw       xm1, [topq+strideq*0]
    pmovzxbw       xm2, [topq+strideq*1]
    mova   [px-2*32+0], xm1
    mova   [px-1*32+0], xm2
%endif
    movd   [px-2*32-4], xm14
    movd   [px-1*32-4], xm14
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.no_top:
    ; no top edge at all: fill both rows above the block with the sentinel
    movu   [px-2*32-%1], m14
    movu   [px-1*32-%1], m14
.top_done:
1346
    ; left
    ; Scatter the packed left-column pixels (2 per row from leftq) into the
    ; 4-byte slot preceding each px row, or write the sentinel if have_left
    ; is not set.
    test         edgeb, 1                   ; have_left
    jz .no_left
    pmovzxbw       xm1, [leftq+ 0]
%if %2 == 8
    pmovzxbw       xm2, [leftq+ 8]
%endif
    movd   [px+0*32-4], xm1
    pextrd [px+1*32-4], xm1, 1
    pextrd [px+2*32-4], xm1, 2
    pextrd [px+3*32-4], xm1, 3
%if %2 == 8
    movd   [px+4*32-4], xm2
    pextrd [px+5*32-4], xm2, 1
    pextrd [px+6*32-4], xm2, 2
    pextrd [px+7*32-4], xm2, 3
%endif
    jmp .left_done
.no_left:
    movd   [px+0*32-4], xm14
    movd   [px+1*32-4], xm14
    movd   [px+2*32-4], xm14
    movd   [px+3*32-4], xm14
%if %2 == 8
    movd   [px+4*32-4], xm14
    movd   [px+5*32-4], xm14
    movd   [px+6*32-4], xm14
    movd   [px+7*32-4], xm14
%endif
.left_done:
1377
    ; bottom
    ; Mirror of the top-edge handling for the two px rows below the block,
    ; sourced from botq and gated by have_bottom/have_left/have_right.
 DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
    test         edgeb, 8                   ; have_bottom
    jz .no_bottom
    test         edgeb, 1                   ; have_left
    jz .bottom_no_left
    test         edgeb, 2                   ; have_right
    jz .bottom_no_right
    pmovzxbw        m1, [botq+strideq*0-(%1/2)]
    pmovzxbw        m2, [botq+strideq*1-(%1/2)]
    movu   [px+(%2+0)*32-%1], m1
    movu   [px+(%2+1)*32-%1], m2
    jmp .bottom_done
.bottom_no_right:
    pmovzxbw        m1, [botq+strideq*0-%1]
    pmovzxbw        m2, [botq+strideq*1-%1]
    movu  [px+(%2+0)*32-%1*2], m1
    movu  [px+(%2+1)*32-%1*2], m2
%if %1 == 8
    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
%endif
    movd  [px+(%2+0)*32+%1*2], xm14                ; sentinel past the right edge
    movd  [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.bottom_no_left:
    test          edgeb, 2                  ; have_right
    jz .bottom_no_left_right
    pmovzxbw        m1, [botq+strideq*0]
    pmovzxbw        m2, [botq+strideq*1]
    mova   [px+(%2+0)*32+0], m1
    mova   [px+(%2+1)*32+0], m2
    movd   [px+(%2+0)*32-4], xm14                  ; sentinel in the left border
    movd   [px+(%2+1)*32-4], xm14
    jmp .bottom_done
.bottom_no_left_right:
%if %1 == 4
    movd           xm1, [botq+strideq*0]
    pinsrd         xm1, [botq+strideq*1], 1
    pmovzxbw       xm1, xm1
    movq   [px+(%2+0)*32+0], xm1
    movhps [px+(%2+1)*32+0], xm1
%else
    pmovzxbw       xm1, [botq+strideq*0]
    pmovzxbw       xm2, [botq+strideq*1]
    mova   [px+(%2+0)*32+0], xm1
    mova   [px+(%2+1)*32+0], xm2
%endif
    movd   [px+(%2+0)*32-4], xm14
    movd   [px+(%2+1)*32-4], xm14
    movd  [px+(%2+0)*32+%1*2], xm14
    movd  [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.no_bottom:
    ; no bottom edge: fill both rows below the block with the sentinel
    movu   [px+(%2+0)*32-%1], m14
    movu   [px+(%2+1)*32-%1], m14
.bottom_done:
1434
    ; actual filter
    ; Derive the pri/sec shifts and tap pointers from the strength/damping
    ; arguments, then run the k-loop over the padded px buffer and write the
    ; filtered pixels back to dstq (pri + sec path; the pri-only/sec-only
    ; specializations follow below).
 INIT_YMM avx2
 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
    ; register to shuffle values into after packing
    vbroadcasti128 m12, [shufb_lohi]

    mov       dampingd, r8m
    xor          zerod, zerod
    movifnidn     prid, prim
    sub       dampingd, 31                      ; pre-bias: lzcnt(x) = 31 - log2(x)
    movifnidn  secdmpd, secdmpm
    test          prid, prid
    jz .border_sec_only                         ; pri_strength == 0
    movd           xm0, prid
    lzcnt      pridmpd, prid
    add        pridmpd, dampingd                ; pri_shift = damping - ulog2(pri_strength)
    cmovs      pridmpd, zerod                   ; clamp shift to >= 0
    mov        [rsp+0], pridmpq                 ; pri_shift
    test       secdmpd, secdmpd
    jz .border_pri_only                         ; sec_strength == 0
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd
    mov        [rsp+8], secdmpq                 ; sec_shift

 DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
 DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
    vpbroadcastb    m0, xm0                     ; pri_strength
    vpbroadcastb    m1, xm1                     ; sec_strength
    and           prid, 1                       ; tap set selected by strength parity
    lea           priq, [tableq+priq*2+8]       ; pri_taps
    lea           secq, [tableq+12]             ; sec_taps

    BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
    dec             kq
    jge .border_k_loop

    vpbroadcastd   m10, [pw_2048]
    BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_v_loop
%endif
    RET
1498
.border_pri_only:
; Primary-filter-only specialization (sec_strength == 0): same structure as
; the combined loop above but accumulating only the primary taps.
 DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
 DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
    vpbroadcastb    m0, xm0                     ; pri_strength
    and           prid, 1                       ; tap set selected by strength parity
    lea           priq, [tableq+priq*2+8]       ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec             kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_pri_v_loop
%endif
    RET
1527
.border_sec_only:
; Secondary-filter-only specialization (pri_strength == 0): accumulates the
; secondary taps along both secondary directions (table offsets 2*2 and 6*2),
; with no primary accumulation.
 DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd                ; sec_shift = damping - ulog2(sec_strength)
    mov        [rsp+8], secdmpq                 ; sec_shift
 DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
 DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
    vpbroadcastb    m1, xm1                     ; sec_strength
    lea           secq, [tableq+12]             ; sec_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m0, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec             kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_sec_v_loop
%endif
    RET
1561%endmacro
1562
; Instantiate the filter for each supported block size (%1 = width, %2 = height).
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
1566
INIT_YMM avx2
;-----------------------------------------------------------------------------
; cdef_dir_8bpc(const pixel *src, ptrdiff_t stride, unsigned *var)
; Computes the dominant filter direction of an 8x8 block: loads the block as
; 16-bit values centered around 0, evaluates the eight directional
; partial-sum costs, returns the index of the largest cost (0-7) in eax and
; stores the cost contrast ((best - cost[dir^4]) >> 10) to *var.
;-----------------------------------------------------------------------------
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea       stride3q, [strideq*3]
    ; load 8 rows of 8 bytes; rows are paired (0,7) (1,6) (2,5) (3,4) across
    ; the low/high 128-bit lanes of m0-m3
    movq           xm0, [srcq+strideq*0]
    movq           xm1, [srcq+strideq*1]
    movq           xm2, [srcq+strideq*2]
    movq           xm3, [srcq+stride3q ]
    lea           srcq, [srcq+strideq*4]
    vpbroadcastq    m4, [srcq+stride3q ]
    vpbroadcastq    m5, [srcq+strideq*2]
    vpblendd        m0, m4, 0xf0
    vpblendd        m1, m5, 0xf0
    vpbroadcastq    m4, [srcq+strideq*1]
    vpbroadcastq    m5, [srcq+strideq*0]
    vpblendd        m2, m4, 0xf0
    vpblendd        m3, m5, 0xf0
    pxor            m4, m4
    punpcklbw       m0, m4                  ; widen bytes to words
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
cglobal_label .main
    ; secondary entry point: caller provides the rows already widened to
    ; 16-bit in m0-m3 (NOTE(review): presumably shared with the
    ; high-bitdepth implementation — confirm against callers)
    vpbroadcastd    m4, [pw_128]
    PROLOGUE 3, 4, 15                       ; re-declare to reserve 15 vector regs
    psubw           m0, m4                  ; center pixel values (x - 128)
    psubw           m1, m4
    psubw           m2, m4
    psubw           m3, m4

    ; shuffle registers to generate partial_sum_diag[0-1] together
    vperm2i128      m7, m0, m0, 0x01        ; lane-swapped copies of m0-m3
    vperm2i128      m6, m1, m1, 0x01
    vperm2i128      m5, m2, m2, 0x01
    vperm2i128      m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw           m8, m0, m1
    paddw           m9, m2, m3
    phaddw         m10, m0, m1
    phaddw         m11, m2, m3
    paddw           m8, m9
    phaddw         m10, m11
    vextracti128   xm9, m8, 1
    vextracti128  xm11, m10, 1
    paddw          xm8, xm9                 ; partial_sum_hv[1]
    phaddw        xm10, xm11                ; partial_sum_hv[0]
    vinserti128     m8, xm10, 1
    vpbroadcastd    m9, [div_table+44]
    pmaddwd         m8, m8
    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m9, m1, 2
    psrldq         m10, m1, 14
    pslldq         m11, m2, 4
    psrldq         m12, m2, 12
    pslldq         m13, m3, 6
    psrldq         m14, m3, 10
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m4, 8
    psrldq         m12, m4, 8
    pslldq         m13, m5, 10
    psrldq         m14, m5, 6
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m6, 12
    psrldq         m12, m6, 4
    pslldq         m13, m7, 14
    psrldq         m14, m7, 2
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]
    vbroadcasti128 m12, [div_table+0]
    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
    pshufb         m10, m14
    punpckhwd      m11, m9, m10
    punpcklwd       m9, m10
    pmaddwd        m11, m11
    pmaddwd         m9, m9
    pmulld         m11, m13
    pmulld          m9, m12
    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw          m10, m0, m1
    paddw          m11, m2, m3
    paddw          m12, m4, m5
    paddw          m13, m6, m7
    phaddw          m0, m4
    phaddw          m1, m5
    phaddw          m2, m6
    phaddw          m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m4, m11, 2
    psrldq         m11, 14
    pslldq          m5, m12, 4
    psrldq         m12, 12
    pslldq          m6, m13, 6
    psrldq         m13, 10
    paddw           m4, m10
    paddw          m11, m12
    vpbroadcastd   m12, [div_table+44]
    paddw           m5, m6
    paddw          m11, m13                 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]
    paddw           m4, m5                  ; partial_sum_alt[3/2] left
    pshuflw         m5, m11, q3012
    punpckhwd       m6, m11, m4
    punpcklwd       m4, m5
    pmaddwd         m6, m6
    pmaddwd         m4, m4
    pmulld          m6, m12
    pmulld          m4, m13
    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m5, m1, 2
    psrldq          m1, 14
    pslldq          m6, m2, 4
    psrldq          m2, 12
    pslldq          m7, m3, 6
    psrldq          m3, 10
    paddw           m5, m0
    paddw           m1, m2
    paddw           m6, m7
    paddw           m1, m3                  ; partial_sum_alt[0/1] right
    paddw           m5, m6                  ; partial_sum_alt[0/1] left
    pshuflw         m0, m1, q3012
    punpckhwd       m1, m5
    punpcklwd       m5, m0
    pmaddwd         m1, m1
    pmaddwd         m5, m5
    pmulld          m1, m12
    pmulld          m5, m13
    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]

    mova           xm0, [pd_47130256+ 16]
    mova            m1, [pd_47130256]
    phaddd          m9, m8
    phaddd          m5, m4
    phaddd          m9, m5                  ; all 8 costs, one dword each
    vpermd          m0, m9                  ; cost[0-3]
    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]

    ; now find the best cost
    pmaxsd         xm2, xm0, xm1
    pshufd         xm3, xm2, q1032
    pmaxsd         xm2, xm3
    pshufd         xm3, xm2, q2301
    pmaxsd         xm2, xm3 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd          xm4, xm1, xm2
    psubd          xm3, xm0, xm2
    packssdw       xm3, xm4
    phminposuw     xm3, xm3

    ; convert idx to 32-bits
    psrld          xm3, 16
    movd           eax, xm3                 ; return value: best direction index

    ; get idx^4 complement
    vpermd          m3, m1
    psubd          xm2, xm3                 ; best cost - cost[dir^4]
    psrld          xm2, 10
    movd        [varq], xm2                 ; *var = (best - cost[dir^4]) >> 10
    RET
1770
1771%endif ; ARCH_X86_64
1772