1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32
33wiener_shufA:  db  1,  2,  7,  6,  3,  4,  9,  8,  5,  6, 11, 10,  7,  8, 13, 12
34wiener_shufB:  db  2,  3,  8,  7,  4,  5, 10,  9,  6,  7, 12, 11,  8,  9, 14, 13
35wiener_shufC:  db  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11
36wiener_shufD:  db  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12
37wiener_perm32: db  1,  9,  3, 11,  5, 13,  7, 15, 33, 41, 35, 43, 37, 45, 39, 47
38               db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
39sgr_shuf:      db 128, 1, -1,  2,132,  3, -1,  4,136,  5, -1,  6,140,  7, -1,  8
40               db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1,  0,128
41sgr_mix_perm:  db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
42r_ext_mask:    times 68 db -1
43               times  4 db  0
44wiener_x_shuf: db  0,  2, -1,  0
45wiener_x_add:  db  0,  1,127,  0
46
47pw_61448:      times 2 dw 61448
48pw_164_455:    dw 164, 455
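; 164 ~= (1 << 12) / 25 and 455 ~= (1 << 12) / 9, the fixed-point
; reciprocals used to scale b by the 5x5 and 3x3 box sizes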
49pd_m16380:     dd -16380
50pd_m4096:      dd -4096
51pd_m25:        dd -25
52pd_m9:         dd -9
53pd_34816:      dd 34816
54pd_8421376:    dd 8421376
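; 34816 = (1 << 15) + (1 << 11) and 8421376 = (1 << 23) + (1 << 15);
; rounding/bias constants for the sgr b calculation and the wiener
; vertical pass respectively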
55
56cextern sgr_x_by_x
57
58SECTION .text
59
60DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
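; t0-t6 rotate through per-row intermediate buffers on the stack; e.g. in the
; wiener filter t1-t6 hold the most recently filtered rows and t0 is the row
; currently being written (see the pointer rotation at the end of .hv)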
61
62INIT_ZMM avx512icl
63cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
64                                                    w, h, edge, flt
65    mov           fltq, r6mp
66    mov             wd, wm
67    movifnidn       hd, hm
68    mov          edged, r7m
69    vbroadcasti32x4 m6, [wiener_shufA]
70    vbroadcasti32x4 m7, [wiener_shufB]
71    mov           r10d, 0xfffe
72    vbroadcasti32x4 m8, [wiener_shufC]
73    vbroadcasti32x4 m9, [wiener_shufD]
74    kmovw           k1, r10d
75    vpbroadcastd    m0, [wiener_x_shuf]
76    vpbroadcastd    m1, [wiener_x_add]
77    mov            r10, 0xaaaaaaaaaaaaaaaa
78    vpbroadcastd   m11, [fltq+ 0]
79    vpbroadcastd   m12, [fltq+ 4]
80    kmovq           k2, r10
81    vpbroadcastd   m10, [pd_m16380]
82    packsswb       m11, m11 ; x0   x1   x0   x1
83    vpbroadcastd   m14, [fltq+16]
84    pshufb         m12, m0
85    vpbroadcastd   m15, [fltq+20]
86    paddb          m12, m1  ; x2   x3+1 x2   127
87    vpbroadcastd   m13, [pd_8421376]
88    psllw          m14, 5   ; y0 y1
89    psllw          m15, 5   ; y2 y3
90    cmp             wd, 32  ; the minimum lr unit size for chroma in 4:2:0 is 32
91    jle .w32                ; pixels, so we need a special case for small widths
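; (the main path below consumes 64 pixels per iteration using full zmm
; registers, while .w32 handles a single 32-pixel unit per row)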
92    lea             t1, [rsp+wq*2+16]
93    add           lpfq, wq
94    add           dstq, wq
95    neg             wq
96    test         edgeb, 4 ; LR_HAVE_TOP
97    jz .no_top
98    call .h_top
99    add           lpfq, strideq
100    mov             t6, t1
101    mov             t5, t1
102    add             t1, 384*2
103    call .h_top
104    lea            r10, [lpfq+strideq*4]
105    mov           lpfq, dstq
106    mov             t4, t1
107    add             t1, 384*2
108    add            r10, strideq
109    mov          [rsp], r10 ; below
110    call .h
111    mov             t3, t1
112    mov             t2, t1
113    dec             hd
114    jz .v1
115    add           lpfq, strideq
116    add             t1, 384*2
117    call .h
118    mov             t2, t1
119    dec             hd
120    jz .v2
121    add           lpfq, strideq
122    add             t1, 384*2
123    call .h
124    dec             hd
125    jz .v3
126.main:
127    lea             t0, [t1+384*2]
128.main_loop:
129    call .hv
130    dec             hd
131    jnz .main_loop
132    test         edgeb, 8 ; LR_HAVE_BOTTOM
133    jz .v3
134    mov           lpfq, [rsp]
135    call .hv_bottom
136    add           lpfq, strideq
137    call .hv_bottom
138.v1:
139    call .v
140    RET
141.no_top:
142    lea            r10, [lpfq+strideq*4]
143    mov           lpfq, dstq
144    lea            r10, [r10+strideq*2]
145    mov          [rsp], r10
146    call .h
147    mov             t6, t1
148    mov             t5, t1
149    mov             t4, t1
150    mov             t3, t1
151    mov             t2, t1
152    dec             hd
153    jz .v1
154    add           lpfq, strideq
155    add             t1, 384*2
156    call .h
157    mov             t2, t1
158    dec             hd
159    jz .v2
160    add           lpfq, strideq
161    add             t1, 384*2
162    call .h
163    dec             hd
164    jz .v3
165    lea             t0, [t1+384*2]
166    call .hv
167    dec             hd
168    jz .v3
169    add             t0, 384*8
170    call .hv
171    dec             hd
172    jnz .main
173.v3:
174    call .v
175.v2:
176    call .v
177    jmp .v1
178.h:
179    mov            r10, wq
180    test         edgeb, 1 ; LR_HAVE_LEFT
181    jz .h_extend_left
182    movd          xm16, [leftq]
183    vmovdqu32  m16{k1}, [lpfq+r10-4]
184    add          leftq, 4
185    jmp .h_main
186.h_extend_left:
187    vpbroadcastb  xm16, [lpfq+r10]   ; the masked load ensures that no exception
188    vmovdqu32  m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
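; k1 = 0xfffe, so the masked load skips dword 0: the first 4 bytes keep the
; broadcast left-edge pixel (or the pixels from [leftq]), and the masked-off
; element cannot fault even though lpfq+r10-4 may point outside the buffer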
189    jmp .h_main
190.h_top:
191    mov            r10, wq
192    test         edgeb, 1 ; LR_HAVE_LEFT
193    jz .h_extend_left
194.h_loop:
195    movu           m16, [lpfq+r10-4]
196.h_main:
197    movu           m17, [lpfq+r10+4]
198    test         edgeb, 2 ; LR_HAVE_RIGHT
199    jnz .h_have_right
200    cmp           r10d, -66
201    jl .h_have_right
202    push            r0
203    lea             r0, [r_ext_mask+65]
204    vpbroadcastb    m0, [lpfq-1]
205    vpternlogd     m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
206    vpternlogd     m17, m0, [r0+r10+8], 0xe4
207    pop             r0
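; right-edge clamp: r_ext_mask supplies 0xff for bytes still inside the image
; and 0x00 past the last column, so the ternlog (0xe4 = c ? a : b) keeps the
; loaded pixels where the mask is set and substitutes the replicated last
; pixel (m0) elsewhere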
208.h_have_right:
209    pshufb          m4, m16, m6
210    mova            m0, m10
211    vpdpbusd        m0, m4, m11
212    pshufb          m4, m16, m7
213    mova            m2, m10
214    vpdpbusd        m2, m4, m11
215    pshufb          m4, m17, m6
216    mova            m1, m10
217    vpdpbusd        m1, m4, m11
218    pshufb          m4, m17, m7
219    mova            m3, m10
220    vpdpbusd        m3, m4, m11
221    pshufb          m4, m16, m8
222    vpdpbusd        m0, m4, m12
223    pshufb         m16, m9
224    vpdpbusd        m2, m16, m12
225    pshufb          m4, m17, m8
226    vpdpbusd        m1, m4, m12
227    pshufb         m17, m9
228    vpdpbusd        m3, m17, m12
229    packssdw        m0, m2
230    packssdw        m1, m3
231    psraw           m0, 3
232    psraw           m1, 3
233    mova [t1+r10*2+ 0], m0
234    mova [t1+r10*2+64], m1
235    add            r10, 64
236    jl .h_loop
237    ret
238ALIGN function_align
239.hv:
240    add           lpfq, strideq
241    mov            r10, wq
242    test         edgeb, 1 ; LR_HAVE_LEFT
243    jz .hv_extend_left
244    movd          xm16, [leftq]
245    vmovdqu32  m16{k1}, [lpfq+r10-4]
246    add          leftq, 4
247    jmp .hv_main
248.hv_extend_left:
249    vpbroadcastb  xm16, [lpfq+r10]
250    vmovdqu32  m16{k1}, [lpfq+r10-4]
251    jmp .hv_main
252.hv_bottom:
253    mov            r10, wq
254    test         edgeb, 1 ; LR_HAVE_LEFT
255    jz .hv_extend_left
256.hv_loop:
257    movu           m16, [lpfq+r10-4]
258.hv_main:
259    movu           m17, [lpfq+r10+4]
260    test         edgeb, 2 ; LR_HAVE_RIGHT
261    jnz .hv_have_right
262    cmp           r10d, -66
263    jl .hv_have_right
264    push            r0
265    lea             r0, [r_ext_mask+65]
266    vpbroadcastb    m0, [lpfq-1]
267    vpternlogd     m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
268    vpternlogd     m17, m0, [r0+r10+8], 0xe4
269    pop             r0
270.hv_have_right:
271    pshufb          m4, m16, m6
272    mova            m0, m10
273    vpdpbusd        m0, m4, m11
274    pshufb          m4, m16, m7
275    mova            m2, m10
276    vpdpbusd        m2, m4, m11
277    pshufb          m4, m17, m6
278    mova            m1, m10
279    vpdpbusd        m1, m4, m11
280    pshufb          m4, m17, m7
281    mova            m3, m10
282    vpdpbusd        m3, m4, m11
283    pshufb          m4, m16, m8
284    vpdpbusd        m0, m4, m12
285    pshufb         m16, m9
286    vpdpbusd        m2, m16, m12
287    pshufb          m4, m17, m8
288    vpdpbusd        m1, m4, m12
289    pshufb         m17, m9
290    vpdpbusd        m3, m17, m12
291    packssdw        m0, m2
292    packssdw        m1, m3
293    psraw           m0, 3
294    psraw           m1, 3
295    mova           m16, [t4+r10*2]
296    paddw          m16, [t2+r10*2]
297    mova            m3, [t3+r10*2]
298    mova           m17, [t4+r10*2+64]
299    paddw          m17, [t2+r10*2+64]
300    mova            m5, [t3+r10*2+64]
301    punpcklwd       m4, m16, m3
302    mova            m2, m13
303    vpdpwssd        m2, m4, m15
304    punpcklwd      m18, m17, m5
305    mova            m4, m13
306    vpdpwssd        m4, m18, m15
307    punpckhwd      m16, m3
308    mova            m3, m13
309    vpdpwssd        m3, m16, m15
310    punpckhwd      m17, m5
311    mova            m5, m13
312    vpdpwssd        m5, m17, m15
313    mova           m17, [t5+r10*2]
314    paddw          m17, [t1+r10*2]
315    paddw          m16, m0, [t6+r10*2]
316    mova           m19, [t5+r10*2+64]
317    paddw          m19, [t1+r10*2+64]
318    paddw          m18, m1, [t6+r10*2+64]
319    mova [t0+r10*2+ 0], m0
320    mova [t0+r10*2+64], m1
321    punpcklwd       m0, m16, m17
322    vpdpwssd        m2, m0, m14
323    punpcklwd       m1, m18, m19
324    vpdpwssd        m4, m1, m14
325    punpckhwd      m16, m17
326    vpdpwssd        m3, m16, m14
327    punpckhwd      m18, m19
328    vpdpwssd        m5, m18, m14
329    packuswb        m2, m4
330    psrlw           m2, 8
331    vpackuswb   m2{k2}, m3, m5
332    movu    [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
333    add            r10, 64 ; function is used for chroma as well, and in some
334    jl .hv_loop            ; esoteric edge cases chroma dst pointers may only
335    mov             t6, t5 ; have a 32-byte alignment despite having a width
336    mov             t5, t4 ; larger than 32, so use an unaligned store here.
337    mov             t4, t3
338    mov             t3, t2
339    mov             t2, t1
340    mov             t1, t0
341    mov             t0, t6
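; rotate the row pointers; t0 (the next scratch row) ends up aliasing t6,
; which is fine because each [t6] entry is read before the new row is
; written to [t0] at the same offset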
342    add           dstq, strideq
343    ret
344.v:
345    mov            r10, wq
346.v_loop:
347    mova            m4, [t4+r10*2+ 0]
348    paddw           m4, [t2+r10*2+ 0]
349    mova            m1, [t3+r10*2+ 0]
350    mova            m5, [t4+r10*2+64]
351    paddw           m5, [t2+r10*2+64]
352    mova            m3, [t3+r10*2+64]
353    punpcklwd       m6, m4, m1
354    mova            m0, m13
355    vpdpwssd        m0, m6, m15
356    punpcklwd       m6, m5, m3
357    mova            m2, m13
358    vpdpwssd        m2, m6, m15
359    punpckhwd       m4, m1
360    mova            m1, m13
361    vpdpwssd        m1, m4, m15
362    punpckhwd       m5, m3
363    mova            m3, m13
364    vpdpwssd        m3, m5, m15
365    mova            m5, [t1+r10*2+ 0]
366    paddw           m4, m5, [t6+r10*2+ 0]
367    paddw           m5, [t5+r10*2+ 0]
368    mova            m7, [t1+r10*2+64]
369    paddw           m6, m7, [t6+r10*2+64]
370    paddw           m7, [t5+r10*2+64]
371    punpcklwd       m8, m4, m5
372    vpdpwssd        m0, m8, m14
373    punpcklwd       m8, m6, m7
374    vpdpwssd        m2, m8, m14
375    punpckhwd       m4, m5
376    vpdpwssd        m1, m4, m14
377    punpckhwd       m6, m7
378    vpdpwssd        m3, m6, m14
379    packuswb        m0, m2
380    psrlw           m0, 8
381    vpackuswb   m0{k2}, m1, m3
382    movu    [dstq+r10], m0
383    add            r10, 64
384    jl .v_loop
385    mov             t6, t5
386    mov             t5, t4
387    mov             t4, t3
388    mov             t3, t2
389    mov             t2, t1
390    add           dstq, strideq
391    ret
392.w32:
393    lea            r10, [r_ext_mask+73]
394    mova          ym18, [wiener_perm32]
395    lea             t1, [rsp+16]
396    sub            r10, wq
397    test         edgeb, 4 ; LR_HAVE_TOP
398    jz .w32_no_top
399    call .w32_h_top
400    add           lpfq, strideq
401    mov             t6, t1
402    mov             t5, t1
403    add             t1, 32*2
404    call .w32_h_top
405    lea             r9, [lpfq+strideq*4]
406    mov           lpfq, dstq
407    mov             t4, t1
408    add             t1, 32*2
409    add             r9, strideq
410    mov          [rsp], r9 ; below
411    call .w32_h
412    mov             t3, t1
413    mov             t2, t1
414    dec             hd
415    jz .w32_v1
416    add           lpfq, strideq
417    add             t1, 32*2
418    call .w32_h
419    mov             t2, t1
420    dec             hd
421    jz .w32_v2
422    add           lpfq, strideq
423    add             t1, 32*2
424    call .w32_h
425    dec             hd
426    jz .w32_v3
427.w32_main:
428    lea             t0, [t1+32*2]
429.w32_main_loop:
430    call .w32_hv
431    dec             hd
432    jnz .w32_main_loop
433    test         edgeb, 8 ; LR_HAVE_BOTTOM
434    jz .w32_v3
435    mov           lpfq, [rsp]
436    call .w32_hv_bottom
437    add           lpfq, strideq
438    call .w32_hv_bottom
439.w32_v1:
440    call .w32_v
441    RET
442.w32_no_top:
443    lea             r9, [lpfq+strideq*4]
444    mov           lpfq, dstq
445    lea             r9, [r9+strideq*2]
446    mov          [rsp], r9
447    call .w32_h
448    mov             t6, t1
449    mov             t5, t1
450    mov             t4, t1
451    mov             t3, t1
452    mov             t2, t1
453    dec             hd
454    jz .w32_v1
455    add           lpfq, strideq
456    add             t1, 32*2
457    call .w32_h
458    mov             t2, t1
459    dec             hd
460    jz .w32_v2
461    add           lpfq, strideq
462    add             t1, 32*2
463    call .w32_h
464    dec             hd
465    jz .w32_v3
466    lea             t0, [t1+32*2]
467    call .w32_hv
468    dec             hd
469    jz .w32_v3
470    add             t0, 32*8
471    call .w32_hv
472    dec             hd
473    jnz .w32_main
474.w32_v3:
475    call .w32_v
476.w32_v2:
477    call .w32_v
478    jmp .w32_v1
479.w32_h:
480    test         edgeb, 1 ; LR_HAVE_LEFT
481    jz .w32_h_extend_left
482    movd          xm16, [leftq]
483    vmovdqu32 ym16{k1}, [lpfq-4]
484    add          leftq, 4
485    jmp .w32_h_main
486.w32_h_extend_left:
487    vpbroadcastb  xm16, [lpfq]   ; the masked load ensures that no exception
488    vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
489    jmp .w32_h_main
490.w32_h_top:
491    test         edgeb, 1 ; LR_HAVE_LEFT
492    jz .w32_h_extend_left
493    movu          ym16, [lpfq-4]
494.w32_h_main:
495    vinserti32x8   m16, [lpfq+4], 1
496    test         edgeb, 2 ; LR_HAVE_RIGHT
497    jnz .w32_h_have_right
498    vpbroadcastb    m0, [lpfq+wq-1]
499    movu          ym17, [r10-8]
500    vinserti32x8   m17, [r10+0], 1
501    vpternlogd     m16, m0, m17, 0xe4 ; c ? a : b
502.w32_h_have_right:
503    pshufb          m2, m16, m6
504    mova            m0, m10
505    vpdpbusd        m0, m2, m11
506    pshufb          m2, m16, m7
507    mova            m1, m10
508    vpdpbusd        m1, m2, m11
509    pshufb          m2, m16, m8
510    vpdpbusd        m0, m2, m12
511    pshufb         m16, m9
512    vpdpbusd        m1, m16, m12
513    packssdw        m0, m1
514    psraw           m0, 3
515    mova          [t1], m0
516    ret
517.w32_hv:
518    add           lpfq, strideq
519    test         edgeb, 1 ; LR_HAVE_LEFT
520    jz .w32_hv_extend_left
521    movd          xm16, [leftq]
522    vmovdqu32 ym16{k1}, [lpfq-4]
523    add          leftq, 4
524    jmp .w32_hv_main
525.w32_hv_extend_left:
526    vpbroadcastb  xm16, [lpfq]
527    vmovdqu32 ym16{k1}, [lpfq-4]
528    jmp .w32_hv_main
529.w32_hv_bottom:
530    test         edgeb, 1 ; LR_HAVE_LEFT
531    jz .w32_hv_extend_left
532    movu          ym16, [lpfq-4]
533.w32_hv_main:
534    vinserti32x8   m16, [lpfq+4], 1
535    test         edgeb, 2 ; LR_HAVE_RIGHT
536    jnz .w32_hv_have_right
537    vpbroadcastb    m0, [lpfq+wq-1]
538    movu          ym17, [r10-8]
539    vinserti32x8   m17, [r10+0], 1
540    vpternlogd     m16, m0, m17, 0xe4
541.w32_hv_have_right:
542    mova            m3, [t4]
543    paddw           m3, [t2]
544    mova            m2, [t3]
545    pshufb          m4, m16, m6
546    mova            m0, m10
547    vpdpbusd        m0, m4, m11
548    pshufb          m4, m16, m7
549    mova            m5, m10
550    vpdpbusd        m5, m4, m11
551    punpcklwd       m4, m3, m2
552    mova            m1, m13
553    vpdpwssd        m1, m4, m15
554    punpckhwd       m3, m2
555    mova            m2, m13
556    vpdpwssd        m2, m3, m15
557    pshufb          m4, m16, m8
558    vpdpbusd        m0, m4, m12
559    pshufb         m16, m9
560    vpdpbusd        m5, m16, m12
561    packssdw        m0, m5
562    psraw           m0, 3
563    mova            m4, [t5]
564    paddw           m4, [t1]
565    paddw           m3, m0, [t6]
566    mova          [t0], m0
567    punpcklwd       m0, m3, m4
568    vpdpwssd        m1, m0, m14
569    punpckhwd       m3, m4
570    vpdpwssd        m2, m3, m14
571    packuswb        m1, m2
572    vpermb         m16, m18, m1
573    mova        [dstq], ym16
574    mov             t6, t5
575    mov             t5, t4
576    mov             t4, t3
577    mov             t3, t2
578    mov             t2, t1
579    mov             t1, t0
580    mov             t0, t6
581    add           dstq, strideq
582    ret
583.w32_v:
584    mova            m2, [t4]
585    paddw           m2, [t2]
586    mova            m1, [t3]
587    mova            m4, [t1]
588    paddw           m3, m4, [t6]
589    paddw           m4, [t5]
590    punpcklwd       m5, m2, m1
591    mova            m0, m13
592    vpdpwssd        m0, m5, m15
593    punpckhwd       m2, m1
594    mova            m1, m13
595    vpdpwssd        m1, m2, m15
596    punpcklwd       m2, m3, m4
597    vpdpwssd        m0, m2, m14
598    punpckhwd       m3, m4
599    vpdpwssd        m1, m3, m14
600    packuswb        m0, m1
601    vpermb         m16, m18, m0
602    mova        [dstq], ym16
603    mov             t6, t5
604    mov             t5, t4
605    mov             t4, t3
606    mov             t3, t2
607    mov             t2, t1
608    add           dstq, strideq
609    ret
610
611cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
612                                                   w, h, edge, params
613    mov        paramsq, r6mp
614    mov             wd, wm
615    mov             hd, hm
616    mov          edged, r7m
617    vbroadcasti32x4 m5, [sgr_shuf+1]
618    add           lpfq, wq
619    vbroadcasti32x4 m6, [sgr_shuf+9]
620    add           dstq, wq
621    vbroadcasti32x4 m7, [sgr_shuf+3]
622    lea             t3, [rsp+wq*4+16+416*12]
623    vbroadcasti32x4 m8, [sgr_shuf+7]
624    pxor            m4, m4
625    vpbroadcastd    m9, [pd_m25]
626    vpsubd         m11, m4, [paramsq+0] {1to16} ; -s0
627    vpbroadcastw   m15, [paramsq+8]             ; w0
628    lea             t1, [rsp+wq*2+20]
629    vpbroadcastd   m10, [pw_164_455]
630    neg             wq
631    vpbroadcastd   m12, [pw_61448]              ; (15 << 12) + (1 << 3)
632    mov           r10d, 0xfe
633    vpbroadcastd   m13, [pd_m4096]
634    kmovb           k1, r10d
635    vpbroadcastd   m14, [pd_34816]              ; (1 << 11) + (1 << 15)
636    mov            r10, 0x3333333333333333
637    mova           m18, [sgr_x_by_x+64*0]
638    kmovq           k2, r10
639    mova           m19, [sgr_x_by_x+64*1]
640    lea            r12, [r_ext_mask+75]
641    mova           m20, [sgr_x_by_x+64*2]
642    psllw          m15, 4
643    mova           m21, [sgr_x_by_x+64*3]
644    lea            r10, [lpfq+strideq*4]
645    mova          ym22, [sgr_shuf]
646    add            r10, strideq
647    mov          [rsp], r10 ; below
648    test         edgeb, 4 ; LR_HAVE_TOP
649    jz .no_top
650    call .h_top
651    add           lpfq, strideq
652    mov             t2, t1
653    call .top_fixup
654    add             t1, 416*6
655    call .h_top
656    lea            r10, [lpfq+strideq*4]
657    mov           lpfq, dstq
658    add            r10, strideq
659    mov          [rsp], r10 ; below
660    mov             t0, t2
661    dec             hd
662    jz .height1
663    or           edged, 16
664    call .h
665.main:
666    add           lpfq, strideq
667    call .hv
668    call .prep_n
669    sub             hd, 2
670    jl .extend_bottom
671.main_loop:
672    add           lpfq, strideq
673    test            hd, hd
674    jz .odd_height
675    call .h
676    add           lpfq, strideq
677    call .hv
678    call .n0
679    call .n1
680    sub             hd, 2
681    jge .main_loop
682    test         edgeb, 8 ; LR_HAVE_BOTTOM
683    jz .extend_bottom
684    mov           lpfq, [rsp]
685    call .h_top
686    add           lpfq, strideq
687    call .hv_bottom
688.end:
689    call .n0
690    call .n1
691.end2:
692    RET
693.height1:
694    call .hv
695    call .prep_n
696    jmp .odd_height_end
697.odd_height:
698    call .hv
699    call .n0
700    call .n1
701.odd_height_end:
702    call .v
703    call .n0
704    jmp .end2
705.extend_bottom:
706    call .v
707    jmp .end
708.no_top:
709    lea            r10, [lpfq+strideq*4]
710    mov           lpfq, dstq
711    lea            r10, [r10+strideq*2]
712    mov          [rsp], r10
713    call .h
714    lea             t2, [t1+416*6]
715    call .top_fixup
716    dec             hd
717    jz .no_top_height1
718    or           edged, 16
719    mov             t0, t1
720    mov             t1, t2
721    jmp .main
722.no_top_height1:
723    call .v
724    call .prep_n
725    jmp .odd_height_end
726.h: ; horizontal boxsum
727    lea            r10, [wq-2]
728    test         edgeb, 1 ; LR_HAVE_LEFT
729    jz .h_extend_left
730    movd          xm17, [leftq]
731    vmovdqu32 ym17{k1}, [lpfq+wq-4]
732    add          leftq, 4
733    jmp .h_main
734.h_extend_left:
735    vpbroadcastb  xm17, [lpfq+wq]
736    vmovdqu32 ym17{k1}, [lpfq+wq-4]
737    jmp .h_main
738.h_top:
739    lea            r10, [wq-2]
740    test         edgeb, 1 ; LR_HAVE_LEFT
741    jz .h_extend_left
742.h_loop:
743    movu          ym17, [lpfq+r10-2]
744.h_main:
745    vinserti32x8   m17, [lpfq+r10+6], 1
746    test         edgeb, 2 ; LR_HAVE_RIGHT
747    jnz .h_have_right
748    cmp           r10d, -34
749    jl .h_have_right
750    vpbroadcastb    m0, [lpfq-1]
751    movu          ym16, [r12+r10-8]
752    vinserti32x8   m16, [r12+r10+0], 1
753    vpternlogd     m17, m0, m16, 0xe4
754.h_have_right:
755    pshufb          m3, m17, m5
756    pmullw          m2, m3, m3
757    pshufb          m1, m17, m6
758    paddw           m0, m3, m1
759    shufps          m3, m1, q2121
760    paddw           m0, m3
761    punpcklwd      m16, m3, m1
762    punpckhwd       m3, m1
763    punpcklwd       m1, m2, m4
764    vpdpwssd        m1, m16, m16
765    punpckhwd       m2, m4
766    vpdpwssd        m2, m3, m3
767    pshufb         m16, m17, m7
768    paddw           m0, m16
769    pshufb         m17, m8
770    paddw           m0, m17              ; sum
771    punpcklwd       m3, m16, m17
772    vpdpwssd        m1, m3, m3           ; sumsq
773    punpckhwd      m16, m17
774    vpdpwssd        m2, m16, m16
775    test         edgeb, 16 ; y > 0
776    jz .h_loop_end
777    paddw           m0, [t1+r10*2+416*0]
778    paddd           m1, [t1+r10*2+416*2]
779    paddd           m2, [t1+r10*2+416*4]
780.h_loop_end:
781    mova [t1+r10*2+416*0], m0
782    mova [t1+r10*2+416*2], m1
783    mova [t1+r10*2+416*4], m2
784    add            r10, 32
785    jl .h_loop
786    ret
787.top_fixup:
788    lea            r10, [wq-2]
789.top_fixup_loop: ; the sums of the first row need to be doubled
790    mova            m0, [t1+r10*2+416*0]
791    mova            m1, [t1+r10*2+416*2]
792    mova            m2, [t1+r10*2+416*4]
793    paddw           m0, m0
794    paddd           m1, m1
795    paddd           m2, m2
796    mova [t2+r10*2+416*0], m0
797    mova [t2+r10*2+416*2], m1
798    mova [t2+r10*2+416*4], m2
799    add            r10, 32
800    jl .top_fixup_loop
801    ret
802ALIGN function_align
803.hv: ; horizontal boxsum + vertical boxsum + ab
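; Roughly what this computes per pixel, as a C-style sketch (names are
; descriptive, not taken from the C sources):
;   p = 25 * sumsq - sum * sum;           // variance of the 5x5 box
;   z = (p * s + (1 << 19)) >> 20;        // then clamped to 255
;   x = sgr_x_by_x[z];
;   b = (x * sum * 164 + rounding) >> 12; // 164 ~= (1 << 12) / 25
; p is kept negated so sum^2 can be accumulated straight into -25*sumsq with
; a single vpdpwssd; x and b are then packed into one dword as x | (b << 12).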
804    lea            r10, [wq-2]
805    test         edgeb, 1 ; LR_HAVE_LEFT
806    jz .hv_extend_left
807    movd          xm17, [leftq]
808    vmovdqu32 ym17{k1}, [lpfq+wq-4]
809    add          leftq, 4
810    jmp .hv_main
811.hv_extend_left:
812    vpbroadcastb  xm17, [lpfq+wq]
813    vmovdqu32 ym17{k1}, [lpfq+wq-4]
814    jmp .hv_main
815.hv_bottom:
816    lea            r10, [wq-2]
817    test         edgeb, 1 ; LR_HAVE_LEFT
818    jz .hv_extend_left
819.hv_loop:
820    movu          ym17, [lpfq+r10-2]
821.hv_main:
822    vinserti32x8   m17, [lpfq+r10+6], 1
823    test         edgeb, 2 ; LR_HAVE_RIGHT
824    jnz .hv_have_right
825    cmp           r10d, -34
826    jl .hv_have_right
827    vpbroadcastb    m0, [lpfq-1]
828    movu          ym16, [r12+r10-8]
829    vinserti32x8   m16, [r12+r10+0], 1
830    vpternlogd     m17, m0, m16, 0xe4
831.hv_have_right:
832    pshufb          m1, m17, m5
833    pmullw          m3, m1, m1
834    pshufb          m2, m17, m6
835    paddw           m0, m1, m2
836    shufps          m1, m2, q2121
837    paddw           m0, m1
838    punpcklwd      m16, m1, m2
839    punpckhwd       m1, m2
840    punpcklwd       m2, m3, m4
841    vpdpwssd        m2, m16, m16
842    punpckhwd       m3, m4
843    vpdpwssd        m3, m1, m1
844    pshufb         m16, m17, m7
845    paddw           m0, m16
846    pshufb         m17, m8
847    paddw           m0, m17              ; h sum
848    punpcklwd       m1, m16, m17
849    vpdpwssd        m2, m1, m1           ; h sumsq
850    punpckhwd      m16, m17
851    vpdpwssd        m3, m16, m16
852    paddw           m1, m0, [t1+r10*2+416*0]
853    paddd          m16, m2, [t1+r10*2+416*2]
854    paddd          m17, m3, [t1+r10*2+416*4]
855    test            hd, hd
856    jz .hv_last_row
857.hv_main2:
858    paddd          m16, [t2+r10*2+416*2] ; hv sumsq
859    paddd          m17, [t2+r10*2+416*4]
860    paddw           m1, [t2+r10*2+416*0] ; hv sum
861    mova [t0+r10*2+416*2], m2
862    mova [t0+r10*2+416*4], m3
863    mova [t0+r10*2+416*0], m0
864    pmulld         m16, m9               ; -a * 25
865    pmulld         m17, m9
866    punpcklwd       m0, m1, m4           ; b
867    vpdpwssd       m16, m0, m0           ; -p
868    punpckhwd       m1, m4
869    vpdpwssd       m17, m1, m1
870    pmaddwd         m0, m10              ; b * 164
871    pmaddwd         m1, m10
872    pmulld         m16, m11              ; p * s
873    pmulld         m17, m11
874    vpalignr   m17{k2}, m16, m16, 2
875    mova           m16, m20
876    paddusw        m17, m12
877    psraw          m17, 4                ; min(z, 255) - 256
878    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
879    vpmovb2m        k3, m17
880    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
881    vmovdqu8   m17{k3}, m16              ; x
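; the 256-byte sgr_x_by_x table is split across m18-m21; each word of m17
; holds min(z, 255) - 256, whose low byte is z and whose per-byte sign bit
; selects between the low-half lookup (vpermi2b) and the high-half lookup
; (vpermt2b) via k3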
882    pandn          m16, m13, m17
883    psrld          m17, 16
884    pmulld          m0, m16
885    pmulld          m1, m17
886    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
887    paddd           m1, m14
888    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
889    vpternlogd     m17, m1, m13, 0xd8
890    mova          [t3+r10*4+  8], m16    ; The neighbor calculations require
891    mova          [t3+r10*4+ 24], xm17   ; 13 bits for a and 21 bits for b.
892    vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
893    mova          [t3+r10*4+ 72], m17    ; that gets us most of the way.
894    vextracti128  [t3+r10*4+ 72], ym16, 1
895    vextracti32x4 [t3+r10*4+104], m16, 3
896    add            r10, 32
897    jl .hv_loop
898    mov             t2, t1
899    mov             t1, t0
900    mov             t0, t2
901    ret
902.hv_last_row: ; esoteric edge case for odd heights
903    mova [t1+r10*2+416*0], m1
904    paddw              m1, m0
905    mova [t1+r10*2+416*2], m16
906    paddd             m16, m2
907    mova [t1+r10*2+416*4], m17
908    paddd             m17, m3
909    jmp .hv_main2
910.v: ; vertical boxsum + ab
911    lea            r10, [wq-2]
912.v_loop:
913    mova            m2, [t1+r10*2+416*2]
914    paddd          m16, m2, [t2+r10*2+416*2]
915    mova            m3, [t1+r10*2+416*4]
916    paddd          m17, m3, [t2+r10*2+416*4]
917    paddd           m2, m2
918    paddd           m3, m3
919    paddd          m16, m2               ; hv sumsq
920    paddd          m17, m3
921    pmulld         m16, m9               ; -a * 25
922    pmulld         m17, m9
923    mova            m0, [t1+r10*2+416*0]
924    paddw           m1, m0, [t2+r10*2+416*0]
925    paddw           m0, m0
926    paddw           m1, m0               ; hv sum
927    punpcklwd       m0, m1, m4           ; b
928    vpdpwssd       m16, m0, m0           ; -p
929    punpckhwd       m1, m4
930    vpdpwssd       m17, m1, m1
931    pmaddwd         m0, m10              ; b * 164
932    pmaddwd         m1, m10
933    pmulld         m16, m11              ; p * s
934    pmulld         m17, m11
935    vpalignr   m17{k2}, m16, m16, 2
936    mova           m16, m20
937    paddusw        m17, m12
938    psraw          m17, 4                ; min(z, 255) - 256
939    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
940    vpmovb2m        k3, m17
941    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
942    vmovdqu8   m17{k3}, m16              ; x
943    pandn          m16, m13, m17
944    psrld          m17, 16
945    pmulld          m0, m16
946    pmulld          m1, m17
947    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
948    paddd           m1, m14
949    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
950    vpternlogd     m17, m1, m13, 0xd8
951    mova          [t3+r10*4+  8], m16
952    mova          [t3+r10*4+ 24], xm17
953    vextracti32x4 [t3+r10*4+ 56], m17, 2
954    mova          [t3+r10*4+ 72], m17
955    vextracti128  [t3+r10*4+ 72], ym16, 1
956    vextracti32x4 [t3+r10*4+104], m16, 3
957    add            r10, 32
958    jl .v_loop
959    ret
960.prep_n: ; initial neighbor setup
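; each a/b dword is weighted 5,6,5 horizontally: with sum3 = l+c+r, the code
; computes 4*sum3 + (sum3 + c) = 5*l + 6*c + 5*r; .n0 then adds the matching
; sums of the next ab row to complete the two-row 565 weighting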
961    mov            r10, wq
962.prep_n_loop:
963    movu            m0, [t3+r10*4+ 4]
964    movu            m1, [t3+r10*4+68]
965    paddd           m2, m0, [t3+r10*4+ 0]
966    paddd           m3, m1, [t3+r10*4+64]
967    paddd           m2, [t3+r10*4+ 8]
968    paddd           m3, [t3+r10*4+72]
969    paddd           m0, m2
970    pslld           m2, 2
971    paddd           m1, m3
972    pslld           m3, 2
973    paddd           m2, m0                ; ab 565
974    paddd           m3, m1
975    pandn           m0, m13, m2           ; a
976    psrld           m2, 12                ; b
977    pandn           m1, m13, m3
978    psrld           m3, 12
979    mova [t3+r10*4+416*4+ 0], m0
980    mova [t3+r10*4+416*8+ 0], m2
981    mova [t3+r10*4+416*4+64], m1
982    mova [t3+r10*4+416*8+64], m3
983    add            r10, 32
984    jl .prep_n_loop
985    ret
986ALIGN function_align
987.n0: ; neighbor + output (even rows)
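; the current and previous 565-weighted a/b rows are summed, then the
; per-pixel correction (b - a*src + (1 << 8)) >> 9 is scaled by w0 (kept
; pre-shifted in m15 so pmulhrsw can apply it) and added back onto the
; source pixel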
988    mov            r10, wq
989.n0_loop:
990    movu           m16, [t3+r10*4+ 4]
991    movu           m17, [t3+r10*4+68]
992    paddd           m0, m16, [t3+r10*4+ 0]
993    paddd           m1, m17, [t3+r10*4+64]
994    paddd           m0, [t3+r10*4+ 8]
995    paddd           m1, [t3+r10*4+72]
996    paddd          m16, m0
997    pslld           m0, 2
998    paddd          m17, m1
999    pslld           m1, 2
1000    paddd           m0, m16
1001    paddd           m1, m17
1002    pandn          m16, m13, m0
1003    psrld           m0, 12
1004    pandn          m17, m13, m1
1005    psrld           m1, 12
1006    paddd           m2, m16, [t3+r10*4+416*4+ 0] ; a
1007    paddd           m3, m17, [t3+r10*4+416*4+64]
1008    mova [t3+r10*4+416*4+ 0], m16
1009    mova [t3+r10*4+416*4+64], m17
1010    paddd          m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
1011    paddd          m17, m1, [t3+r10*4+416*8+64]
1012    mova [t3+r10*4+416*8+ 0], m0
1013    mova [t3+r10*4+416*8+64], m1
1014    pmovzxbd        m0, [dstq+r10+ 0]
1015    pmovzxbd        m1, [dstq+r10+16]
1016    pmaddwd         m2, m0                      ; a * src
1017    pmaddwd         m3, m1
1018    packssdw        m0, m1
1019    psubd          m16, m2                      ; b - a * src + (1 << 8)
1020    psubd          m17, m3
1021    psrad          m16, 9
1022    psrad          m17, 9
1023    packssdw       m16, m17
1024    pmulhrsw       m16, m15
1025    paddw          m16, m0
1026    packuswb       m16, m16
1027    vpermd         m16, m22, m16
1028    mova    [dstq+r10], ym16
1029    add            r10, 32
1030    jl .n0_loop
1031    add           dstq, strideq
1032    ret
1033ALIGN function_align
1034.n1: ; neighbor + output (odd rows)
1035    mov            r10, wq
1036.n1_loop:
1037    pmovzxbd        m0, [dstq+r10+ 0]
1038    pmovzxbd        m1, [dstq+r10+16]
1039    pmaddwd         m2, m0, [t3+r10*4+416*4+ 0] ; a * src
1040    pmaddwd         m3, m1, [t3+r10*4+416*4+64]
1041    mova           m16, [t3+r10*4+416*8+ 0]     ; b + (1 << 7)
1042    mova           m17, [t3+r10*4+416*8+64]
1043    packssdw        m0, m1
1044    psubd          m16, m2                      ; b - a * src + (1 << 7)
1045    psubd          m17, m3
1046    psrad          m16, 8
1047    psrad          m17, 8
1048    packssdw       m16, m17
1049    pmulhrsw       m16, m15
1050    paddw          m16, m0
1051    packuswb       m16, m16
1052    vpermd         m16, m22, m16
1053    mova    [dstq+r10], ym16
1054    add            r10, 32
1055    jl .n1_loop
1056    add           dstq, strideq
1057    ret
1058
1059cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
1060                                                    w, h, edge, params
1061    mov        paramsq, r6mp
1062    mov             wd, wm
1063    movifnidn       hd, hm
1064    mov          edged, r7m
1065    vbroadcasti32x4 m5, [sgr_shuf+3]
1066    add           lpfq, wq
1067    vbroadcasti32x4 m6, [sgr_shuf+5]
1068    add           dstq, wq
1069    vbroadcasti32x4 m7, [sgr_shuf+7]
1070    pxor            m4, m4
1071    vpbroadcastd    m8, [pd_m9]
1072    vpsubd         m11, m4, [paramsq+4] {1to16} ; -s1
1073    vpbroadcastw   m15, [paramsq+10]            ; w1
1074    lea             t1, [rsp+wq*2+20]
1075    vpbroadcastd   m10, [pw_164_455]
1076    lea             t3, [rsp+wq*4+16+416*12]
1077    vpbroadcastd   m12, [pw_61448]              ; (15 << 12) + (1 << 3)
1078    neg             wq
1079    vpbroadcastd   m13, [pd_m4096]
1080    mov           r10d, 0xfe
1081    vpbroadcastd   m14, [pd_34816]              ; (1 << 11) + (1 << 15)
1082    kmovb           k1, r10d
1083    mova           m18, [sgr_x_by_x+64*0]
1084    mov            r10, 0x3333333333333333
1085    mova           m19, [sgr_x_by_x+64*1]
1086    kmovq           k2, r10
1087    mova           m20, [sgr_x_by_x+64*2]
1088    psllw          m15, 4
1089    mova           m21, [sgr_x_by_x+64*3]
1090    lea            r14, [r_ext_mask+75]
1091    mova           ym9, [sgr_shuf]
1092    test         edgeb, 4 ; LR_HAVE_TOP
1093    jz .no_top
1094    call .h_top
1095    add           lpfq, strideq
1096    mov             t2, t1
1097    add             t1, 416*6
1098    call .h_top
1099    lea             t4, [lpfq+strideq*4]
1100    mov           lpfq, dstq
1101    add             t4, strideq
1102    mov          [rsp], t4 ; below
1103    mov             t0, t2
1104    call .hv
1105.main:
1106    mov             t5, t3
1107    add             t3, 416*4
1108    dec             hd
1109    jz .height1
1110    add           lpfq, strideq
1111    call .hv
1112    call .prep_n
1113    dec             hd
1114    jz .extend_bottom
1115.main_loop:
1116    add           lpfq, strideq
1117    call .hv
1118    call .n
1119    dec             hd
1120    jnz .main_loop
1121    test         edgeb, 8 ; LR_HAVE_BOTTOM
1122    jz .extend_bottom
1123    mov           lpfq, [rsp]
1124    call .hv_bottom
1125    call .n
1126    add           lpfq, strideq
1127    call .hv_bottom
1128.end:
1129    call .n
1130    RET
1131.height1:
1132    call .v
1133    call .prep_n
1134    mov             t2, t1
1135    call .v
1136    jmp .end
1137.extend_bottom:
1138    call .v
1139    call .n
1140    mov             t2, t1
1141    call .v
1142    jmp .end
1143.no_top:
1144    lea             t4, [lpfq+strideq*4]
1145    mov           lpfq, dstq
1146    lea             t4, [t4+strideq*2]
1147    mov          [rsp], t4
1148    call .h
1149    lea             t0, [t1+416*6]
1150    mov             t2, t1
1151    call .v
1152    jmp .main
1153.h: ; horizontal boxsum
1154    lea            r10, [wq-2]
1155    test         edgeb, 1 ; LR_HAVE_LEFT
1156    jz .h_extend_left
1157    movd          xm17, [leftq]
1158    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1159    add          leftq, 4
1160    jmp .h_main
1161.h_extend_left:
1162    vpbroadcastb  xm17, [lpfq+wq]
1163    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1164    jmp .h_main
1165.h_top:
1166    lea            r10, [wq-2]
1167    test         edgeb, 1 ; LR_HAVE_LEFT
1168    jz .h_extend_left
1169.h_loop:
1170    movu          ym17, [lpfq+r10-2]
1171.h_main:
1172    vinserti32x8   m17, [lpfq+r10+6], 1
1173    test         edgeb, 2 ; LR_HAVE_RIGHT
1174    jnz .h_have_right
1175    cmp           r10d, -33
1176    jl .h_have_right
1177    vpbroadcastb    m0, [lpfq-1]
1178    movu          ym16, [r14+r10-8]
1179    vinserti32x8   m16, [r14+r10+0], 1
1180    vpternlogd     m17, m0, m16, 0xe4
1181.h_have_right:
1182    pshufb          m0, m17, m5
1183    pmullw          m2, m0, m0
1184    pshufb         m16, m17, m6
1185    paddw           m0, m16
1186    pshufb         m17, m7
1187    paddw           m0, m17    ; sum
1188    punpcklwd       m3, m16, m17
1189    punpcklwd       m1, m2, m4
1190    vpdpwssd        m1, m3, m3 ; sumsq
1191    punpckhwd      m16, m17
1192    punpckhwd       m2, m4
1193    vpdpwssd        m2, m16, m16
1194    mova [t1+r10*2+416*0], m0
1195    mova [t1+r10*2+416*2], m1
1196    mova [t1+r10*2+416*4], m2
1197    add            r10, 32
1198    jl .h_loop
1199    ret
1200ALIGN function_align
1201.hv: ; horizontal boxsum + vertical boxsum + ab
1202    lea            r10, [wq-2]
1203    test         edgeb, 1 ; LR_HAVE_LEFT
1204    jz .hv_extend_left
1205    movd          xm17, [leftq]
1206    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1207    add          leftq, 4
1208    jmp .hv_main
1209.hv_extend_left:
1210    vpbroadcastb  xm17, [lpfq+wq]
1211    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1212    jmp .hv_main
1213.hv_bottom:
1214    lea            r10, [wq-2]
1215    test         edgeb, 1 ; LR_HAVE_LEFT
1216    jz .hv_extend_left
1217.hv_loop:
1218    movu          ym17, [lpfq+r10-2]
1219.hv_main:
1220    vinserti32x8   m17, [lpfq+r10+6], 1
1221    test         edgeb, 2 ; LR_HAVE_RIGHT
1222    jnz .hv_have_right
1223    cmp           r10d, -33
1224    jl .hv_have_right
1225    vpbroadcastb    m0, [lpfq-1]
1226    movu          ym16, [r14+r10-8]
1227    vinserti32x8   m16, [r14+r10+0], 1
1228    vpternlogd     m17, m0, m16, 0xe4
1229.hv_have_right:
1230    pshufb          m0, m17, m5
1231    pmullw          m3, m0, m0
1232    pshufb          m1, m17, m6
1233    paddw           m0, m1
1234    pshufb         m17, m7
1235    paddw           m0, m17              ; h sum
1236    punpcklwd      m16, m17, m1
1237    punpcklwd       m2, m3, m4
1238    vpdpwssd        m2, m16, m16         ; h sumsq
1239    punpckhwd      m17, m1
1240    punpckhwd       m3, m4
1241    vpdpwssd        m3, m17, m17
1242    paddw           m1, m0, [t2+r10*2+416*0]
1243    paddw           m1, [t1+r10*2+416*0] ; hv sum
1244    paddd          m16, m2, [t2+r10*2+416*2]
1245    paddd          m17, m3, [t2+r10*2+416*4]
1246    paddd          m16, [t1+r10*2+416*2] ; hv sumsq
1247    paddd          m17, [t1+r10*2+416*4]
1248    mova [t0+r10*2+416*0], m0
1249    mova [t0+r10*2+416*2], m2
1250    mova [t0+r10*2+416*4], m3
1251    pmulld         m16, m8               ; -a * 9
1252    pmulld         m17, m8
1253    punpcklwd       m0, m4, m1           ; b
1254    vpdpwssd       m16, m0, m0           ; -p
1255    punpckhwd       m1, m4, m1
1256    vpdpwssd       m17, m1, m1
1257    pmaddwd         m0, m10              ; b * 455
1258    pmaddwd         m1, m10
1259    pmulld         m16, m11              ; p * s
1260    pmulld         m17, m11
1261    vpalignr   m17{k2}, m16, m16, 2
1262    mova           m16, m20
1263    paddusw        m17, m12
1264    psraw          m17, 4                ; min(z, 255) - 256
1265    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
1266    vpmovb2m        k3, m17
1267    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
1268    vmovdqu8   m17{k3}, m16              ; x
1269    pandn          m16, m13, m17
1270    psrld          m17, 16
1271    pmulld          m0, m16
1272    pmulld          m1, m17
1273    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1274    paddd           m1, m14
1275    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
1276    vpternlogd     m17, m1, m13, 0xd8
1277    mova          [t3+r10*4+  8], m16
1278    mova          [t3+r10*4+ 24], xm17
1279    vextracti32x4 [t3+r10*4+ 56], m17, 2
1280    mova          [t3+r10*4+ 72], m17
1281    vextracti128  [t3+r10*4+ 72], ym16, 1
1282    vextracti32x4 [t3+r10*4+104], m16, 3
1283    add            r10, 32
1284    jl .hv_loop
1285    mov             t2, t1
1286    mov             t1, t0
1287    mov             t0, t2
1288    ret
1289.v: ; vertical boxsum + ab
1290    lea            r10, [wq-2]
1291.v_loop:
1292    mova           m16, [t1+r10*2+416*2]
1293    mova           m17, [t1+r10*2+416*4]
1294    paddd          m16, m16
1295    paddd          m17, m17
1296    paddd          m16, [t2+r10*2+416*2] ; hv sumsq
1297    paddd          m17, [t2+r10*2+416*4]
1298    pmulld         m16, m8               ; -a * 9
1299    pmulld         m17, m8
1300    mova            m1, [t1+r10*2+416*0]
1301    paddw           m1, m1
1302    paddw           m1, [t2+r10*2+416*0] ; hv sum
1303    punpcklwd       m0, m4, m1           ; b
1304    vpdpwssd       m16, m0, m0           ; -p
1305    punpckhwd       m1, m4, m1
1306    vpdpwssd       m17, m1, m1
1307    pmaddwd         m0, m10              ; b * 455
1308    pmaddwd         m1, m10
1309    pmulld         m16, m11              ; p * s
1310    pmulld         m17, m11
1311    vpalignr   m17{k2}, m16, m16, 2
1312    mova           m16, m20
1313    paddusw        m17, m12
1314    psraw          m17, 4                ; min(z, 255) - 256
1315    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
1316    vpmovb2m        k3, m17
1317    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
1318    vmovdqu8   m17{k3}, m16              ; x
1319    pandn          m16, m13, m17
1320    psrld          m17, 16
1321    pmulld          m0, m16
1322    pmulld          m1, m17
1323    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1324    paddd           m1, m14
1325    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
1326    vpternlogd     m17, m1, m13, 0xd8
1327    mova          [t3+r10*4+  8], m16
1328    mova          [t3+r10*4+ 24], xm17
1329    vextracti32x4 [t3+r10*4+ 56], m17, 2
1330    mova          [t3+r10*4+ 72], m17
1331    vextracti128  [t3+r10*4+ 72], ym16, 1
1332    vextracti32x4 [t3+r10*4+104], m16, 3
1333    add            r10, 32
1334    jl .v_loop
1335    ret
1336.prep_n: ; initial neighbor setup
1337    mov            r10, wq
1338    mov             t4, t3
1339    add             t3, 416*4
1340.prep_n_loop:
1341    mova            m2, [t5+r10*4+0]
1342    mova            m3, [t4+r10*4+0]
1343    paddd           m2, [t5+r10*4+8]
1344    paddd           m3, [t4+r10*4+8]
1345    paddd           m0, m2, [t5+r10*4+4]
1346    paddd           m1, m3, [t4+r10*4+4]
1347    pslld           m0, 2
1348    paddd           m1, m1                ; ab[ 0] 222
1349    psubd           m0, m2                ; ab[-1] 343
1350    mova [t3+r10*4+416*4], m1
1351    paddd           m1, m1
1352    mova    [t5+r10*4], m0
1353    psubd           m1, m3                ; ab[ 0] 343
1354    mova    [t4+r10*4], m1
1355    add            r10, 16
1356    jl .prep_n_loop
1357    ret
1358; a+b are packed together in a single dword, but we can't do the
1359; full neighbor calculations before splitting them since we don't
1360; have sufficient precision. The solution is to do the calculations
1361; in two equal halves and split a and b before doing the final sum.
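; per output pixel the combined row weights work out to 3,4,3 above and
; below and 4,4,4 on the current row: one half is ab[-1]*343 + ab[0]*222,
; the other ab[0]*222 + ab[+1]*343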
1362ALIGN function_align
1363.n: ; neighbor + output
1364    mov            r10, wq
1365.n_loop:
1366    mova           m16, [t3+r10*4+ 0]
1367    paddd          m16, [t3+r10*4+ 8]
1368    paddd          m17, m16, [t3+r10*4+ 4]
1369    paddd          m17, m17               ; ab[+1] 222
1370    mova            m2, [t3+r10*4+416*4+ 0]
1371    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
1372    mova            m3, [t3+r10*4+416*4+64]
1373    paddd           m1, m3, [t5+r10*4+64]
1374    mova [t3+r10*4+416*4+ 0], m17
1375    paddd          m17, m17
1376    psubd          m17, m16               ; ab[+1] 343
1377    mova [t5+r10*4+ 0], m17
1378    paddd           m2, m17               ; ab[ 0] 222 + ab[+1] 343
1379    mova           m16, [t3+r10*4+64]
1380    paddd          m16, [t3+r10*4+72]
1381    paddd          m17, m16, [t3+r10*4+68]
1382    paddd          m17, m17
1383    mova [t3+r10*4+416*4+64], m17
1384    paddd          m17, m17
1385    psubd          m17, m16
1386    mova [t5+r10*4+64], m17
1387    pandn          m16, m13, m0
1388    psrld           m0, 12
1389    paddd           m3, m17
1390    pandn          m17, m13, m2
1391    psrld           m2, 12
1392    paddd          m16, m17               ; a
1393    pandn          m17, m13, m1
1394    psrld           m1, 12
1395    paddd           m0, m2                ; b + (1 << 8)
1396    pandn           m2, m13, m3
1397    psrld           m3, 12
1398    paddd          m17, m2
1399    pmovzxbd        m2, [dstq+r10+ 0]
1400    paddd           m1, m3
1401    pmovzxbd        m3, [dstq+r10+16]
1402    pmaddwd        m16, m2                ; a * src
1403    pmaddwd        m17, m3
1404    packssdw        m2, m3
1405    psubd           m0, m16               ; b - a * src + (1 << 8)
1406    psubd           m1, m17
1407    psrad           m0, 9
1408    psrad           m1, 9
1409    packssdw        m0, m1
1410    pmulhrsw        m0, m15
1411    paddw           m0, m2
1412    packuswb        m0, m0
1413    vpermd         m16, m9, m0
1414    mova    [dstq+r10], ym16
1415    add            r10, 32
1416    jl .n_loop
1417    mov            r10, t5
1418    mov             t5, t4
1419    mov             t4, r10
1420    add           dstq, strideq
1421    ret
1422
1423cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
1424                                                  w, h, edge, params
1425    mov        paramsq, r6mp
1426    mov             wd, wm
1427    movifnidn       hd, hm
1428    mov          edged, r7m
1429    vbroadcasti128  m5, [sgr_shuf+1]
1430    add           lpfq, wq
1431    vbroadcasti128  m6, [sgr_shuf+9]
1432    add           dstq, wq
1433    vbroadcasti128  m7, [sgr_shuf+3]
1434    lea             t3, [rsp+wq*4+416*24+8]
1435    vbroadcasti128  m8, [sgr_shuf+7]
1436    pxor            m4, m4
1437    vpbroadcastd    m9, [pd_m9]
1438    vpsubd         m11, m4, [paramsq+0] {1to16} ; -s0
1439    vpbroadcastd   m14, [pw_61448]
1440    vpsubd         m12, m4, [paramsq+4] {1to16} ; -s1
1441    vpbroadcastd   m26, [paramsq+8]             ; w0 w1
1442    lea             t1, [rsp+wq*2+12]
1443    vpbroadcastd   m10, [pd_m25]
1444    neg             wq
1445    vpbroadcastd   m13, [pw_164_455]
1446    mov           r10d, 0xfe
1447    vpbroadcastd   m15, [pd_34816]
1448    kmovb           k1, r10d
1449    mova           m20, [sgr_x_by_x+64*0]
1450    mov            r10, 0x3333333333333333
1451    mova           m21, [sgr_x_by_x+64*1]
1452    kmovq           k2, r10
1453    mova           m22, [sgr_x_by_x+64*2]
1454    lea            r12, [r_ext_mask+75]
1455    mova           m23, [sgr_x_by_x+64*3]
1456    vpbroadcastd   m24, [pd_m4096]
1457    vpbroadcastd   m25, [sgr_shuf+28]           ; 0x8000____
1458    psllw          m26, 5
1459    mova          xm27, [sgr_mix_perm]
1460    test         edgeb, 4 ; LR_HAVE_TOP
1461    jz .no_top
1462    call .h_top
1463    add           lpfq, strideq
1464    mov             t2, t1
1465    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
1466    add             t1, 416*12
1467    call .h_top
1468    lea            r10, [lpfq+strideq*4]
1469    mov           lpfq, dstq
1470    add            r10, strideq
1471    mov          [rsp], r10 ; below
1472    call .hv0
1473.main:
1474    dec             hd
1475    jz .height1
1476    add           lpfq, strideq
1477    call .hv1
1478    call .prep_n
1479    sub             hd, 2
1480    jl .extend_bottom
1481.main_loop:
1482    add           lpfq, strideq
1483    call .hv0
1484    test            hd, hd
1485    jz .odd_height
1486    add           lpfq, strideq
1487    call .hv1
1488    call .n0
1489    call .n1
1490    sub             hd, 2
1491    jge .main_loop
1492    test         edgeb, 8 ; LR_HAVE_BOTTOM
1493    jz .extend_bottom
1494    mov           lpfq, [rsp]
1495    call .hv0_bottom
1496    add           lpfq, strideq
1497    call .hv1_bottom
1498.end:
1499    call .n0
1500    call .n1
1501.end2:
1502    RET
1503.height1:
1504    call .v1
1505    call .prep_n
1506    jmp .odd_height_end
1507.odd_height:
1508    call .v1
1509    call .n0
1510    call .n1
1511.odd_height_end:
1512    call .v0
1513    call .v1
1514    call .n0
1515    jmp .end2
1516.extend_bottom:
1517    call .v0
1518    call .v1
1519    jmp .end
1520.no_top:
1521    lea            r10, [lpfq+strideq*4]
1522    mov           lpfq, dstq
1523    lea            r10, [r10+strideq*2]
1524    mov          [rsp], r10
1525    call .h
1526    lea             t2, [t1+416*12]
1527    lea            r10, [wq-2]
1528.top_fixup_loop:
1529    mova            m0, [t1+r10*2+416* 0]
1530    mova            m1, [t1+r10*2+416* 2]
1531    mova            m2, [t1+r10*2+416* 4]
1532    paddw           m0, m0
1533    mova            m3, [t1+r10*2+416* 6]
1534    paddd           m1, m1
1535    mova           m16, [t1+r10*2+416* 8]
1536    paddd           m2, m2
1537    mova           m17, [t1+r10*2+416*10]
1538    mova [t2+r10*2+416* 0], m0
1539    mova [t2+r10*2+416* 2], m1
1540    mova [t2+r10*2+416* 4], m2
1541    mova [t2+r10*2+416* 6], m3
1542    mova [t2+r10*2+416* 8], m16
1543    mova [t2+r10*2+416*10], m17
1544    add            r10, 32
1545    jl .top_fixup_loop
1546    call .v0
1547    jmp .main
1548.h: ; horizontal boxsums
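; a single pass produces both box sums: the 3-tap sum/sumsq are stored at
; t1+416*{6,8,10}, then the two outer taps are added on top to form the
; 5-tap sum/sumsq at t1+416*{0,2,4}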
1549    lea            r10, [wq-2]
1550    test         edgeb, 1 ; LR_HAVE_LEFT
1551    jz .h_extend_left
1552    movd          xm17, [leftq]
1553    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1554    add          leftq, 4
1555    jmp .h_main
1556.h_extend_left:
1557    vpbroadcastb  xm17, [lpfq+wq]
1558    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1559    jmp .h_main
1560.h_top:
1561    lea            r10, [wq-2]
1562    test         edgeb, 1 ; LR_HAVE_LEFT
1563    jz .h_extend_left
1564.h_loop:
1565    movu          ym17, [lpfq+r10-2]
1566.h_main:
1567    vinserti32x8   m17, [lpfq+r10+6], 1
1568    test         edgeb, 2 ; LR_HAVE_RIGHT
1569    jnz .h_have_right
1570    cmp           r10d, -34
1571    jl .h_have_right
1572    vpbroadcastb    m0, [lpfq-1]
1573    movu          ym16, [r12+r10-8]
1574    vinserti32x8   m16, [r12+r10+0], 1
1575    vpternlogd     m17, m0, m16, 0xe4
1576.h_have_right:
1577    pshufb          m3, m17, m5
1578    pshufb         m18, m17, m6
1579    shufps          m0, m3, m18, q2121
1580    pmullw          m2, m0, m0
1581    pshufb         m19, m17, m7
1582    paddw           m0, m19
1583    pshufb         m17, m8
1584    paddw           m0, m17           ; sum3
1585    punpcklwd      m16, m19, m17
1586    punpcklwd       m1, m2, m4
1587    vpdpwssd        m1, m16, m16      ; sumsq3
1588    punpckhwd      m19, m17
1589    punpckhwd       m2, m4
1590    vpdpwssd        m2, m19, m19
1591    mova [t1+r10*2+416* 6], m0
1592    mova [t1+r10*2+416* 8], m1
1593    mova [t1+r10*2+416*10], m2
1594    punpcklwd      m19, m3, m18
1595    paddw           m0, m3
1596    vpdpwssd        m1, m19, m19      ; sumsq5
1597    punpckhwd       m3, m18
1598    paddw           m0, m18           ; sum5
1599    vpdpwssd        m2, m3, m3
1600    mova [t1+r10*2+416* 0], m0
1601    mova [t1+r10*2+416* 2], m1
1602    mova [t1+r10*2+416* 4], m2
1603    add            r10, 32
1604    jl .h_loop
1605    ret
1606ALIGN function_align
1607.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
1608    lea            r10, [wq-2]
1609    test         edgeb, 1 ; LR_HAVE_LEFT
1610    jz .hv0_extend_left
1611    movd          xm17, [leftq]
1612    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1613    add          leftq, 4
1614    jmp .hv0_main
1615.hv0_extend_left:
1616    vpbroadcastb  xm17, [lpfq+wq]
1617    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1618    jmp .hv0_main
1619.hv0_bottom:
1620    lea            r10, [wq-2]
1621    test         edgeb, 1 ; LR_HAVE_LEFT
1622    jz .hv0_extend_left
1623.hv0_loop:
1624    movu          ym17, [lpfq+r10-2]
1625.hv0_main:
1626    vinserti32x8   m17, [lpfq+r10+6], 1
1627    test         edgeb, 2 ; LR_HAVE_RIGHT
1628    jnz .hv0_have_right
1629    cmp           r10d, -34
1630    jl .hv0_have_right
1631    vpbroadcastb    m0, [lpfq-1]
1632    movu          ym16, [r12+r10-8]
1633    vinserti32x8   m16, [r12+r10+0], 1
1634    vpternlogd     m17, m0, m16, 0xe4
1635.hv0_have_right:
1636    pshufb         m18, m17, m5
1637    pshufb         m19, m17, m6
1638    shufps          m1, m18, m19, q2121
1639    pmullw          m3, m1, m1
1640    pshufb          m0, m17, m7
1641    paddw           m1, m0
1642    pshufb         m17, m8
1643    paddw           m1, m17           ; sum3
1644    punpcklwd      m16, m0, m17
1645    punpcklwd       m2, m3, m4
1646    vpdpwssd        m2, m16, m16      ; sumsq3
1647    punpckhwd       m0, m17
1648    punpckhwd       m3, m4
1649    vpdpwssd        m3, m0, m0
1650    paddw           m0, m1, [t1+r10*2+416* 6]
1651    paddd          m16, m2, [t1+r10*2+416* 8]
1652    paddd          m17, m3, [t1+r10*2+416*10]
1653    mova [t1+r10*2+416* 6], m1
1654    mova [t1+r10*2+416* 8], m2
1655    mova [t1+r10*2+416*10], m3
1656    paddw           m1, m18
1657    paddw           m1, m19           ; sum5
1658    mova [t3+r10*4+416*8+ 8], m1
1659    paddw           m1, [t1+r10*2+416* 0]
1660    mova [t1+r10*2+416* 0], m1
1661    punpcklwd       m1, m18, m19
1662    vpdpwssd        m2, m1, m1        ; sumsq5
1663    punpckhwd      m18, m19
1664    vpdpwssd        m3, m18, m18
1665    mova [t3+r10*4+416*0+ 8], m2      ; we need a clean copy of the last row
1666    mova [t3+r10*4+416*0+72], m3      ; in case height is odd
1667    paddd           m2, [t1+r10*2+416* 2]
1668    paddd           m3, [t1+r10*2+416* 4]
1669    mova [t1+r10*2+416* 2], m2
1670    mova [t1+r10*2+416* 4], m3
1671    paddw           m1, m0, [t2+r10*2+416* 6]
1672    paddd           m2, m16, [t2+r10*2+416* 8]
1673    paddd           m3, m17, [t2+r10*2+416*10]
1674    mova [t2+r10*2+416* 6], m0
1675    mova [t2+r10*2+416* 8], m16
1676    mova [t2+r10*2+416*10], m17
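    ; rough scalar sketch of the (a3, b3) math below, assuming the usual sgr
    ; box-filter definitions (3x3 box, n = 9):
    ;   p3 = max(sumsq3 * 9 - sum3 * sum3, 0)
    ;   z3 = min((p3 * s1 + (1 << 19)) >> 20, 255)
    ;   x3 = sgr_x_by_x[z3]                       ; a3
    ;   b3 = (x3 * sum3 * 455 + (1 << 11)) >> 12  ; 455 ~= 4096 / 9
    ; the products are formed negated ("-a3 * 9", "-p3"), the z3 clamp uses
    ; saturating 16-bit adds, the 256-byte sgr_x_by_x table is read in two
    ; halves (vpermt2b/vpermi2b) merged on the index sign bit, and the >> 12
    ; ends up folded into the a3 | (b3 << 12) packing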
1677    pmulld         m16, m2, m9        ; -a3 * 9
1678    pmulld         m17, m3, m9
1679    punpcklwd       m0, m4, m1        ; b3
1680    vpdpwssd       m16, m0, m0        ; -p3
1681    punpckhwd       m1, m4, m1
1682    vpdpwssd       m17, m1, m1
1683    pmulld         m16, m12           ; p3 * s1
1684    pmulld         m17, m12
1685    pmaddwd         m0, m13           ; b3 * 455
1686    pmaddwd         m1, m13
1687    vpalignr   m17{k2}, m16, m16, 2
1688    mova           m16, m22
1689    paddusw        m17, m14
1690    psraw          m17, 4             ; min(z3, 255) - 256
1691    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1692    vpmovb2m        k3, m17
1693    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1694    vmovdqu8   m17{k3}, m16           ; x3
1695    pandn          m16, m24, m17
1696    psrld          m17, 16
1697    pmulld          m0, m16
1698    pmulld          m1, m17
1699    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1700    paddd           m1, m15
1701    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1702    vpternlogd     m17, m1, m24, 0xd8
1703    mova          [t3+r10*4+416*4+  8], m16
1704    mova          [t3+r10*4+416*4+ 24], xm17
1705    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
1706    mova          [t3+r10*4+416*4+ 72], m17
1707    vextracti128  [t3+r10*4+416*4+ 72], ym16, 1
1708    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
1709    add            r10, 32
1710    jl .hv0_loop
1711    ret
1712ALIGN function_align
1713.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
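    ; odd rows complete both the 3x3 and the 5x5 row groups, so this pass
    ; produces (a3, b3) and (a5, b5); the 5x5 math mirrors the 3x3 sketch in
    ; .hv0 with n = 25, strength s0 and 164 (~4096 / 25) in place of 455;
    ; the 164/455 multipliers share one register, each sum being interleaved
    ; with zero into either the low or the high word lane before pmaddwd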
1714    lea            r10, [wq-2]
1715    test         edgeb, 1 ; LR_HAVE_LEFT
1716    jz .hv1_extend_left
1717    movd          xm17, [leftq]
1718    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1719    add          leftq, 4
1720    jmp .hv1_main
1721.hv1_extend_left:
1722    vpbroadcastb  xm17, [lpfq+wq]
1723    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1724    jmp .hv1_main
1725.hv1_bottom:
1726    lea            r10, [wq-2]
1727    test         edgeb, 1 ; LR_HAVE_LEFT
1728    jz .hv1_extend_left
1729.hv1_loop:
1730    movu          ym17, [lpfq+r10-2]
1731.hv1_main:
1732    vinserti32x8   m17, [lpfq+r10+6], 1
1733    test         edgeb, 2 ; LR_HAVE_RIGHT
1734    jnz .hv1_have_right
1735    cmp           r10d, -34
1736    jl .hv1_have_right
1737    vpbroadcastb    m0, [lpfq-1]
1738    movu          ym16, [r12+r10-8]
1739    vinserti32x8   m16, [r12+r10+0], 1
1740    vpternlogd     m17, m0, m16, 0xe4
1741.hv1_have_right:
1742    pshufb          m3, m17, m5
1743    pshufb         m19, m17, m6
1744    shufps          m2, m3, m19, q2121
1745    pmullw          m1, m2, m2
1746    pshufb         m18, m17, m7
1747    paddw           m2, m18
1748    pshufb         m17, m8
1749    paddw           m2, m17           ; sum3
1750    punpcklwd      m16, m17, m18
1751    punpcklwd       m0, m1, m4
1752    vpdpwssd        m0, m16, m16      ; sumsq3
1753    punpckhwd      m17, m18
1754    punpckhwd       m1, m4
1755    vpdpwssd        m1, m17, m17
1756    paddd          m16, m0, [t2+r10*2+416* 8]
1757    paddd          m17, m1, [t2+r10*2+416*10]
1758    mova [t2+r10*2+416* 8], m0
1759    mova [t2+r10*2+416*10], m1
1760    punpcklwd      m18, m3, m19
1761    vpdpwssd        m0, m18, m18      ; sumsq5
1762    punpckhwd      m18, m3, m19
1763    vpdpwssd        m1, m18, m18
1764    paddw           m3, m19
1765    pmulld         m16, m9            ; -a3 * 9
1766    pmulld         m17, m9
1767    paddd          m18, m0, [t2+r10*2+416*2]
1768    paddd          m19, m1, [t2+r10*2+416*4]
1769    paddd          m18, [t1+r10*2+416*2]
1770    paddd          m19, [t1+r10*2+416*4]
1771    mova [t2+r10*2+416*2], m0
1772    mova [t2+r10*2+416*4], m1
1773    pmulld         m18, m10           ; -a5 * 25
1774    pmulld         m19, m10
1775    paddw           m1, m2, [t2+r10*2+416* 6]
1776    mova [t2+r10*2+416* 6], m2
1777    paddw           m2, m3            ; sum5
1778    paddw           m3, m2, [t2+r10*2+416*0]
1779    paddw           m3, [t1+r10*2+416*0]
1780    mova [t2+r10*2+416*0], m2
1781    punpcklwd       m0, m4, m1        ; b3
1782    vpdpwssd       m16, m0, m0        ; -p3
1783    punpckhwd       m1, m4, m1
1784    vpdpwssd       m17, m1, m1
1785    punpcklwd       m2, m3, m4        ; b5
1786    vpdpwssd       m18, m2, m2        ; -p5
1787    punpckhwd       m3, m4
1788    vpdpwssd       m19, m3, m3
1789    pmulld         m16, m12           ; p3 * s1
1790    pmulld         m17, m12
1791    pmulld         m18, m11           ; p5 * s0
1792    pmulld         m19, m11
1793    pmaddwd         m0, m13           ; b3 * 455
1794    pmaddwd         m1, m13
1795    pmaddwd         m2, m13           ; b5 * 164
1796    pmaddwd         m3, m13
1797    vpalignr   m17{k2}, m16, m16, 2
1798    vpalignr   m19{k2}, m18, m18, 2
1799    paddusw        m17, m14
1800    mova           m16, m22
1801    psraw          m17, 4             ; min(z3, 255) - 256
1802    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1803    vpmovb2m        k3, m17
1804    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1805    paddusw        m19, m14
1806    mova           m18, m22
1807    psraw          m19, 4             ; min(z5, 255) - 256
1808    vpermt2b       m18, m19, m23      ; sgr_x_by_x[128..255]
1809    vpmovb2m        k4, m19
1810    vpermi2b       m19, m20, m21      ; sgr_x_by_x[  0..127]
1811    vmovdqu8   m17{k3}, m16           ; x3
1812    vmovdqu8   m19{k4}, m18           ; x5
1813    pandn          m16, m24, m17
1814    psrld          m17, 16
1815    pmulld          m0, m16
1816    pmulld          m1, m17
1817    pandn          m18, m24, m19
1818    psrld          m19, 16
1819    pmulld          m2, m18
1820    pmulld          m3, m19
1821    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1822    paddd           m1, m15
1823    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1824    vpternlogd     m17, m1, m24, 0xd8
1825    mova          [t3+r10*4+416*8+  8], m16
1826    mova          [t3+r10*4+416*8+ 24], xm17
1827    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
1828    paddd           m2, m15           ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
1829    paddd           m3, m15
1830    mova          [t3+r10*4+416*8+ 72], m17
1831    vextracti128  [t3+r10*4+416*8+ 72], ym16, 1
1832    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
1833    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
1834    vpternlogd     m19, m3, m24, 0xd8
1835    mova          [t3+r10*4+416*0+  8], m18
1836    mova          [t3+r10*4+416*0+ 24], xm19
1837    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
1838    mova          [t3+r10*4+416*0+ 72], m19
1839    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
1840    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
1841    add            r10, 32
1842    jl .hv1_loop
1843    mov            r10, t2
1844    mov             t2, t1
1845    mov             t1, r10
1846    ret
1847.v0: ; vertical boxsums + ab3 (even rows)
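    ; even-row pass without a new input row: the horizontal sums already in
    ; t1 are doubled to stand in for the missing row (and raw copies are kept
    ; in t3, matching the "clean copy" stores in .hv0), then (a3, b3) is
    ; derived exactly as in .hv0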
1848    lea            r10, [wq-2]
1849.v0_loop:
1850    mova            m2, [t1+r10*2+416* 8]
1851    mova            m3, [t1+r10*2+416*10]
1852    paddd           m2, m2
1853    paddd           m3, m3
1854    paddd          m16, m2, [t2+r10*2+416* 8]
1855    paddd          m17, m3, [t2+r10*2+416*10]
1856    mova            m0, [t1+r10*2+416* 6]
1857    paddw           m0, m0
1858    paddw           m1, m0, [t2+r10*2+416* 6]
1859    pmulld         m16, m9            ; -a3 * 9
1860    pmulld         m17, m9
1861    mova [t2+r10*2+416* 6], m0
1862    mova [t2+r10*2+416* 8], m2
1863    mova [t2+r10*2+416*10], m3
1864    mova            m2, [t1+r10*2+416*0]
1865    mova            m3, [t1+r10*2+416*2]
1866    mova           m18, [t1+r10*2+416*4]
1867    punpcklwd       m0, m4, m1        ; b3
1868    vpdpwssd       m16, m0, m0        ; -p3
1869    punpckhwd       m1, m4, m1
1870    vpdpwssd       m17, m1, m1
1871    pmulld         m16, m12           ; p3 * s1
1872    pmulld         m17, m12
1873    pmaddwd         m0, m13           ; b3 * 455
1874    pmaddwd         m1, m13
1875    mova [t3+r10*4+416*8+ 8], m2
1876    mova [t3+r10*4+416*0+ 8], m3
1877    mova [t3+r10*4+416*0+72], m18
1878    vpalignr   m17{k2}, m16, m16, 2
1879    mova           m16, m22
1880    paddusw        m17, m14
1881    psraw          m17, 4             ; min(z3, 255) - 256
1882    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1883    vpmovb2m        k3, m17
1884    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1885    vmovdqu8   m17{k3}, m16           ; x3
1886    pandn          m16, m24, m17
1887    psrld          m17, 16
1888    pmulld          m0, m16
1889    pmulld          m1, m17
1890    paddw           m2, m2            ; cc5
1891    paddd           m3, m3
1892    paddd          m18, m18
1893    mova [t1+r10*2+416*0], m2
1894    mova [t1+r10*2+416*2], m3
1895    mova [t1+r10*2+416*4], m18
1896    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1897    paddd           m1, m15
1898    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1899    vpternlogd     m17, m1, m24, 0xd8
1900    mova          [t3+r10*4+416*4+  8], m16
1901    mova          [t3+r10*4+416*4+ 24], xm17
1902    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
1903    mova          [t3+r10*4+416*4+ 72], m17
1904    vextracti128  [t3+r10*4+416*4+ 72], ym16, 1
1905    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
1906    add            r10, 32
1907    jl .v0_loop
1908    ret
1909.v1: ; vertical boxsums + ab (odd rows)
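    ; odd-row counterpart of .v0: reuses the row copies saved in t3 and
    ; produces both (a3, b3) and (a5, b5) as in .hv1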
1910    lea            r10, [wq-2]
1911.v1_loop:
1912    mova            m0, [t1+r10*2+416* 8]
1913    paddd          m16, m0, [t2+r10*2+416* 8]
1914    mova            m1, [t1+r10*2+416*10]
1915    paddd          m17, m1, [t2+r10*2+416*10]
1916    mova            m2, [t3+r10*4+416*0+ 8]
1917    paddd          m18, m2, [t2+r10*2+416* 2]
1918    mova            m3, [t3+r10*4+416*0+72]
1919    paddd          m19, m3, [t2+r10*2+416* 4]
1920    paddd          m18, [t1+r10*2+416* 2]
1921    paddd          m19, [t1+r10*2+416* 4]
1922    mova [t2+r10*2+416* 8], m0
1923    mova [t2+r10*2+416*10], m1
1924    mova [t2+r10*2+416* 2], m2
1925    mova [t2+r10*2+416* 4], m3
1926    pmulld         m16, m9            ; -a3 * 9
1927    pmulld         m17, m9
1928    pmulld         m18, m10           ; -a5 * 25
1929    pmulld         m19, m10
1930    mova            m0, [t1+r10*2+416* 6]
1931    paddw           m1, m0, [t2+r10*2+416* 6]
1932    mova            m2, [t3+r10*4+416*8+ 8]
1933    paddw           m3, m2, [t2+r10*2+416*0]
1934    paddw           m3, [t1+r10*2+416*0]
1935    mova [t2+r10*2+416* 6], m0
1936    mova [t2+r10*2+416*0], m2
1937    punpcklwd       m0, m4, m1        ; b3
1938    vpdpwssd       m16, m0, m0        ; -p3
1939    punpckhwd       m1, m4, m1
1940    vpdpwssd       m17, m1, m1
1941    punpcklwd       m2, m3, m4        ; b5
1942    vpdpwssd       m18, m2, m2        ; -p5
1943    punpckhwd       m3, m4
1944    vpdpwssd       m19, m3, m3
1945    pmulld         m16, m12           ; p3 * s1
1946    pmulld         m17, m12
1947    pmulld         m18, m11           ; p5 * s0
1948    pmulld         m19, m11
1949    pmaddwd         m0, m13           ; b3 * 455
1950    pmaddwd         m1, m13
1951    pmaddwd         m2, m13           ; b5 * 164
1952    pmaddwd         m3, m13
1953    vpalignr   m17{k2}, m16, m16, 2
1954    vpalignr   m19{k2}, m18, m18, 2
1955    paddusw        m17, m14
1956    mova           m16, m22
1957    psraw          m17, 4             ; min(z3, 255) - 256
1958    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1959    vpmovb2m        k3, m17
1960    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1961    paddusw        m19, m14
1962    mova           m18, m22
1963    psraw          m19, 4             ; min(z5, 255) - 256
1964    vpermt2b       m18, m19, m23      ; sgr_x_by_x[128..255]
1965    vpmovb2m        k4, m19
1966    vpermi2b       m19, m20, m21      ; sgr_x_by_x[  0..127]
1967    vmovdqu8   m17{k3}, m16           ; x3
1968    vmovdqu8   m19{k4}, m18           ; x5
1969    pandn          m16, m24, m17
1970    psrld          m17, 16
1971    pmulld          m0, m16
1972    pmulld          m1, m17
1973    pandn          m18, m24, m19
1974    psrld          m19, 16
1975    pmulld          m2, m18
1976    pmulld          m3, m19
1977    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1978    paddd           m1, m15
1979    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1980    vpternlogd     m17, m1, m24, 0xd8
1981    mova          [t3+r10*4+416*8+  8], m16
1982    mova          [t3+r10*4+416*8+ 24], xm17
1983    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
1984    paddd           m2, m15           ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
1985    paddd           m3, m15
1986    mova          [t3+r10*4+416*8+ 72], m17
1987    vextracti128  [t3+r10*4+416*8+ 72], ym16, 1
1988    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
1989    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
1990    vpternlogd     m19, m3, m24, 0xd8
1991    mova          [t3+r10*4+416*0+  8], m18
1992    mova          [t3+r10*4+416*0+ 24], xm19
1993    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
1994    mova          [t3+r10*4+416*0+ 72], m19
1995    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
1996    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
1997    add            r10, 32
1998    jl .v1_loop
1999    mov            r10, t2
2000    mov             t2, t1
2001    mov             t1, r10
2002    ret
2003.prep_n: ; initial neighbor setup
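    ; build the first set of horizontally weighted neighbour sums of the
    ; (a, b) rows: 5+6+5 for the 5x5 surface and 3+4+3 / 2+2+2 for the 3x3
    ; surface (the "565" / "343" / "222" tags below); .n0/.n1 then pair these
    ; rows vertically while producing output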
2004    mov            r10, wq
2005.prep_n_loop:
2006    movu            m0, [t3+r10*4+416*0+4]
2007    paddd           m1, m0, [t3+r10*4+416*0+0]
2008    mova           m16, [t3+r10*4+416*4+0]
2009    paddd           m1, [t3+r10*4+416*0+8]
2010    mova           m17, [t3+r10*4+416*8+0]
2011    paddd          m16, [t3+r10*4+416*4+8]
2012    paddd          m17, [t3+r10*4+416*8+8]
2013    paddd           m2, m16, [t3+r10*4+416*4+4]
2014    paddd           m3, m17, [t3+r10*4+416*8+4]
2015    paddd           m0, m1
2016    pslld           m1, 2
2017    pslld           m2, 2
2018    paddd           m1, m0            ; ab5 565
2019    paddd           m3, m3            ; ab3[ 0] 222
2020    psubd           m2, m16           ; ab3[-1] 343
2021    mova [t3+r10*4+416*20], m3
2022    pandn           m0, m24, m1       ; a5 565
2023    mova [t3+r10*4+416*24], m2
2024    psrld           m1, 12            ; b5 565
2025    mova [t3+r10*4+416*12], m0
2026    paddd           m3, m3
2027    mova [t3+r10*4+416*16], m1
2028    psubd           m3, m17           ; ab3[ 0] 343
2029    mova [t3+r10*4+416*28], m3
2030    add            r10, 16
2031    jl .prep_n_loop
2032    ret
2033ALIGN function_align
2034.n0: ; neighbor + output (even rows)
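    ; even output rows: finish the neighbour sums, form b5 - a5*src and
    ; b3 - a3*src per pixel, blend both correction terms against the source
    ; with the two sgr blend weights (held in m26) in a single vpdpwssd,
    ; then pack and store 16 pixels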
2035    mov            r10, wq
2036.n0_loop:
2037    movu            m2, [t3+r10*4+4]
2038    paddd           m3, m2, [t3+r10*4+0]
2039    paddd           m3, [t3+r10*4+8]
2040    mova            m1, [t3+r10*4+416*4+0]
2041    paddd           m2, m3
2042    pslld           m3, 2
2043    paddd           m1, [t3+r10*4+416*4+8]
2044    paddd           m3, m2
2045    pandn           m2, m24, m3
2046    psrld           m3, 12
2047    paddd           m0, m2, [t3+r10*4+416*12] ; a5
2048    paddd          m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
2049    mova [t3+r10*4+416*12], m2
2050    mova [t3+r10*4+416*16], m3
2051    paddd           m2, m1, [t3+r10*4+416*4+4]
2052    paddd           m2, m2                    ; ab3[ 1] 222
2053    mova            m3, [t3+r10*4+416*20]
2054    paddd          m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
2055    mova [t3+r10*4+416*20], m2
2056    paddd           m2, m2
2057    psubd           m2, m1                    ; ab3[ 1] 343
2058    mova [t3+r10*4+416*24], m2
2059    paddd           m2, m3                    ; ab3[ 0] 222 + ab3[ 1] 343
2060    pandn           m1, m24, m17
2061    psrld          m17, 12
2062    pandn           m3, m24, m2
2063    psrld           m2, 12
2064    paddd           m1, m3                    ; a3
2065    pmovzxbd        m3, [dstq+r10]
2066    paddd          m17, m2                    ; b3 + (1 << 8)
2067    pmaddwd         m0, m3                    ; a5 * src
2068    pmaddwd         m1, m3                    ; a3 * src
2069    vpshldd         m3, m25, 16               ; (dst << 16) + (1 << 15)
2070    psubd          m16, m0                    ; b5 - a5 * src + (1 << 8)
2071    psubd          m17, m1                    ; b3 - a3 * src + (1 << 8)
2072    psrld          m16, 9
2073    pslld          m17, 7
2074    vmovdqu8   m17{k2}, m16
2075    vpdpwssd        m3, m17, m26
2076    packuswb        m3, m2
2077    vpermb         m16, m27, m3
2078    mova    [dstq+r10], xm16
2079    add            r10, 16
2080    jl .n0_loop
2081    add           dstq, strideq
2082    ret
2083ALIGN function_align
2084.n1: ; neighbor + output (odd rows)
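    ; odd output rows: same blend as .n0, but the 5x5 terms are reused from
    ; the values .n0 left in t3 (each 565 pair covers two output rows), so
    ; only the 3x3 neighbour sums are updated here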
2085    mov            r10, wq
2086.n1_loop:
2087    mova            m1, [t3+r10*4+416*8+0]
2088    paddd           m1, [t3+r10*4+416*8+8]
2089    paddd           m2, m1, [t3+r10*4+416*8+4]
2090    paddd           m2, m2                    ; ab3[ 1] 222
2091    mova            m0, [t3+r10*4+416*20]
2092    paddd          m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
2093    pmovzxbd        m3, [dstq+r10]
2094    mova [t3+r10*4+416*20], m2
2095    paddd           m2, m2
2096    psubd           m2, m1                    ; ab3[ 1] 343
2097    mova [t3+r10*4+416*28], m2
2098    paddd           m0, m2                    ; ab3[ 0] 222 + ab3[ 1] 343
2099    pandn           m1, m24, m17
2100    psrld          m17, 12
2101    pandn           m2, m24, m0
2102    psrld           m0, 12
2103    paddd           m1, m2                    ; a3
2104    paddd          m17, m0                    ; b3 + (1 << 8)
2105    mova           m16, [t3+r10*4+416*16]     ; b5 + (1 << 7)
2106    pmaddwd         m1, m3                    ; a3 * src
2107    pmaddwd         m0, m3, [t3+r10*4+416*12] ; a5 * src
2108    vpshldd         m3, m25, 16               ; (dst << 16) + (1 << 15)
2109    psubd          m17, m1                    ; b3 - a3 * src + (1 << 8)
2110    psubd          m16, m0                    ; b5 - a5 * src + (1 << 7)
2111    pslld          m17, 7
2112    palignr    m17{k2}, m16, m16, 1
2113    vpdpwssd        m3, m17, m26
2114    packuswb        m3, m3
2115    vpermb         m16, m27, m3
2116    mova    [dstq+r10], xm16
2117    add            r10, 16
2118    jl .n1_loop
2119    add           dstq, strideq
2120    ret
2121
2122%endif ; ARCH_X86_64
2123