; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

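; JMP_TABLE emits one dword per width: the offset of the function's .w<N>
; label relative to the table itself. The dispatcher loads an entry with
; movsxd and adds the table address back to form an absolute jump target
; (see the splat_mv implementations below).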
%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

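; SAVE_TMVS_TABLE emits %1 two-byte records for blocks of width %2 (in
; 8-pixel units): byte 0 holds the cand_b index advance (3 per 8-pixel
; column, hence w*3), byte 1 the offset of the .write<w> handler relative
; to .write1. save_tmvs indexes the table by block size.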
%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro

%if ARCH_X86_64
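; mv_proj[d] is approximately 2^14/d (the AV1 division lookup table);
; temporal mv projection multiplies by this instead of dividing by the
; reference frame distance d.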
mv_proj:       dw    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024,   963,  910,  862,  819,  780,  744,  712
               dw  682,   655,  630,  606,  585,  564,  546,  528
splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
%endif
save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
cond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
save_cond0:    db  0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db  0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4,  8, ssse3
                       SAVE_TMVS_TABLE 4,  4, ssse3
                       SAVE_TMVS_TABLE 5,  2, ssse3
                       SAVE_TMVS_TABLE 7,  1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
                      SAVE_TMVS_TABLE 4,  8, avx2
                      SAVE_TMVS_TABLE 4,  4, avx2
                      SAVE_TMVS_TABLE 5,  2, avx2
                      SAVE_TMVS_TABLE 7,  1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4,  8, avx512icl
                           SAVE_TMVS_TABLE 4,  4, avx512icl
                           SAVE_TMVS_TABLE 5,  2, avx512icl
                           SAVE_TMVS_TABLE 7,  1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

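; Byte-offset mirror of the refmvs_frame struct (see src/refmvs.h); field
; order and sizes must stay in sync with the C definition.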
struc rf
    .frm_hdr:         resq 1
    .iw4:             resd 1
    .ih4:             resd 1
    .iw8:             resd 1
    .ih8:             resd 1
    .sbsz:            resd 1
    .use_rf_mvs:      resd 1
    .sign_bias:       resb 7
    .mfmv_sign:       resb 7
    .pocdiff:         resb 7
    .mfmv_ref:        resb 3
    .mfmv_ref2cur:    resd 3
    .mfmv_ref2ref:    resd 3*7
    .n_mfmvs:         resd 1
    .n_blocks:        resd 1
    .rp:              resq 1
    .rp_ref:          resq 1
    .rp_proj:         resq 1
    .rp_stride:       resq 1
    .r:               resq 1
    .n_tile_threads:  resd 1
    .n_frame_threads: resd 1
endstruc

SECTION .text

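; mov that is only assembled on x86-32; on x86-64 the value is already in
; the destination register.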
%macro movif32 2
%if ARCH_X86_32
    mov             %1, %2
%endif
%endmacro

INIT_XMM ssse3
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
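; A refmvs_temporal_block is 5 bytes ({int16_t mv.x, mv.y; int8_t ref}),
; which is why x coordinates and strides are scaled by 5 throughout.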
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                             xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq            m5, [ref_signq]
    lea        strided, [strided*5]
    mov        stridem, strided
    mov             r3, xstartm
    mov             r1, ystartm
 DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8  [base+pb_128]
%define m9  [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
%define base base_reg-.write1
    LEA       base_reg, .write1
%if ARCH_X86_64
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    movq            m5, [ref_signq]
%endif
    movu            m4, [base+save_ref_shuf]
    movddup         m6, [base+save_cond0]
    movddup         m7, [base+save_cond1]
%if ARCH_X86_64
    mova            m8, [base+pb_128]
    mova            m9, [base+save_pack0+ 0]
    mova           m10, [base+save_pack0+16]
%endif
    psllq           m5, 8
%if ARCH_X86_64
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea             r0, [xendd*5]   ; xend5
    lea             r3, [r3*5]      ; xstart5
    sub             r3, r0          ; -w5
    mov            r6m, r3
%define xstartq r6m
    add          xendd, r0          ; xend6
    add            r0m, r0          ; rp+xend5
    mov          xendm, xendd
    sub             r5, r1          ; h
    add             r1, r1
    mov            r7m, r1
    mov            r5m, r5
%define hd r5mp
    jmp .loop_y_noload
%endif
.loop_y:
    movif32    ystartd, r7m
    movif32      xendd, xendm
.loop_y_noload:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*gprsize]
    add        ystartd, 2
    movif32        r7m, ystartd
    lea             bq, [bq+xendq*4]
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
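    ; 0x9999/2^16 ~= 3/5; x is a negative multiple of 5 and sar rounds
    ; toward -inf, so imul+sar yields exactly x/5*3 (the cand_b index).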
    imul         candq, xq, 0x9999  ; x / 5 * 3
    sar          candq, 16
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu            m0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
    add            r10, base_reg
    add          candq, r11
    jge .calc
    movu            m1, [bq+candq*8+12]
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add            r11, base_reg
.calc:
    movif32        rpq, r0m
    ; ref check
    punpckhqdq      m2, m0, m1
    pshufb          m2, m4      ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
    pshufb          m3, m5, m2  ; ref > 0 && ref_sign[ref - 1]
    ; mv check
    punpcklqdq      m2, m0, m1  ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw           m2, m2
    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd         m3, m2
    pshufd          m2, m3, q2301
    pand            m3, m6      ; b0c0 b0c1 b1c0 b1c1 | ...
    pand            m2, m7      ; b0c1 b0c0 b1c1 b1c0 | ...
    por             m3, m2      ; b0.shuf b1.shuf | ...
    pxor            m3, m8      ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m3
    pshufb          m1, m3
    call           r10
    jge .next_line
    pshufd          m0, m1, q3232
    call           r11
    jl .loop_x
.next_line:
    add            rpq, stridemp
    movif32        r0m, rpq
    dec             hd
    jg .loop_y
    RET
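    ; Each .write<N> handler stores N packed 5-byte temporal blocks and
    ; advances x; since x counts up from a negative value, the flags of
    ; the final add survive the ret and drive the caller's jge/jl above.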
.write1:
    movd    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+1], m0
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+6], m0
    add             xq, 5*2
    ret
.write4:
    pshufb          m0, m9
    movu   [rpq+xq+ 0], m0
    psrlq           m0, 8
    movd   [rpq+xq+16], m0
    add             xq, 5*4
    ret
.write8:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    psrldq          m2, 2
    movq   [rpq+xq+32], m2
    add             xq, 5*8
    ret
.write16:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    shufps          m2, m0, q1032
    movu   [rpq+xq+48], m2
    shufps          m2, m0, q2121
    movu   [rpq+xq+32], m2
    shufps          m0, m2, q1032
    movu   [rpq+xq+64], m0
    add             xq, 5*16
    ret

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
    mova            m2, [aq]
    LEA             aq, splat_mv_sse2_table
    lea           bx4q, [bx4q*3-32]
    movsxd        bw4q, [aq+bw4q*4]
    movifnidn     bh4d, bh4m
    pshufd          m0, m2, q0210
    pshufd          m1, m2, q1021
    pshufd          m2, m2, q2102
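    ; a refmvs_block is 12 bytes; m0..m2 are dword rotations of it, so
    ; stored back to back they tile four blocks across 48 bytes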
    add           bw4q, aq
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
.w32:
    mova    [aq-16*16], m0
    mova    [aq-16*15], m1
    mova    [aq-16*14], m2
    mova    [aq-16*13], m0
    mova    [aq-16*12], m1
    mova    [aq-16*11], m2
    mova    [aq-16*10], m0
    mova    [aq-16* 9], m1
    mova    [aq-16* 8], m2
    mova    [aq-16* 7], m0
    mova    [aq-16* 6], m1
    mova    [aq-16* 5], m2
.w16:
    mova    [aq-16* 4], m0
    mova    [aq-16* 3], m1
    mova    [aq-16* 2], m2
    mova    [aq-16* 1], m0
    mova    [aq+16* 0], m1
    mova    [aq+16* 1], m2
.w8:
    mova    [aq+16* 2], m0
    mova    [aq+16* 3], m1
    mova    [aq+16* 4], m2
.w4:
    mova    [aq+16* 5], m0
    mova    [aq+16* 6], m1
    mova    [aq+16* 7], m2
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], m0
    movq      [aq+120], m1
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], m0
    movd      [aq+124], m2
    dec           bh4d
    jg .loop
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
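; Two phases: first clear this tile row of rp_proj to the invalid-mv
; sentinel, then project mvs from up to n_mfmvs reference frames into it.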
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor           r14d, r14d
    cmp dword [rfq+rf.n_tile_threads], 1
    mov           ih8d, [rfq+rf.ih8]
    mov           iw8d, [rfq+rf.iw8]
    mov        xstartd, xstartd
    mov          xendd, xendd
    cmove       tridxd, r14d
    lea       xstartid, [xstartq-8]
    lea         xendid, [xendq+8]
    mov        strideq, [rfq+rf.rp_stride]
    mov       rp_projq, [rfq+rf.rp_proj]
    cmp           ih8d, yendd
    mov     [rsp+0x30], strideq
    cmovs        yendd, ih8d
    test      xstartid, xstartid
    cmovs     xstartid, r14d
    cmp           iw8d, xendid
    cmovs       xendid, iw8d
    mov         troffq, strideq
    shl         troffq, 4
    imul        troffq, tridxq
    mov           dstd, ystartd
    and           dstd, 15
    imul          dstq, strideq
    add           dstq, troffq      ; (16 * tridx + (ystart & 15)) * stride
    lea           dstq, [dstq*5]
    add           dstq, rp_projq
    lea         troffq, [troffq*5]  ; 16 * tridx * stride * 5
    lea           r13d, [xendq*5]
    lea            r12, [strideq*5]
 DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
             _, troff, xendi, xstarti, stride5, _, dst
    lea            w5d, [xstartq*5]
    add             r7, troffq      ; rp_proj + tile_row_offset
    mov             hd, yendd
    mov     [rsp+0x28], r7
    add           dstq, r13
    sub            w5q, r13
    sub             hd, ystartd
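    ; 0x80008000 packs two INT16_MIN halves: the invalid-mv sentinel.
    ; Rows are cleared in 10-byte pairs, with a single 5-byte block first
    ; when the row width is odd.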
.init_xloop_start:
    mov            x5q, w5q
    test           w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add            x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add            x5q, 10
    jl .init_2blk
.init_next_row:
    add           dstq, stride5q
    dec             hd
    jg .init_xloop_start
 DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
             _, _, xendi, xstarti, stride5, _, n
    mov           r13d, [rfq+rf.n_mfmvs]
    test          r13d, r13d
    jz .ret
    mov     [rsp+0x0c], r13d
    mov        strideq, [rsp+0x30]
    movddup         m3, [pq_8192]
    mov            r9d, ystartd
    mov     [rsp+0x38], yendd
    mov     [rsp+0x20], xstartid
    xor             nd, nd
    xor            n7d, n7d
    imul            r9, strideq     ; ystart * stride
    mov     [rsp+0x48], rfq
    mov     [rsp+0x18], stride5q
    lea             r7, [r9*5]
    mov     [rsp+0x24], ystartd
    mov     [rsp+0x00], r7
.nloop:
 DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
             ref, rp_ref, xendi, xstarti, _, _, n
    mov            rfq, [rsp+0x48]
    mov           refd, [rfq+rf.mfmv_ref2cur+nq*4]
    cmp           refd, 0x80000000
    je .next_n
    mov     [rsp+0x40], refd
    mov           offq, [rsp+0x00]          ; ystart * stride * 5
    movzx         refd, byte [rfq+rf.mfmv_ref+nq]
    lea       refsignq, [refq-4]
    mov        rp_refq, [rfq+rf.rp_ref]
    movq            m2, refsignq
    add           offq, [rp_refq+refq*8]    ; r = rp_ref[ref] + row_offset
    mov     [rsp+0x14], nd
    mov             yd, ystartd
.yloop:
    mov           r11d, [rsp+0x24]          ; ystart
    mov           r12d, [rsp+0x38]          ; yend
    mov           r14d, yd
    and           r14d, ~7                  ; y_sb_align
    cmp           r11d, r14d
    cmovs         r11d, r14d                ; imax(y_sb_align, ystart)
    mov     [rsp+0x44], r11d                ; y_proj_start
    add           r14d, 8
    cmp           r12d, r14d
    cmovs         r14d, r12d                ; imin(y_sb_align + 8, yend)
    mov     [rsp+0x3c], r14d                ; y_proj_end
 DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
             ref, x, xendi, mvx, mvy, rb, ref2ref
    mov             xd, [rsp+0x20] ; xstarti
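    ; Per-candidate projection below, as a rough C-style sketch (names
    ; are illustrative only):
    ;   frac = mv_proj[ref2ref] * ref2cur      ; ~2^14 * ratio of distances
    ;   v    = mv.xy * frac                    ; per component, 32-bit
    ;   off  = (v + (v >> 31) + 8192) >> 14    ; rounded scaled mv
    ;   pos  = (y, x) + apply_sign(abs(off) >> 6, off ^ ref_sign)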
.xloop:
    lea            rbd, [xq*5]
    add            rbq, srcq
    movsx         refd, byte [rbq+4]
    test          refd, refd
    jz .next_x_bad_ref
    mov            rfq, [rsp+0x48]
    lea       ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
    mov       ref2refd, [rfq+ref2refq*4]    ; rf->mfmv_ref2ref[n][b_ref-1]
    test      ref2refd, ref2refd
    jz .next_x_bad_ref
    lea          fracq, [mv_proj]
    movzx        fracd, word [fracq+ref2refq*2]
    mov            mvd, [rbq]
    imul         fracd, [rsp+0x40] ; ref2cur
    pmovsxwq        m0, [rbq]
    movd            m1, fracd
    punpcklqdq      m1, m1
    pmuldq          m0, m1          ; mv * frac
    pshufd          m1, m0, q3311
    paddd           m0, m3
    paddd           m0, m1
    psrad           m0, 14          ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd           m1, m0
    packssdw        m0, m0
    psrld           m1, 6
    packuswb        m1, m1
    pxor            m0, m2          ; offset ^ ref_sign
    psignd          m1, m0          ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq          mvxq, m1
    lea           mvyd, [mvxq+yq]   ; ypos
    sar           mvxq, 32
 DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
             ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp          yposd, [rsp+0x44] ; y_proj_start
    jl .next_x_bad_pos_y
    cmp          yposd, [rsp+0x3c] ; y_proj_end
    jge .next_x_bad_pos_y
    and          yposd, 15
    add           mvxq, xq          ; xpos
    imul         yposq, [rsp+0x30]  ; pos = (ypos & 15) * stride
 DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
             ref, x, xendi, xpos, pos, rb, ref2ref
    mov           dstq, [rsp+0x28]  ; dst = rp_proj + tile_row_offset
    add           posq, xposq       ; pos += xpos
    lea           posq, [posq*5]
    add           dstq, posq        ; dst += pos5
    jmp .write_loop_entry
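    ; runs of neighbouring source blocks with identical mv and ref reuse
    ; the projection computed above; any mismatch restarts at .xloop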
.write_loop:
    add            rbq, 5
    cmp           refb, byte [rbq+4]
    jne .xloop
    cmp            mvd, [rbq]
    jne .xloop
    add           dstq, 5
    inc          xposd
.write_loop_entry:
    mov           r12d, xd
    and           r12d, ~7
    lea            r5d, [r12-8]
    cmp            r5d, xstartd
    cmovs          r5d, xstartd     ; x_proj_start
    cmp          xposd, r5d
    jl .next_xpos
    add           r12d, 16
    cmp          xendd, r12d
    cmovs         r12d, xendd       ; x_proj_end
    cmp          xposd, r12d
    jge .next_xpos
    mov       [dstq+0], mvd
    mov  byte [dstq+4], ref2refb
.next_xpos:
    inc             xd
    cmp             xd, xendid
    jl .write_loop
.next_y:
 DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add           srcq, [rsp+0x18] ; stride5
    inc             yd
    cmp             yd, [rsp+0x38] ; yend
    jne .yloop
    mov             nd, [rsp+0x14]
    mov        ystartd, [rsp+0x24]
.next_n:
    add            n7d, 7
    inc             nd
    cmp             nd, [rsp+0x0c] ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
 DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add            rbq, 5
    cmp           refb, byte [rbq+4]
    jne .xloop
    cmp            mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc             xd
    cmp             xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc             xd
    cmp             xd, xendid
    jl .xloop
    jmp .next_y

INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea            r12, [.write1]
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    vpbroadcastq    m4, [ref_signq]
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vpbroadcastq    m5, [base+save_cond0]
    vpbroadcastq    m6, [base+save_cond1]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    mova            m9, [base+save_pack1]
    psllq           m4, 8
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add        ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul         candq, xq, 0x9999
    sar          candq, 16                   ; x / 5 * 3
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu           xm0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add            r10, r12
    add          candq, r11
    jge .calc
    vinserti128     m0, [bq+candq*8+12], 1
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add            r11, r12
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1  ; ref > 0 && ref_sign[ref - 1]
    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
    pcmpgtd         m1, m2
    pshufd          m2, m1, q2301
    pand            m1, m5      ; b0.cond0 b1.cond0
    pand            m2, m6      ; b0.cond1 b1.cond1
    por             m1, m2      ; b0.shuf b1.shuf
    pxor            m1, m7      ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m1
    call           r10
    jge .next_line
    vextracti128   xm0, m0, 1
    call           r11
    jl .loop_x
.next_line:
    add            rpq, strideq
    dec             hd
    jg .loop_y
    RET
.write1:
    movd   [rpq+xq+ 0], xm0
    pextrb [rpq+xq+ 4], xm0, 4
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], xm0
    psrlq          xm1, xm0, 8
    movd    [rpq+xq+6], xm1
    add             xq, 5*2
    ret
.write4:
    pshufb         xm1, xm0, xm8
    movu   [rpq+xq+ 0], xm1
    psrlq          xm1, 8
    movd   [rpq+xq+16], xm1
    add             xq, 5*4
    ret
.write8:
    vinserti128     m1, m0, xm0, 1
    pshufb          m1, m8
    movu   [rpq+xq+ 0], m1
    psrldq         xm1, 2
    movq   [rpq+xq+32], xm1
    add             xq, 5*8
    ret
.write16:
    vinserti128     m1, m0, xm0, 1
    pshufb          m2, m1, m8
    movu   [rpq+xq+ 0], m2
    pshufb          m1, m9
    movu   [rpq+xq+32], m1
    shufps         xm2, xm1, q1021
    movu   [rpq+xq+64], xm2
    add             xq, 5*16
    ret

cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
    vbroadcasti128  m0, [aq]
    lea             aq, [splat_mv_avx2_table]
    lea           bx4q, [bx4q*3-32]
    movsxd        bw4q, [aq+bw4q*4]
    pshufb          m0, [splat_mv_shuf]
    movifnidn     bh4d, bh4m
    pshufd          m1, m0, q2102
    pshufd          m2, m0, q1021
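    ; m0..m2 tile the 12-byte refmvs_block across 96 bytes (8 blocks)
    ; when stored in m0, m1, m2 order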
    add           bw4q, aq
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
.w32:
    mova     [aq-32*8], m0
    mova     [aq-32*7], m1
    mova     [aq-32*6], m2
    mova     [aq-32*5], m0
    mova     [aq-32*4], m1
    mova     [aq-32*3], m2
.w16:
    mova     [aq-32*2], m0
    mova     [aq-32*1], m1
    mova     [aq+32*0], m2
.w8:
    mova     [aq+32*1], m0
    mova     [aq+32*2], m1
    mova     [aq+32*3], m2
    dec           bh4d
    jg .loop
    RET
.w4:
    movu      [aq+ 80], m0
    mova      [aq+112], xm1
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], xm0
    movq      [aq+120], xm2
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], xm0
    movd      [aq+124], xm1
    dec           bh4d
    jg .loop
    RET

INIT_ZMM avx512icl
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea            r14, [.write1]
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    psllq           m4, [ref_signq]{bcstq}, 8
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    movu           xm9, [base+save_pack0+4]
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
    mov           r10d, 0x1f
    kmovb           k2, r10d
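    ; k2 masks the 5 low elements: the .write1/2/4/8 handlers use it for
    ; 5/10/20/40-byte masked stores (one temporal block per 5 bytes)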
 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add        ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul         candq, xq, 0x9999
    sar          candq, 16                   ; x / 5 * 3
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu           xm0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add            r10, r14
    add          candq, r11
    jge .calc
    movzx         r11d, byte [bq+candq*8+22]
    vinserti32x4   ym0, [bq+candq*8+12], 1
    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add            r11, r14
    add          candq, r12
    jge .calc
    movzx         r12d, byte [bq+candq*8+22]
    vinserti32x4    m0, [bq+candq*8+12], 2
    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add            r12, r14
    add          candq, r13
    jge .calc
    vinserti32x4    m0, [bq+candq*8+12], 3
    movzx         r13d, byte [bq+candq*8+22]
    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add            r13, r14
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1      ; ref > 0 && ref_sign[ref - 1]
    psrlw           m2, 12          ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd           m2, m1
    pshufb          m2, m5           ; c0 c1 c1 c0
    pand            m2, m6
    punpckhqdq      m1, m2, m2
    vpternlogd      m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
    pshufb          m2, m0, m1
    mova           xm0, xm2
    call           r10
    jge .next_line
    vextracti32x4  xm0, m2, 1
    call           r11
    jge .next_line
    vextracti32x4  xm0, m2, 2
    call           r12
    jge .next_line
    vextracti32x4  xm0, m2, 3
    call           r13
    jl .loop_x
.next_line:
    add            rpq, strideq
    dec             hd
    jg .loop_y
    RET
.write1:
    vmovdqu8 [rpq+xq]{k2}, xm0
    add             xq, 5*1
    ret
.write2:
    pshufb         xm0, xm8
    vmovdqu16 [rpq+xq]{k2}, xm0
    add             xq, 5*2
    ret
.write4:
    vpermb         ym0, ym8, ym0
    vmovdqu32 [rpq+xq]{k2}, ym0
    add             xq, 5*4
    ret
.write8:
    vpermb          m0, m8, m0
    vmovdqu64 [rpq+xq]{k2}, m0
    add             xq, 5*8
    ret
.write16:
    vpermb          m1, m8, m0
    movu   [rpq+xq+ 0], m1
    pshufb         xm0, xm9
    movu   [rpq+xq+64], xm0
    add             xq, 5*16
    ret

INIT_ZMM avx512icl
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4    m0, [aq]
    lea                r1, [splat_mv_avx512icl_table]
    tzcnt            bw4d, bw4d
    lea              bx4d, [bx4q*3]
    pshufb             m0, [splat_mv_shuf]
    movsxd           bw4q, [r1+bw4q*4]
    mov               r6d, bh4m
    add              bw4q, r1
    lea               rrq, [rrq+r6*8]
    mov               r1d, 0x3f
    neg                r6
    kmovb              k1, r1d
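    ; k1 masks the 6 low elements: 12/24/48-byte masked stores for
    ; w1/w2/w4 (one refmvs_block per 12 bytes)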
    jmp              bw4q
.w1:
    mov                r1, [rrq+r6*8]
    vmovdqu16 [r1+bx4q*4]{k1}, xm0
    inc                r6
    jl .w1
    RET
.w2:
    mov                r1, [rrq+r6*8]
    vmovdqu32 [r1+bx4q*4]{k1}, ym0
    inc                r6
    jl .w2
    RET
.w4:
    mov                r1, [rrq+r6*8]
    vmovdqu64 [r1+bx4q*4]{k1}, m0
    inc                r6
    jl .w4
    RET
.w8:
    pshufd            ym1, ym0, q1021
.w8_loop:
    mov                r1, [rrq+r6*8+0]
    mov                r3, [rrq+r6*8+8]
    movu   [r1+bx4q*4+ 0], m0
    mova   [r1+bx4q*4+64], ym1
    movu   [r3+bx4q*4+ 0], m0
    mova   [r3+bx4q*4+64], ym1
    add                r6, 2
    jl .w8_loop
    RET
.w16:
    pshufd             m1, m0, q1021
    pshufd             m2, m0, q2102
.w16_loop:
    mov                r1, [rrq+r6*8+0]
    mov                r3, [rrq+r6*8+8]
    mova [r1+bx4q*4+64*0], m0
    mova [r1+bx4q*4+64*1], m1
    mova [r1+bx4q*4+64*2], m2
    mova [r3+bx4q*4+64*0], m0
    mova [r3+bx4q*4+64*1], m1
    mova [r3+bx4q*4+64*2], m2
    add                r6, 2
    jl .w16_loop
    RET
.w32:
    pshufd             m1, m0, q1021
    pshufd             m2, m0, q2102
.w32_loop:
    mov                r1, [rrq+r6*8]
    lea                r1, [r1+bx4q*4]
    mova        [r1+64*0], m0
    mova        [r1+64*1], m1
    mova        [r1+64*2], m2
    mova        [r1+64*3], m0
    mova        [r1+64*4], m1
    mova        [r1+64*5], m2
    inc                r6
    jl .w32_loop
    RET
%endif ; ARCH_X86_64