; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Builds a jump table of 32-bit offsets, relative to the table base, to the
; .w<N> labels of the named (name-mangled) function.
%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

; Per block-size lookup table for save_tmvs: byte 0 = w*3 (step through the
; candidate row, which is indexed as x/5*3 below), byte 1 = offset of the
; .write<w> entry point relative to .write1.
%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro

%if ARCH_X86_64
; mv_proj[d] ~= 16384/d in .14 fixed point (0, 16384/1, 16384/2, ...), used
; by load_tmvs for temporal MV projection; entry 0 is an unused placeholder.
mv_proj:       dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024,  963,  910,  862,  819,  780,  744,  712
               dw  682,  655,  630,  606,  585,  564,  546,  528
splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
%endif
save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
cond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
save_cond0:    db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4,  8, ssse3
                       SAVE_TMVS_TABLE 4,  4, ssse3
                       SAVE_TMVS_TABLE 5,  2, ssse3
                       SAVE_TMVS_TABLE 7,  1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
                      SAVE_TMVS_TABLE 4,  8, avx2
                      SAVE_TMVS_TABLE 4,  4, avx2
                      SAVE_TMVS_TABLE 5,  2, avx2
                      SAVE_TMVS_TABLE 7,  1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4,  8, avx512icl
                           SAVE_TMVS_TABLE 4,  4, avx512icl
                           SAVE_TMVS_TABLE 5,  2, avx512icl
                           SAVE_TMVS_TABLE 7,  1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

; Field offsets into the C-side refmvs_frame struct.
; NOTE(review): must stay in sync with the C definition (not visible in this
; file) -- verify against src/refmvs.h if either side changes.
struc rf
    .frm_hdr:         resq 1
    .iw4:             resd 1
    .ih4:             resd 1
    .iw8:             resd 1
    .ih8:             resd 1
    .sbsz:            resd 1
    .use_rf_mvs:      resd 1
    .sign_bias:       resb 7
    .mfmv_sign:       resb 7
    .pocdiff:         resb 7
    .mfmv_ref:        resb 3
    .mfmv_ref2cur:    resd 3
    .mfmv_ref2ref:    resd 3*7
    .n_mfmvs:         resd 1
    .n_blocks:        resd 1
    .rp:              resq 1
    .rp_ref:          resq 1
    .rp_proj:         resq 1
    .rp_stride:       resq 1
    .r:               resq 1
    .n_tile_threads:  resd 1
    .n_frame_threads: resd 1
endstruc

SECTION .text

; mov that is only emitted on 32-bit builds (on x86-64 the value is already
; held in a register and the macro expands to nothing).
%macro movif32 2
%if ARCH_X86_32
    mov %1, %2
%endif
%endmacro

INIT_XMM ssse3
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
; Each temporal block in rp appears to be 5 bytes (4-byte mv + 1-byte ref --
; inferred from the pervasive *5 address scaling and the .write* patterns).
; Processes two candidate blocks per .loop_x iteration (m0/m1).
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq         m5, [ref_signq]
    lea     strided, [strided*5]
    mov     stridem, strided
    mov          r3, xstartm
    mov          r1, ystartm
    DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8  [base+pb_128]
%define m9  [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
%define base base_reg-.write1
    LEA    base_reg, .write1
%if ARCH_X86_64
    movifnidn xendd, xendm
    movifnidn yendd, yendm
    mov     xstartd, xstartm
    mov     ystartd, ystartm
    movq         m5, [ref_signq]
%endif
    movu         m4, [base+save_ref_shuf]
    movddup      m6, [base+save_cond0]
    movddup      m7, [base+save_cond1]
%if ARCH_X86_64
    mova         m8, [base+pb_128]
    mova         m9, [base+save_pack0+ 0]
    mova        m10, [base+save_pack0+16]
%endif
    psllq        m5, 8
%if ARCH_X86_64
    ; Pre-scale x coordinates by 5 (bytes per temporal block).
    lea         r9d, [xendq*5]
    lea     xstartd, [xstartq*5]
    sub       yendd, ystartd
    add     ystartd, ystartd
    lea     strideq, [strideq*5]
    sub     xstartq, r9
    add       xendd, r9d
    add         rpq, r9
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea          r0, [xendd*5]  ; xend5
    lea          r3, [r3*5]     ; xstart5
    sub          r3, r0         ; -w5
    mov         r6m, r3
%define xstartq r6m
    add       xendd, r0         ; xend6
    add         r0m, r0         ; rp+xend5
    mov       xendm, xendd
    sub          r5, r1         ; h
    add          r1, r1
    mov         r7m, r1
    mov         r5m, r5
%define hd r5mp
    jmp .loop_y_noload
%endif
.loop_y:
    movif32 ystartd, r7m
    movif32   xendd, xendm
.loop_y_noload:
    and     ystartd, 30
    mov          xq, xstartq
    mov          bq, [rrq+ystartq*gprsize]
    add     ystartd, 2
    movif32     r7m, ystartd
    lea          bq, [bq+xendq*4]
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
    imul      candq, xq, 0x9999 ; x / 5 * 3
    sar       candq, 16
    movzx      r10d, byte [bq+candq*8+22]  ; cand_b->bs
    movu         m0, [bq+candq*8+12]       ; cand_b
    ; Table lookup by block size: r11 = candidate step, r10 = write routine.
    movzx      r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
    movzx      r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
    add         r10, base_reg
    add       candq, r11
    jge .calc                   ; only one candidate left in this row
    movu         m1, [bq+candq*8+12]
    movzx      r11d, byte [bq+candq*8+22]
    movzx      r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add         r11, base_reg
.calc:
    movif32     rpq, r0m
    ; ref check
    punpckhqdq   m2, m0, m1
    pshufb       m2, m4         ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
    pshufb       m3, m5, m2     ; ref > 0 && res_sign[ref - 1]
    ; mv check
    punpcklqdq   m2, m0, m1     ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw        m2, m2
    psrlw        m2, 12         ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd      m3, m2
    pshufd       m2, m3, q2301
    pand         m3, m6         ; b0c0 b0c1 b1c0 b1c1 | ...
    pand         m2, m7         ; b0c1 b0c0 b1c1 b1c0 | ...
    por          m3, m2         ; b0.shuf b1.shuf | ...
; save_tmvs (ssse3), continued: apply the combined condition mask, then
; dispatch to the per-width .write* helper chosen from the lookup table.
; The sign flags set by "add candq, r11" in .loop_x survive the call and
; select whether the second candidate / next iteration is processed.
    pxor         m3, m8         ; if cond0|cond1 == 0 => zero out
    pshufb       m0, m3
    pshufb       m1, m3
    call        r10
    jge .next_line
    pshufd       m0, m1, q3232
    call        r11
    jl .loop_x
.next_line:
    add         rpq, stridemp
    movif32     r0m, rpq
    dec          hd
    jg .loop_y
    RET
; .write<w>: store w packed 5-byte temporal blocks at rp+x, advance x by 5*w.
.write1:
    movd [rpq+xq+0], m0
    psrlq        m0, 8
    movd [rpq+xq+1], m0
    add          xq, 5*1
    ret
.write2:
    movq [rpq+xq+0], m0
    psrlq        m0, 8
    movd [rpq+xq+6], m0
    add          xq, 5*2
    ret
.write4:
    pshufb       m0, m9
    movu [rpq+xq+ 0], m0
    psrlq        m0, 8
    movd [rpq+xq+16], m0
    add          xq, 5*4
    ret
.write8:
    pshufb       m2, m0, m9
    movu [rpq+xq+ 0], m2
    pshufb       m0, m10
    movu [rpq+xq+16], m0
    psrldq       m2, 2
    movq [rpq+xq+32], m2
    add          xq, 5*8
    ret
.write16:
    pshufb       m2, m0, m9
    movu [rpq+xq+ 0], m2
    pshufb       m0, m10
    movu [rpq+xq+16], m0
    shufps       m2, m0, q1032
    movu [rpq+xq+48], m2
    shufps       m2, m0, q2121
    movu [rpq+xq+32], m2
    shufps       m0, m2, q1032
    movu [rpq+xq+64], m0
    add          xq, 5*16
    ret

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
; Replicates the 12-byte refmvs_block at *a across bw4 4-byte units per row,
; for bh4 rows, dispatching on width via splat_mv_sse2_table. The three
; rotated copies m0/m1/m2 tile the 12-byte pattern across 16-byte stores.
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add       bx4d, bw4d
    tzcnt     bw4d, bw4d
    mova        m2, [aq]
    LEA         aq, splat_mv_sse2_table
    lea       bx4q, [bx4q*3-32]
    movsxd    bw4q, [aq+bw4q*4]
    movifnidn bh4d, bh4m
    pshufd      m0, m2, q0210
    pshufd      m1, m2, q1021
    pshufd      m2, m2, q2102
    add       bw4q, aq
.loop:
    mov         aq, [rrq]
    add        rrq, gprsize
    lea         aq, [aq+bx4q*4]
    jmp bw4q                    ; wider cases fall through into narrower ones
.w32:
    mova [aq-16*16], m0
    mova [aq-16*15], m1
    mova [aq-16*14], m2
    mova [aq-16*13], m0
    mova [aq-16*12], m1
    mova [aq-16*11], m2
    mova [aq-16*10], m0
    mova [aq-16* 9], m1
    mova [aq-16* 8], m2
    mova [aq-16* 7], m0
    mova [aq-16* 6], m1
    mova [aq-16* 5], m2
.w16:
    mova [aq-16* 4], m0
    mova [aq-16* 3], m1
    mova [aq-16* 2], m2
    mova [aq-16* 1], m0
    mova [aq+16* 0], m1
    mova [aq+16* 1], m2
.w8:
    mova [aq+16* 2], m0
    mova [aq+16* 3], m1
    mova [aq+16* 4], m2
.w4:
    mova [aq+16* 5], m0
    mova [aq+16* 6], m1
    mova [aq+16* 7], m2
    dec bh4d
    jg .loop
    RET
.w2:
    movu [aq+104], m0
    movq [aq+120], m1
    dec bh4d
    jg .loop
    RET
.w1:
    movq [aq+116], m0
    movd [aq+124], m2
    dec bh4d
    jg .loop
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
; Projects motion vectors from up to n_mfmvs reference frames into the
; rp_proj buffer for one tile row. Uses 0x50 bytes of stack for spills.
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor        r14d, r14d
    cmp dword [rfq+rf.n_tile_threads], 1
    mov        ih8d, [rfq+rf.ih8]
    mov        iw8d, [rfq+rf.iw8]
    mov     xstartd, xstartd
    mov       xendd, xendd
    cmove    tridxd, r14d       ; single tile thread => tridx = 0
    lea    xstartid, [xstartq-8]
    lea      xendid, [xendq+8]
    mov     strideq, [rfq+rf.rp_stride]
    mov    rp_projq, [rfq+rf.rp_proj]
    cmp        ih8d, yendd
    mov [rsp+0x30], strideq
    cmovs     yendd, ih8d
    test   xstartid, xstartid
    cmovs  xstartid, r14d       ; clamp xstarti to >= 0
    cmp        iw8d, xendid
    cmovs    xendid, iw8d       ; clamp xendi to <= iw8
    mov      troffq, strideq
    shl      troffq, 4
    imul     troffq, tridxq
    mov        dstd, ystartd
    and        dstd, 15
    imul       dstq, strideq
    add        dstq, troffq     ; (16 * tridx + (ystart & 15)) * stride
    lea        dstq, [dstq*5]
    add        dstq, rp_projq
    lea      troffq, [troffq*5] ; 16 * tridx * stride * 5
    lea        r13d, [xendq*5]
    lea         r12, [strideq*5]
    DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
                _, troff, xendi, xstarti, stride5, _, dst
    lea         w5d, [xstartq*5]
    add          r7, troffq     ; rp_proj + tile_row_offset
    mov          hd, yendd
    mov [rsp+0x28], r7
    add        dstq, r13
    sub         w5q, r13
    sub          hd, ystartd
; Pre-fill the tile's rows of rp_proj with the 0x80008000 sentinel
; (presumably the "invalid mv" marker -- confirm against the C side),
; two 5-byte blocks per iteration with a one-block prologue for odd widths.
.init_xloop_start:
    mov         x5q, w5q
    test        w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add         x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add         x5q, 10
    jl .init_2blk
.init_next_row:
    add        dstq, stride5q
    dec          hd
    jg .init_xloop_start
    DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
                _, _, xendi, xstarti, stride5, _, n
    ; Main projection phase: for each of n_mfmvs references (n), scan the
    ; reference's temporal MV rows and scatter projected MVs into rp_proj.
    mov        r13d, [rfq+rf.n_mfmvs]
    test       r13d, r13d
    jz .ret
    mov [rsp+0x0c], r13d
    mov     strideq, [rsp+0x30]
    movddup      m3, [pq_8192]
    mov         r9d, ystartd
    mov [rsp+0x38], yendd
    mov [rsp+0x20], xstartid
    xor          nd, nd
    xor         n7d, n7d        ; n7 = n * 7 (row index into mfmv_ref2ref)
    imul         r9, strideq    ; ystart * stride
    mov [rsp+0x48], rfq
    mov [rsp+0x18], stride5q
    lea          r7, [r9*5]
    mov [rsp+0x24], ystartd
    mov [rsp+0x00], r7
.nloop:
    DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
                ref, rp_ref, xendi, xstarti, _, _, n
    mov         rfq, [rsp+0x48]
    mov        refd, [rfq+rf.mfmv_ref2cur+nq*4]
    cmp        refd, 0x80000000 ; sentinel: reference unusable, skip it
    je .next_n
    mov [rsp+0x40], refd
    mov        offq, [rsp+0x00] ; ystart * stride * 5
    movzx      refd, byte [rfq+rf.mfmv_ref+nq]
    lea    refsignq, [refq-4]
    mov     rp_refq, [rfq+rf.rp_ref]
    movq         m2, refsignq
    add        offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
    mov [rsp+0x14], nd
    mov          yd, ystartd
.yloop:
    ; Clamp the projection window to the superblock row: rows may only be
    ; projected into [imax(y & ~7, ystart), imin((y & ~7) + 8, yend)).
    mov        r11d, [rsp+0x24] ; ystart
    mov        r12d, [rsp+0x38] ; yend
    mov        r14d, yd
    and        r14d, ~7         ; y_sb_align
    cmp        r11d, r14d
    cmovs      r11d, r14d       ; imax(y_sb_align, ystart)
    mov [rsp+0x44], r11d        ; y_proj_start
    add        r14d, 8
    cmp        r12d, r14d
    cmovs      r14d, r12d       ; imin(y_sb_align + 8, yend)
    mov [rsp+0x3c], r14d        ; y_proj_end
    DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
                ref, x, xendi, mvx, mvy, rb, ref2ref
    mov          xd, [rsp+0x20] ; xstarti
.xloop:
    lea         rbd, [xq*5]
    add         rbq, srcq
    movsx      refd, byte [rbq+4]
    test       refd, refd
    jz .next_x_bad_ref          ; intra / no reference stored
    mov         rfq, [rsp+0x48]
    lea     ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
    mov     ref2refd, [rfq+ref2refq*4] ; rf->mfmv_ref2ref[n][b_ref-1]
    test    ref2refd, ref2refd
    jz .next_x_bad_ref
    ; Scale the stored MV: frac = mv_proj[ref2ref] * ref2cur, then
    ; offset = round(mv * frac >> 14), split into x/y block offsets.
    lea       fracq, [mv_proj]
    movzx     fracd, word [fracq+ref2refq*2]
    mov         mvd, [rbq]
    imul      fracd, [rsp+0x40] ; ref2cur
    pmovsxwq     m0, [rbq]
    movd         m1, fracd
    punpcklqdq   m1, m1
    pmuldq       m0, m1         ; mv * frac
    pshufd       m1, m0, q3311
    paddd        m0, m3
    paddd        m0, m1
    psrad        m0, 14         ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd        m1, m0
    packssdw     m0, m0
    psrld        m1, 6
    packuswb     m1, m1
    pxor         m0, m2         ; offset ^ ref_sign
    psignd       m1, m0         ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq       mvxq, m1
    lea        mvyd, [mvxq+yq]  ; ypos
    sar        mvxq, 32
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
                ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp       yposd, [rsp+0x44] ; y_proj_start
    jl .next_x_bad_pos_y
    cmp       yposd, [rsp+0x3c] ; y_proj_end
    jge .next_x_bad_pos_y
    and       yposd, 15
    add        mvxq, xq         ; xpos
    imul      yposq, [rsp+0x30] ; pos = (ypos & 15) * stride
    DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
                ref, x, xendi, xpos, pos, rb, ref2ref
    mov        dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset
    add        posq, xposq      ; pos += xpos
    lea        posq, [posq*5]
    add        dstq, posq       ; dst += pos5
    jmp .write_loop_entry
.write_loop:
    ; Run-length fast path: consecutive source blocks with identical mv+ref
    ; project to consecutive destinations.
    add         rbq, 5
    cmp        refb, byte [rbq+4]
    jne .xloop
    cmp         mvd, [rbq]
    jne .xloop
    add        dstq, 5
    inc       xposd
.write_loop_entry:
    ; Clamp x writes to [imax((x & ~7) - 8, xstart), imin((x & ~7) + 16, xend)).
    mov        r12d, xd
    and        r12d, ~7
    lea         r5d, [r12-8]
    cmp         r5d, xstartd
    cmovs       r5d, xstartd    ; x_proj_start
    cmp       xposd, r5d
    jl .next_xpos
    add        r12d, 16
    cmp       xendd, r12d
    cmovs      r12d, xendd      ; x_proj_end
    cmp       xposd, r12d
    jge .next_xpos
    mov  [dstq+0], mvd
    mov  byte [dstq+4], ref2refb
.next_xpos:
    inc          xd
    cmp          xd, xendid
    jl .write_loop
.next_y:
    DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add        srcq, [rsp+0x18] ; stride5
    inc          yd
    cmp          yd, [rsp+0x38] ; yend
    jne .yloop
    mov          nd, [rsp+0x14]
    mov     ystartd, [rsp+0x24]
.next_n:
    add         n7d, 7
    inc          nd
    cmp          nd, [rsp+0x0c] ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
    ; Same run-length skip as .write_loop, but for out-of-range positions.
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add         rbq, 5
    cmp        refb, byte [rbq+4]
    jne .xloop
    cmp         mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc          xd
    cmp          xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc          xd
    cmp          xd, xendid
    jl .xloop
    jmp .next_y

INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
; AVX2 variant of save_tmvs: same structure as the ssse3 version above, but
; packs two candidate blocks into one ymm register (one per 128-bit lane).
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea         r12, [.write1]
    movifnidn xendd, xendm
    movifnidn yendd, yendm
    mov     xstartd, xstartm
    mov     ystartd, ystartm
    vpbroadcastq m4, [ref_signq]
    vpbroadcastq m3, [base+save_ref_shuf+8]
    vpbroadcastq m5, [base+save_cond0]
    vpbroadcastq m6, [base+save_cond1]
    vpbroadcastd m7, [base+pb_128]
    mova         m8, [base+save_pack0]
    mova         m9, [base+save_pack1]
    psllq        m4, 8
    lea         r9d, [xendq*5]
    lea     xstartd, [xstartq*5]
    sub       yendd, ystartd
    add     ystartd, ystartd
    lea     strideq, [strideq*5]
    sub     xstartq, r9
    add       xendd, r9d
    add         rpq, r9
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and     ystartd, 30
    mov          xq, xstartq
    mov          bq, [rrq+ystartq*8]
    add     ystartd, 2
    lea          bq, [bq+xendq*4]
.loop_x:
    imul      candq, xq, 0x9999
    sar       candq, 16         ; x / 5 * 3
    movzx      r10d, byte [bq+candq*8+22]  ; cand_b->bs
    movu        xm0, [bq+candq*8+12]       ; cand_b
    movzx      r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx      r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add         r10, r12
    add       candq, r11
    jge .calc
    vinserti128  m0, [bq+candq*8+12], 1
    movzx      r11d, byte [bq+candq*8+22]
    movzx      r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add         r11, r12
.calc:
    pshufb       m1, m0, m3
    pabsw        m2, m0
    pshufb       m1, m4, m1     ; ref > 0 && res_sign[ref - 1]
    psrlw        m2, 12         ; (abs(mv.x) | abs(mv.y)) < 4096
    ; save_tmvs (avx2), continued: merge the two condition vectors into a
    ; byte shuffle that either keeps or zeroes each candidate, then dispatch
    ; to the per-width write helpers (flags from .loop_x select the path).
    pcmpgtd      m1, m2
    pshufd       m2, m1, q2301
    pand         m1, m5         ; b0.cond0 b1.cond0
    pand         m2, m6         ; b0.cond1 b1.cond1
    por          m1, m2         ; b0.shuf b1.shuf
    pxor         m1, m7         ; if cond0|cond1 == 0 => zero out
    pshufb       m0, m1
    call        r10
    jge .next_line
    vextracti128 xm0, m0, 1
    call        r11
    jl .loop_x
.next_line:
    add         rpq, strideq
    dec          hd
    jg .loop_y
    RET
; .write<w>: store w packed 5-byte temporal blocks at rp+x, advance x by 5*w.
.write1:
    movd [rpq+xq+ 0], xm0
    pextrb [rpq+xq+ 4], xm0, 4
    add          xq, 5*1
    ret
.write2:
    movq [rpq+xq+0], xm0
    psrlq       xm1, xm0, 8
    movd [rpq+xq+6], xm1
    add          xq, 5*2
    ret
.write4:
    pshufb      xm1, xm0, xm8
    movu [rpq+xq+ 0], xm1
    psrlq       xm1, 8
    movd [rpq+xq+16], xm1
    add          xq, 5*4
    ret
.write8:
    vinserti128  m1, m0, xm0, 1
    pshufb       m1, m8
    movu [rpq+xq+ 0], m1
    psrldq      xm1, 2
    movq [rpq+xq+32], xm1
    add          xq, 5*8
    ret
.write16:
    vinserti128  m1, m0, xm0, 1
    pshufb       m2, m1, m8
    movu [rpq+xq+ 0], m2
    pshufb       m1, m9
    movu [rpq+xq+32], m1
    shufps      xm2, xm1, q1021
    movu [rpq+xq+64], xm2
    add          xq, 5*16
    ret

; AVX2 splat_mv: same scheme as the sse2 variant, but the 12-byte pattern is
; pre-rotated with splat_mv_shuf so three ymm registers tile 96 bytes.
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add       bx4d, bw4d
    tzcnt     bw4d, bw4d
    vbroadcasti128 m0, [aq]
    lea         aq, [splat_mv_avx2_table]
    lea       bx4q, [bx4q*3-32]
    movsxd    bw4q, [aq+bw4q*4]
    pshufb      m0, [splat_mv_shuf]
    movifnidn bh4d, bh4m
    pshufd      m1, m0, q2102
    pshufd      m2, m0, q1021
    add       bw4q, aq
.loop:
    mov         aq, [rrq]
    add        rrq, gprsize
    lea         aq, [aq+bx4q*4]
    jmp bw4q                    ; wider cases fall through into narrower ones
.w32:
    mova [aq-32*8], m0
    mova [aq-32*7], m1
    mova [aq-32*6], m2
    mova [aq-32*5], m0
    mova [aq-32*4], m1
    mova [aq-32*3], m2
.w16:
    mova [aq-32*2], m0
    mova [aq-32*1], m1
    mova [aq+32*0], m2
.w8:
    mova [aq+32*1], m0
    mova [aq+32*2], m1
    mova [aq+32*3], m2
    dec bh4d
    jg .loop
    RET
.w4:
    movu [aq+ 80], m0
    mova [aq+112], xm1
    dec bh4d
    jg .loop
    RET
.w2:
    movu [aq+104], xm0
    movq [aq+120], xm2
    dec bh4d
    jg .loop
    RET
.w1:
    movq [aq+116], xm0
    movd [aq+124], xm1
    dec bh4d
    jg .loop
    RET

INIT_ZMM avx512icl
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
; AVX-512 variant: up to four candidate blocks per iteration, one per
; 128-bit lane of m0; k2 masks the 5-byte-granular stores in .write*.
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea         r14, [.write1]
    movifnidn xendd, xendm
    movifnidn yendd, yendm
    mov     xstartd, xstartm
    mov     ystartd, ystartm
    psllq        m4, [ref_signq]{bcstq}, 8
    vpbroadcastq m3, [base+save_ref_shuf+8]
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd m7, [base+pb_128]
    mova         m8, [base+save_pack0]
    movu        xm9, [base+save_pack0+4]
    lea         r9d, [xendq*5]
    lea     xstartd, [xstartq*5]
    sub       yendd, ystartd
    add     ystartd, ystartd
    lea     strideq, [strideq*5]
    sub     xstartq, r9
    add       xendd, r9d
    add         rpq, r9
    mov        r10d, 0x1f
    kmovb        k2, r10d
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and     ystartd, 30
    mov          xq, xstartq
    mov          bq, [rrq+ystartq*8]
    add     ystartd, 2
    lea          bq, [bq+xendq*4]
.loop_x:
    imul      candq, xq, 0x9999
    sar       candq, 16         ; x / 5 * 3
    movzx      r10d, byte [bq+candq*8+22]  ; cand_b->bs
    movu        xm0, [bq+candq*8+12]       ; cand_b
    movzx      r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx      r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add         r10, r14
    add       candq, r11
    jge .calc
    movzx      r11d, byte [bq+candq*8+22]
    vinserti32x4 ym0, [bq+candq*8+12], 1
    movzx      r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx      r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add         r11, r14
    add       candq, r12
    jge .calc
    movzx      r12d, byte [bq+candq*8+22]
    vinserti32x4 m0, [bq+candq*8+12], 2
    movzx      r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx      r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add         r12, r14
    add       candq, r13
    jge .calc
    vinserti32x4 m0, [bq+candq*8+12], 3
    movzx      r13d, byte [bq+candq*8+22]
    movzx      r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add         r13, r14
.calc:
    pshufb       m1, m0, m3
    pabsw        m2, m0
    pshufb       m1, m4, m1     ; ref > 0 && res_sign[ref - 1]
    psrlw        m2, 12         ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd        m2, m1
    pshufb       m2, m5         ; c0 c1 c1 c0
    pand         m2, m6
    punpckhqdq   m1, m2, m2
    vpternlogd   m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
    pshufb       m2, m0, m1
    mova        xm0, xm2
    call        r10
    jge .next_line
    vextracti32x4 xm0, m2, 1
    call        r11
    jge .next_line
    vextracti32x4 xm0, m2, 2
    call        r12
    jge .next_line
    vextracti32x4 xm0, m2, 3
    call        r13
    jl .loop_x
.next_line:
    add         rpq, strideq
    dec          hd
    jg .loop_y
    RET
; .write<w>: masked stores of w packed 5-byte blocks (k2 = 5 elements).
.write1:
    vmovdqu8 [rpq+xq]{k2}, xm0
    add          xq, 5*1
    ret
.write2:
    pshufb      xm0, xm8
    vmovdqu16 [rpq+xq]{k2}, xm0
    add          xq, 5*2
    ret
.write4:
    vpermb      ym0, ym8, ym0
    vmovdqu32 [rpq+xq]{k2}, ym0
    add          xq, 5*4
    ret
.write8:
    vpermb       m0, m8, m0
    vmovdqu64 [rpq+xq]{k2}, m0
    add          xq, 5*8
    ret
.write16:
    vpermb       m1, m8, m0
    movu [rpq+xq+ 0], m1
    pshufb      xm0, xm9
    movu [rpq+xq+64], xm0
    add          xq, 5*16
    ret

INIT_ZMM avx512icl
; AVX-512 splat_mv: iterates rows via a negative index r6 so the loop ends
; at zero; k1 masks the sub-register-width stores for the narrow cases.
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4 m0, [aq]
    lea         r1, [splat_mv_avx512icl_table]
    tzcnt     bw4d, bw4d
    lea       bx4d, [bx4q*3]
    pshufb      m0, [splat_mv_shuf]
    movsxd    bw4q, [r1+bw4q*4]
    mov        r6d, bh4m
    add       bw4q, r1
    lea        rrq, [rrq+r6*8]
    mov        r1d, 0x3f
    neg         r6
    kmovb       k1, r1d
    jmp bw4q
.w1:
    mov         r1, [rrq+r6*8]
    vmovdqu16 [r1+bx4q*4]{k1}, xm0
    inc         r6
    jl .w1
    RET
.w2:
    mov         r1, [rrq+r6*8]
    vmovdqu32 [r1+bx4q*4]{k1}, ym0
    inc         r6
    jl .w2
    RET
.w4:
    mov         r1, [rrq+r6*8]
    vmovdqu64 [r1+bx4q*4]{k1}, m0
    inc         r6
    jl .w4
    RET
.w8:
    ; Wide cases process two rows per iteration (r1/r3 row pointers).
    pshufd     ym1, ym0, q1021
.w8_loop:
    mov         r1, [rrq+r6*8+0]
    mov         r3, [rrq+r6*8+8]
    movu [r1+bx4q*4+ 0], m0
    mova [r1+bx4q*4+64], ym1
    movu [r3+bx4q*4+ 0], m0
    mova [r3+bx4q*4+64], ym1
    add         r6, 2
    jl .w8_loop
    RET
.w16:
    pshufd      m1, m0, q1021
    pshufd      m2, m0, q2102
.w16_loop:
    mov         r1, [rrq+r6*8+0]
    mov         r3, [rrq+r6*8+8]
    mova [r1+bx4q*4+64*0], m0
    mova [r1+bx4q*4+64*1], m1
    mova [r1+bx4q*4+64*2], m2
    mova [r3+bx4q*4+64*0], m0
    mova [r3+bx4q*4+64*1], m1
    mova [r3+bx4q*4+64*2], m2
    add         r6, 2
    jl .w16_loop
    RET
.w32:
    pshufd      m1, m0, q1021
    pshufd      m2, m0, q2102
.w32_loop:
    mov         r1, [rrq+r6*8]
    lea         r1, [r1+bx4q*4]
    mova [r1+64*0], m0
    mova [r1+64*1], m1
    mova [r1+64*2], m2
    mova [r1+64*3], m0
    mova [r1+64*4], m1
    mova [r1+64*5], m2
    inc         r6
    jl .w32_loop
    RET
%endif ; ARCH_X86_64