; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; dav1d_obmc_masks[] * -512
const obmc_masks_avx2
    dw      0,      0,  -9728,      0, -12800,  -7168,  -2560,      0
    dw -14336, -11264,  -8192,  -5632,  -3584,  -1536,      0,      0
    dw -15360, -13824, -12288, -10752,  -9216,  -7680,  -6144,  -5120
    dw  -4096,  -3072,  -2048,  -1536,      0,      0,      0,      0
    dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
    dw  -9728,  -8704,  -8192,  -7168,  -6656,  -6144,  -5632,  -4608
    dw  -4096,  -3584,  -3072,  -2560,  -2048,  -2048,  -1536,  -1024
    dw      0,      0,      0,      0,      0,      0,      0,      0

deint_shuf:     dd  0,  4,  1,  5,  2,  6,  3,  7
subpel_h_shufA: db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
subpel_h_shufB: db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
subpel_h_shuf2: db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
subpel_s_shuf2: db  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7
rescale_mul2:   dd  0,  1,  4,  5,  2,  3,  6,  7
resize_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
wswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
bdct_lb_q:      times 8 db 0
                times 8 db 4
                times 8 db 8
                times 8 db 12

prep_mul:         dw 16, 16, 4, 4
put_bilin_h_rnd:  dw 8, 8, 10, 10
put_8tap_h_rnd:   dd 34, 40
s_8tap_h_rnd:     dd 2, 8
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_rnd: dd 512, 128
put_s_8tap_v_sh:  dd 10, 8
prep_8tap_1d_rnd: dd 8 - (8192 << 4)
prep_8tap_2d_rnd: dd 32 - (8192 << 5)
warp8x8t_rnd:     dd 16384 - (8192 << 15)
warp8x8_shift:    dd 5, 3
warp8x8_rnd:      dw 4096, 4096, 16384, 16384
bidir_rnd:        dw -16400, -16400, -16388, -16388
bidir_mul:        dw 2048, 2048, 8192, 8192

%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

pw_2:          times 2 dw 2
pw_64:         times 2 dw 64
pw_2048:       times 2 dw 2048
pw_8192:       times 2 dw 8192
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
pw_m512:       times 2 dw -512
pd_32:         dd 32
pd_63:         dd 63
pd_64:         dd 64
pd_32768:      dd 32768
pd_65538:      dd 65538
pd_m524256:    dd -524256 ; -(8192 << 6) + 32
pd_0x3ff:      dd 0x3ff
pq_0x40000000: dq 0x40000000
               dd 0
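
; Jump tables hold label offsets relative to a base symbol so they are
; position-independent. Each *_table name is biased by the slot of its
; first width so that indexing with tzcnt(w) lands on entry 0 for the
; smallest supported width: e.g. avg_avx2_table = %%table - 2*4, and
; [avg_avx2_table+wq*4] with wq = tzcnt(4) = 2 yields the .w4 entry.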
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx2, 2, 4, 8, 16, 32, 64, 128

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,    4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

INIT_XMM avx2
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    mov          mxyd, r6m ; mx
    lea          r7, [put_avx2]
%if UNIX64
    DECLARE_REG_TMP 8
    %define org_w r8d
    mov          r8d, wd
%else
    DECLARE_REG_TMP 7
    %define org_w wm
%endif
    tzcnt        wd, wm
    movifnidn    hd, hm
    test         mxyd, mxyd
    jnz .h
    mov          mxyd, r7m ; my
    test         mxyd, mxyd
    jnz .v
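; Size dispatch: tzcnt above has already turned the power-of-two width
; into a table index, so the word-sized offset from BASE_JMP_TABLE
; (relative to the put_avx2 base label) is fetched and converted back
; into an absolute jump target with add wq, r7.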
.put:
    movzx        wd, word [r7+wq*2+table_offset(put,)]
    add          wq, r7
    jmp          wq
.put_w2:
    mov          r6d, [srcq+ssq*0]
    mov          r7d, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mov          [dstq+dsq*0], r6d
    mov          [dstq+dsq*1], r7d
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .put_w2
    RET
.put_w4:
    mov          r6, [srcq+ssq*0]
    mov          r7, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mov          [dstq+dsq*0], r6
    mov          [dstq+dsq*1], r7
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .put_w4
    RET
.put_w8:
    movu         m0, [srcq+ssq*0]
    movu         m1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mova         [dstq+dsq*0], m0
    mova         [dstq+dsq*1], m1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .put_w8
    RET
INIT_YMM avx2
.put_w16:
    movu         m0, [srcq+ssq*0]
    movu         m1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    mova         [dstq+dsq*0], m0
    mova         [dstq+dsq*1], m1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .put_w16
    RET
.put_w32:
    movu         m0, [srcq+ssq*0+32*0]
    movu         m1, [srcq+ssq*0+32*1]
    movu         m2, [srcq+ssq*1+32*0]
    movu         m3, [srcq+ssq*1+32*1]
    lea          srcq, [srcq+ssq*2]
    mova         [dstq+dsq*0+32*0], m0
    mova         [dstq+dsq*0+32*1], m1
    mova         [dstq+dsq*1+32*0], m2
    mova         [dstq+dsq*1+32*1], m3
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .put_w32
    RET
.put_w64:
    movu         m0, [srcq+32*0]
    movu         m1, [srcq+32*1]
    movu         m2, [srcq+32*2]
    movu         m3, [srcq+32*3]
    add          srcq, ssq
    mova         [dstq+32*0], m0
    mova         [dstq+32*1], m1
    mova         [dstq+32*2], m2
    mova         [dstq+32*3], m3
    add          dstq, dsq
    dec          hd
    jg .put_w64
    RET
.put_w128:
    movu         m0, [srcq+32*0]
    movu         m1, [srcq+32*1]
    movu         m2, [srcq+32*2]
    movu         m3, [srcq+32*3]
    mova         [dstq+32*0], m0
    mova         [dstq+32*1], m1
    mova         [dstq+32*2], m2
    mova         [dstq+32*3], m3
    movu         m0, [srcq+32*4]
    movu         m1, [srcq+32*5]
    movu         m2, [srcq+32*6]
    movu         m3, [srcq+32*7]
    add          srcq, ssq
    mova         [dstq+32*4], m0
    mova         [dstq+32*5], m1
    mova         [dstq+32*6], m2
    mova         [dstq+32*7], m3
    add          dstq, dsq
    dec          hd
    jg .put_w128
    RET
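; Horizontal bilinear filtering: each .h_w* path computes
; ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4, with rnd loaded per-bitdepth
; from put_bilin_h_rnd.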
.h:
    movd         xm5, mxyd
    mov          mxyd, r7m ; my
    vpbroadcastd m4, [pw_16]
    vpbroadcastw m5, xm5
    psubw        m4, m5
    test         mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx        wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    mov          r6d, r8m ; bitdepth_max
    add          wq, r7
    shr          r6d, 11
    vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
    jmp          wq
.h_w2:
    movq         xm1, [srcq+ssq*0]
    movhps       xm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    pmullw       xm0, xm4, xm1
    psrlq        xm1, 16
    pmullw       xm1, xm5
    paddw        xm0, xm3
    paddw        xm0, xm1
    psrlw        xm0, 4
    movd         [dstq+dsq*0], xm0
    pextrd       [dstq+dsq*1], xm0, 2
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w2
    RET
.h_w4:
    movq         xm0, [srcq+ssq*0]
    movhps       xm0, [srcq+ssq*1]
    movq         xm1, [srcq+ssq*0+2]
    movhps       xm1, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    pmullw       xm0, xm4
    pmullw       xm1, xm5
    paddw        xm0, xm3
    paddw        xm0, xm1
    psrlw        xm0, 4
    movq         [dstq+dsq*0], xm0
    movhps       [dstq+dsq*1], xm0
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w4
    RET
.h_w8:
    movu         xm0, [srcq+ssq*0]
    vinserti128  m0, [srcq+ssq*1], 1
    movu         xm1, [srcq+ssq*0+2]
    vinserti128  m1, [srcq+ssq*1+2], 1
    lea          srcq, [srcq+ssq*2]
    pmullw       m0, m4
    pmullw       m1, m5
    paddw        m0, m3
    paddw        m0, m1
    psrlw        m0, 4
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w8
    RET
.h_w16:
    pmullw       m0, m4, [srcq+ssq*0]
    pmullw       m1, m5, [srcq+ssq*0+2]
    paddw        m0, m3
    paddw        m0, m1
    pmullw       m1, m4, [srcq+ssq*1]
    pmullw       m2, m5, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    paddw        m1, m3
    paddw        m1, m2
    psrlw        m0, 4
    psrlw        m1, 4
    mova         [dstq+dsq*0], m0
    mova         [dstq+dsq*1], m1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw       m0, m4, [srcq+32*0]
    pmullw       m1, m5, [srcq+32*0+2]
    paddw        m0, m3
    paddw        m0, m1
    pmullw       m1, m4, [srcq+32*1]
    pmullw       m2, m5, [srcq+32*1+2]
    add          srcq, ssq
    paddw        m1, m3
    paddw        m1, m2
    psrlw        m0, 4
    psrlw        m1, 4
    mova         [dstq+32*0], m0
    mova         [dstq+32*1], m1
    add          dstq, dsq
    dec          hd
    jg .h_w32
    RET
.h_w64:
.h_w128:
    movifnidn    t0d, org_w
.h_w64_loop0:
    mov          r6d, t0d
.h_w64_loop:
    pmullw       m0, m4, [srcq+r6*2-32*1]
    pmullw       m1, m5, [srcq+r6*2-32*1+2]
    paddw        m0, m3
    paddw        m0, m1
    pmullw       m1, m4, [srcq+r6*2-32*2]
    pmullw       m2, m5, [srcq+r6*2-32*2+2]
    paddw        m1, m3
    paddw        m1, m2
    psrlw        m0, 4
    psrlw        m1, 4
    mova         [dstq+r6*2-32*1], m0
    mova         [dstq+r6*2-32*2], m1
    sub          r6d, 32
    jg .h_w64_loop
    add          srcq, ssq
    add          dstq, dsq
    dec          hd
    jg .h_w64_loop0
    RET
.v:
    movzx        wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    shl          mxyd, 11
    movd         xm5, mxyd
    add          wq, r7
    vpbroadcastw m5, xm5
    jmp          wq
.v_w2:
    movd         xm0, [srcq+ssq*0]
.v_w2_loop:
    movd         xm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    punpckldq    xm2, xm0, xm1
    movd         xm0, [srcq+ssq*0]
    punpckldq    xm1, xm0
    psubw        xm1, xm2
    pmulhrsw     xm1, xm5
    paddw        xm1, xm2
    movd         [dstq+dsq*0], xm1
    pextrd       [dstq+dsq*1], xm1, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq         xm0, [srcq+ssq*0]
.v_w4_loop:
    movq         xm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    punpcklqdq   xm2, xm0, xm1
    movq         xm0, [srcq+ssq*0]
    punpcklqdq   xm1, xm0
    psubw        xm1, xm2
    pmulhrsw     xm1, xm5
    paddw        xm1, xm2
    movq         [dstq+dsq*0], xm1
    movhps       [dstq+dsq*1], xm1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu         xm0, [srcq+ssq*0]
.v_w8_loop:
    vbroadcasti128 m1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    vpblendd     m2, m0, m1, 0xf0
    vbroadcasti128 m0, [srcq+ssq*0]
    vpblendd     m1, m0, 0xf0
    psubw        m1, m2
    pmulhrsw     m1, m5
    paddw        m1, m2
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w8_loop
    RET
.v_w32:
    movu         m0, [srcq+ssq*0+32*0]
    movu         m1, [srcq+ssq*0+32*1]
.v_w32_loop:
    movu         m2, [srcq+ssq*1+32*0]
    movu         m3, [srcq+ssq*1+32*1]
    lea          srcq, [srcq+ssq*2]
    psubw        m4, m2, m0
    pmulhrsw     m4, m5
    paddw        m4, m0
    movu         m0, [srcq+ssq*0+32*0]
    mova         [dstq+dsq*0+32*0], m4
    psubw        m4, m3, m1
    pmulhrsw     m4, m5
    paddw        m4, m1
    movu         m1, [srcq+ssq*0+32*1]
    mova         [dstq+dsq*0+32*1], m4
    psubw        m4, m0, m2
    pmulhrsw     m4, m5
    paddw        m4, m2
    mova         [dstq+dsq*1+32*0], m4
    psubw        m4, m1, m3
    pmulhrsw     m4, m5
    paddw        m4, m3
    mova         [dstq+dsq*1+32*1], m4
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w32_loop
    RET
.v_w16:
.v_w64:
.v_w128:
    movifnidn    t0d, org_w
    add          t0d, t0d
    mov          r4, srcq
    lea          r6d, [hq+t0*8-256]
    mov          r7, dstq
.v_w16_loop0:
    movu         m0, [srcq+ssq*0]
.v_w16_loop:
    movu         m3, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    psubw        m1, m3, m0
    pmulhrsw     m1, m5
    paddw        m1, m0
    movu         m0, [srcq+ssq*0]
    psubw        m2, m0, m3
    pmulhrsw     m2, m5
    paddw        m2, m3
    mova         [dstq+dsq*0], m1
    mova         [dstq+dsq*1], m2
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w16_loop
    add          r4, 32
    add          r7, 32
    movzx        hd, r6b
    mov          srcq, r4
    mov          dstq, r7
    sub          r6d, 1<<8
    jg .v_w16_loop0
    RET
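; 2-pass bilinear hv: the horizontal pass keeps extra fractional bits
; (2 for 12-bit, 4 for 10-bit after the coefficient shift in .hv), the
; vertical pass blends rows as h0 + (((h1-h0)*my) >> 4) using pmulhw
; with my << 11, and the final pmulhrsw by pw_2048/pw_8192 rounds the
; extra bits away.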
.hv:
    movzx        wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM 8
    shl          mxyd, 11
    vpbroadcastd m3, [pw_2]
    movd         xm6, mxyd
    vpbroadcastd m7, [pw_8192]
    add          wq, r7
    vpbroadcastw m6, xm6
    test         dword r8m, 0x800
    jnz .hv_12bpc
    psllw        m4, 2
    psllw        m5, 2
    vpbroadcastd m7, [pw_2048]
.hv_12bpc:
    jmp          wq
.hv_w2:
    vpbroadcastq xm1, [srcq+ssq*0]
    pmullw       xm0, xm4, xm1
    psrlq        xm1, 16
    pmullw       xm1, xm5
    paddw        xm0, xm3
    paddw        xm0, xm1
    psrlw        xm0, 2
.hv_w2_loop:
    movq         xm2, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    movhps       xm2, [srcq+ssq*0]
    pmullw       xm1, xm4, xm2
    psrlq        xm2, 16
    pmullw       xm2, xm5
    paddw        xm1, xm3
    paddw        xm1, xm2
    psrlw        xm1, 2              ; 1 _ 2 _
    shufpd       xm2, xm0, xm1, 0x01 ; 0 _ 1 _
    mova         xm0, xm1
    psubw        xm1, xm2
    paddw        xm1, xm1
    pmulhw       xm1, xm6
    paddw        xm1, xm2
    pmulhrsw     xm1, xm7
    movd         [dstq+dsq*0], xm1
    pextrd       [dstq+dsq*1], xm1, 2
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    pmullw       xm0, xm4, [srcq+ssq*0-8]
    pmullw       xm1, xm5, [srcq+ssq*0-6]
    paddw        xm0, xm3
    paddw        xm0, xm1
    psrlw        xm0, 2
.hv_w4_loop:
    movq         xm1, [srcq+ssq*1]
    movq         xm2, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    movhps       xm1, [srcq+ssq*0]
    movhps       xm2, [srcq+ssq*0+2]
    pmullw       xm1, xm4
    pmullw       xm2, xm5
    paddw        xm1, xm3
    paddw        xm1, xm2
    psrlw        xm1, 2              ; 1 2
    shufpd       xm2, xm0, xm1, 0x01 ; 0 1
    mova         xm0, xm1
    psubw        xm1, xm2
    paddw        xm1, xm1
    pmulhw       xm1, xm6
    paddw        xm1, xm2
    pmulhrsw     xm1, xm7
    movq         [dstq+dsq*0], xm1
    movhps       [dstq+dsq*1], xm1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw       xm0, xm4, [srcq+ssq*0]
    pmullw       xm1, xm5, [srcq+ssq*0+2]
    paddw        xm0, xm3
    paddw        xm0, xm1
    psrlw        xm0, 2
    vinserti128  m0, xm0, 1
.hv_w8_loop:
    movu         xm1, [srcq+ssq*1]
    movu         xm2, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    vinserti128  m1, [srcq+ssq*0], 1
    vinserti128  m2, [srcq+ssq*0+2], 1
    pmullw       m1, m4
    pmullw       m2, m5
    paddw        m1, m3
    paddw        m1, m2
    psrlw        m1, 2            ; 1 2
    vperm2i128   m2, m0, m1, 0x21 ; 0 1
    mova         m0, m1
    psubw        m1, m2
    paddw        m1, m1
    pmulhw       m1, m6
    paddw        m1, m2
    pmulhrsw     m1, m7
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
.hv_w32:
.hv_w64:
.hv_w128:
%if UNIX64
    lea          r6d, [r8*2-32]
%else
    mov          r6d, wm
    lea          r6d, [r6*2-32]
%endif
    mov          r4, srcq
    lea          r6d, [hq+r6*8]
    mov          r7, dstq
.hv_w16_loop0:
    pmullw       m0, m4, [srcq+ssq*0]
    pmullw       m1, m5, [srcq+ssq*0+2]
    paddw        m0, m3
    paddw        m0, m1
    psrlw        m0, 2
.hv_w16_loop:
    pmullw       m1, m4, [srcq+ssq*1]
    pmullw       m2, m5, [srcq+ssq*1+2]
    lea          srcq, [srcq+ssq*2]
    paddw        m1, m3
    paddw        m1, m2
    psrlw        m1, 2
    psubw        m2, m1, m0
    paddw        m2, m2
    pmulhw       m2, m6
    paddw        m2, m0
    pmulhrsw     m2, m7
    mova         [dstq+dsq*0], m2
    pmullw       m0, m4, [srcq+ssq*0]
    pmullw       m2, m5, [srcq+ssq*0+2]
    paddw        m0, m3
    paddw        m0, m2
    psrlw        m0, 2
    psubw        m2, m0, m1
    paddw        m2, m2
    pmulhw       m2, m6
    paddw        m2, m1
    pmulhrsw     m2, m7
    mova         [dstq+dsq*1], m2
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w16_loop
    add          r4, 32
    add          r7, 32
    movzx        hd, r6b
    mov          srcq, r4
    mov          dstq, r7
    sub          r6d, 1<<8
    jg .hv_w16_loop0
    RET

cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn    mxyd, r5m ; mx
    lea          r6, [prep_avx2]
%if UNIX64
    DECLARE_REG_TMP 7
    %define org_w r7d
%else
    DECLARE_REG_TMP 6
    %define org_w r5m
%endif
    mov          org_w, wd
    tzcnt        wd, wm
    movifnidn    hd, hm
    test         mxyd, mxyd
    jnz .h
    mov          mxyd, r6m ; my
    test         mxyd, mxyd
    jnz .v
.prep:
    movzx        wd, word [r6+wq*2+table_offset(prep,)]
    mov          r5d, r7m ; bitdepth_max
    vpbroadcastd m5, [r6-prep_avx2+pw_8192]
    add          wq, r6
    shr          r5d, 11
    vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4]
    lea          stride3q, [strideq*3]
    jmp          wq
.prep_w4:
    movq         xm0, [srcq+strideq*0]
    movhps       xm0, [srcq+strideq*1]
    vpbroadcastq m1, [srcq+strideq*2]
    vpbroadcastq m2, [srcq+stride3q ]
    lea          srcq, [srcq+strideq*4]
    vpblendd     m0, m1, 0x30
    vpblendd     m0, m2, 0xc0
    pmullw       m0, m4
    psubw        m0, m5
    mova         [tmpq], m0
    add          tmpq, 32
    sub          hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movu         xm0, [srcq+strideq*0]
    vinserti128  m0, [srcq+strideq*1], 1
    movu         xm1, [srcq+strideq*2]
    vinserti128  m1, [srcq+stride3q ], 1
    lea          srcq, [srcq+strideq*4]
    pmullw       m0, m4
    pmullw       m1, m4
    psubw        m0, m5
    psubw        m1, m5
    mova         [tmpq+32*0], m0
    mova         [tmpq+32*1], m1
    add          tmpq, 32*2
    sub          hd, 4
    jg .prep_w8
    RET
.prep_w16:
    pmullw       m0, m4, [srcq+strideq*0]
    pmullw       m1, m4, [srcq+strideq*1]
    pmullw       m2, m4, [srcq+strideq*2]
    pmullw       m3, m4, [srcq+stride3q ]
    lea          srcq, [srcq+strideq*4]
    psubw        m0, m5
    psubw        m1, m5
    psubw        m2, m5
    psubw        m3, m5
    mova         [tmpq+32*0], m0
    mova         [tmpq+32*1], m1
    mova         [tmpq+32*2], m2
    mova         [tmpq+32*3], m3
    add          tmpq, 32*4
    sub          hd, 4
    jg .prep_w16
    RET
.prep_w32:
    pmullw       m0, m4, [srcq+strideq*0+32*0]
    pmullw       m1, m4, [srcq+strideq*0+32*1]
    pmullw       m2, m4, [srcq+strideq*1+32*0]
    pmullw       m3, m4, [srcq+strideq*1+32*1]
    lea          srcq, [srcq+strideq*2]
    psubw        m0, m5
    psubw        m1, m5
    psubw        m2, m5
    psubw        m3, m5
    mova         [tmpq+32*0], m0
    mova         [tmpq+32*1], m1
    mova         [tmpq+32*2], m2
    mova         [tmpq+32*3], m3
    add          tmpq, 32*4
    sub          hd, 2
    jg .prep_w32
    RET
.prep_w64:
    pmullw       m0, m4, [srcq+32*0]
    pmullw       m1, m4, [srcq+32*1]
    pmullw       m2, m4, [srcq+32*2]
    pmullw       m3, m4, [srcq+32*3]
    add          srcq, strideq
    psubw        m0, m5
    psubw        m1, m5
    psubw        m2, m5
    psubw        m3, m5
    mova         [tmpq+32*0], m0
    mova         [tmpq+32*1], m1
    mova         [tmpq+32*2], m2
    mova         [tmpq+32*3], m3
    add          tmpq, 32*4
    dec          hd
    jg .prep_w64
    RET
.prep_w128:
    pmullw       m0, m4, [srcq+32*0]
    pmullw       m1, m4, [srcq+32*1]
    pmullw       m2, m4, [srcq+32*2]
    pmullw       m3, m4, [srcq+32*3]
    psubw        m0, m5
    psubw        m1, m5
    psubw        m2, m5
    psubw        m3, m5
    mova         [tmpq+32*0], m0
    mova         [tmpq+32*1], m1
    mova         [tmpq+32*2], m2
    mova         [tmpq+32*3], m3
    pmullw       m0, m4, [srcq+32*4]
    pmullw       m1, m4, [srcq+32*5]
    pmullw       m2, m4, [srcq+32*6]
    pmullw       m3, m4, [srcq+32*7]
    add          tmpq, 32*8
    add          srcq, strideq
    psubw        m0, m5
    psubw        m1, m5
    psubw        m2, m5
    psubw        m3, m5
    mova         [tmpq-32*4], m0
    mova         [tmpq-32*3], m1
    mova         [tmpq-32*2], m2
    mova         [tmpq-32*1], m3
    dec          hd
    jg .prep_w128
    RET
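; prep intermediates are stored as signed 16-bit values biased by -8192
; ((px << 4) - 8192 for 10-bit, (px << 2) - 8192 for 12-bit); in the
; filtered paths below, subtracting pw_32766 before the arithmetic shift
; applies the same bias while keeping the unsigned products in signed
; 16-bit range.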
.h:
    movd         xm5, mxyd
    mov          mxyd, r6m ; my
    vpbroadcastd m4, [pw_16]
    vpbroadcastw m5, xm5
    vpbroadcastd m3, [pw_32766]
    psubw        m4, m5
    test         dword r7m, 0x800
    jnz .h_12bpc
    psllw        m4, 2
    psllw        m5, 2
.h_12bpc:
    test         mxyd, mxyd
    jnz .hv
    movzx        wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add          wq, r6
    lea          stride3q, [strideq*3]
    jmp          wq
.h_w4:
    movu         xm1, [srcq+strideq*0]
    vinserti128  m1, [srcq+strideq*2], 1
    movu         xm2, [srcq+strideq*1]
    vinserti128  m2, [srcq+stride3q ], 1
    lea          srcq, [srcq+strideq*4]
    punpcklqdq   m0, m1, m2
    psrldq       m1, 2
    pslldq       m2, 6
    pmullw       m0, m4
    vpblendd     m1, m2, 0xcc
    pmullw       m1, m5
    psubw        m0, m3
    paddw        m0, m1
    psraw        m0, 2
    mova         [tmpq], m0
    add          tmpq, 32
    sub          hd, 4
    jg .h_w4
    RET
.h_w8:
    movu         xm0, [srcq+strideq*0]
    vinserti128  m0, [srcq+strideq*1], 1
    movu         xm1, [srcq+strideq*0+2]
    vinserti128  m1, [srcq+strideq*1+2], 1
    lea          srcq, [srcq+strideq*2]
    pmullw       m0, m4
    pmullw       m1, m5
    psubw        m0, m3
    paddw        m0, m1
    psraw        m0, 2
    mova         [tmpq], m0
    add          tmpq, 32
    sub          hd, 2
    jg .h_w8
    RET
.h_w16:
    pmullw       m0, m4, [srcq+strideq*0]
    pmullw       m1, m5, [srcq+strideq*0+2]
    psubw        m0, m3
    paddw        m0, m1
    pmullw       m1, m4, [srcq+strideq*1]
    pmullw       m2, m5, [srcq+strideq*1+2]
    lea          srcq, [srcq+strideq*2]
    psubw        m1, m3
    paddw        m1, m2
    psraw        m0, 2
    psraw        m1, 2
    mova         [tmpq+32*0], m0
    mova         [tmpq+32*1], m1
    add          tmpq, 32*2
    sub          hd, 2
    jg .h_w16
    RET
.h_w32:
.h_w64:
.h_w128:
    movifnidn    t0d, org_w
.h_w32_loop0:
    mov          r3d, t0d
.h_w32_loop:
    pmullw       m0, m4, [srcq+r3*2-32*1]
    pmullw       m1, m5, [srcq+r3*2-32*1+2]
    psubw        m0, m3
    paddw        m0, m1
    pmullw       m1, m4, [srcq+r3*2-32*2]
    pmullw       m2, m5, [srcq+r3*2-32*2+2]
    psubw        m1, m3
    paddw        m1, m2
    psraw        m0, 2
    psraw        m1, 2
    mova         [tmpq+r3*2-32*1], m0
    mova         [tmpq+r3*2-32*2], m1
    sub          r3d, 32
    jg .h_w32_loop
    add          srcq, strideq
    lea          tmpq, [tmpq+t0*2]
    dec          hd
    jg .h_w32_loop0
    RET
.v:
    movzx        wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    movd         xm5, mxyd
    vpbroadcastd m4, [pw_16]
    vpbroadcastw m5, xm5
    vpbroadcastd m3, [pw_32766]
    add          wq, r6
    lea          stride3q, [strideq*3]
    psubw        m4, m5
    test         dword r7m, 0x800
    jnz .v_12bpc
    psllw        m4, 2
    psllw        m5, 2
.v_12bpc:
    jmp          wq
.v_w4:
    movq         xm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastq m2, [srcq+strideq*2]
    vpbroadcastq xm1, [srcq+strideq*1]
    vpblendd     m2, m0, 0x03 ; 0 2 2 2
    vpbroadcastq m0, [srcq+stride3q ]
    lea          srcq, [srcq+strideq*4]
    vpblendd     m1, m0, 0xf0 ; 1 1 3 3
    vpbroadcastq m0, [srcq+strideq*0]
    vpblendd     m1, m2, 0x33 ; 0 1 2 3
    vpblendd     m0, m2, 0x0c ; 4 2 4 4
    punpckhqdq   m2, m1, m0   ; 1 2 3 4
    pmullw       m1, m4
    pmullw       m2, m5
    psubw        m1, m3
    paddw        m1, m2
    psraw        m1, 2
    mova         [tmpq], m1
    add          tmpq, 32
    sub          hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movu         xm0, [srcq+strideq*0]
.v_w8_loop:
    vbroadcasti128 m2, [srcq+strideq*1]
    lea          srcq, [srcq+strideq*2]
    vpblendd     m1, m0, m2, 0xf0 ; 0 1
    vbroadcasti128 m0, [srcq+strideq*0]
    vpblendd     m2, m0, 0xf0     ; 1 2
    pmullw       m1, m4
    pmullw       m2, m5
    psubw        m1, m3
    paddw        m1, m2
    psraw        m1, 2
    mova         [tmpq], m1
    add          tmpq, 32
    sub          hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    movu         m0, [srcq+strideq*0]
.v_w16_loop:
    movu         m2, [srcq+strideq*1]
    lea          srcq, [srcq+strideq*2]
    pmullw       m0, m4
    pmullw       m1, m5, m2
    psubw        m0, m3
    paddw        m1, m0
    movu         m0, [srcq+strideq*0]
    psraw        m1, 2
    pmullw       m2, m4
    mova         [tmpq+32*0], m1
    pmullw       m1, m5, m0
    psubw        m2, m3
    paddw        m1, m2
    psraw        m1, 2
    mova         [tmpq+32*1], m1
    add          tmpq, 32*2
    sub          hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
%if WIN64
    PUSH         r7
%endif
    movifnidn    r7d, org_w
    add          r7d, r7d
    mov          r3, srcq
    lea          r6d, [hq+r7*8-256]
    mov          r5, tmpq
.v_w32_loop0:
    movu         m0, [srcq+strideq*0]
.v_w32_loop:
    movu         m2, [srcq+strideq*1]
    lea          srcq, [srcq+strideq*2]
    pmullw       m0, m4
    pmullw       m1, m5, m2
    psubw        m0, m3
    paddw        m1, m0
    movu         m0, [srcq+strideq*0]
    psraw        m1, 2
    pmullw       m2, m4
    mova         [tmpq+r7*0], m1
    pmullw       m1, m5, m0
    psubw        m2, m3
    paddw        m1, m2
    psraw        m1, 2
    mova         [tmpq+r7*1], m1
    lea          tmpq, [tmpq+r7*2]
    sub          hd, 2
    jg .v_w32_loop
    add          r3, 32
    add          r5, 32
    movzx        hd, r6b
    mov          srcq, r3
    mov          tmpq, r5
    sub          r6d, 1<<8
    jg .v_w32_loop0
%if WIN64
    POP          r7
%endif
    RET
.hv:
    WIN64_SPILL_XMM 7
    movzx        wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl          mxyd, 11
    movd         xm6, mxyd
    add          wq, r6
    lea          stride3q, [strideq*3]
    vpbroadcastw m6, xm6
    jmp          wq
.hv_w4:
    movu         xm1, [srcq+strideq*0]
%if WIN64
    movaps       [rsp+24], xmm7
%endif
    pmullw       xm0, xm4, xm1
    psrldq       xm1, 2
    pmullw       xm1, xm5
    psubw        xm0, xm3
    paddw        xm0, xm1
    psraw        xm0, 2
    vpbroadcastq m0, xm0
.hv_w4_loop:
    movu         xm1, [srcq+strideq*1]
    vinserti128  m1, [srcq+stride3q ], 1
    movu         xm2, [srcq+strideq*2]
    lea          srcq, [srcq+strideq*4]
    vinserti128  m2, [srcq+strideq*0], 1
    punpcklqdq   m7, m1, m2
    psrldq       m1, 2
    pslldq       m2, 6
    pmullw       m7, m4
    vpblendd     m1, m2, 0xcc
    pmullw       m1, m5
    psubw        m7, m3
    paddw        m1, m7
    psraw        m1, 2         ; 1 2 3 4
    vpblendd     m0, m1, 0x3f
    vpermq       m2, m0, q2103 ; 0 1 2 3
    mova         m0, m1
    psubw        m1, m2
    pmulhrsw     m1, m6
    paddw        m1, m2
    mova         [tmpq], m1
    add          tmpq, 32
    sub          hd, 4
    jg .hv_w4_loop
%if WIN64
    movaps       xmm7, [rsp+24]
%endif
    RET
.hv_w8:
    pmullw       xm0, xm4, [srcq+strideq*0]
    pmullw       xm1, xm5, [srcq+strideq*0+2]
    psubw        xm0, xm3
    paddw        xm0, xm1
    psraw        xm0, 2
    vinserti128  m0, xm0, 1
.hv_w8_loop:
    movu         xm1, [srcq+strideq*1]
    movu         xm2, [srcq+strideq*1+2]
    lea          srcq, [srcq+strideq*2]
    vinserti128  m1, [srcq+strideq*0], 1
    vinserti128  m2, [srcq+strideq*0+2], 1
    pmullw       m1, m4
    pmullw       m2, m5
    psubw        m1, m3
    paddw        m1, m2
    psraw        m1, 2            ; 1 2
    vperm2i128   m2, m0, m1, 0x21 ; 0 1
    mova         m0, m1
    psubw        m1, m2
    pmulhrsw     m1, m6
    paddw        m1, m2
    mova         [tmpq], m1
    add          tmpq, 32
    sub          hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
.hv_w32:
.hv_w64:
.hv_w128:
%if WIN64
    PUSH         r7
%endif
    movifnidn    r7d, org_w
    add          r7d, r7d
    mov          r3, srcq
    lea          r6d, [hq+r7*8-256]
    mov          r5, tmpq
.hv_w16_loop0:
    pmullw       m0, m4, [srcq]
    pmullw       m1, m5, [srcq+2]
    psubw        m0, m3
    paddw        m0, m1
    psraw        m0, 2
.hv_w16_loop:
    pmullw       m1, m4, [srcq+strideq*1]
    pmullw       m2, m5, [srcq+strideq*1+2]
    lea          srcq, [srcq+strideq*2]
    psubw        m1, m3
    paddw        m1, m2
    psraw        m1, 2
    psubw        m2, m1, m0
    pmulhrsw     m2, m6
    paddw        m2, m0
    mova         [tmpq+r7*0], m2
    pmullw       m0, m4, [srcq+strideq*0]
    pmullw       m2, m5, [srcq+strideq*0+2]
    psubw        m0, m3
    paddw        m0, m2
    psraw        m0, 2
    psubw        m2, m0, m1
    pmulhrsw     m2, m6
    paddw        m2, m1
    mova         [tmpq+r7*1], m2
    lea          tmpq, [tmpq+r7*2]
    sub          hd, 2
    jg .hv_w16_loop
    add          r3, 32
    add          r5, 32
    movzx        hd, r6b
    mov          srcq, r3
    mov          tmpq, r5
    sub          r6d, 1<<8
    jg .hv_w16_loop0
%if WIN64
    POP          r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov          t0d, FILTER_%3
%ifidn %3, %4
    mov          t1d, t0d
%else
    mov          t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp          mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
    imul         mxd, mxm, 0x010101
    add          mxd, t0d ; 6tap_h, mx, 4tap_h
    imul         myd, mym, 0x010101
    add          myd, t1d ; 6tap_v, my, 4tap_v
    lea          r8, [put_avx2]
    movifnidn    wd, wm
    movifnidn    hd, hm
    test         mxd, 0xf00
    jnz .h
    test         myd, 0xf00
    jnz .v
.put:
    tzcnt        wd, wd
    movzx        wd, word [r8+wq*2+table_offset(put,)]
    add          wq, r8
%if WIN64
    pop          r8
%endif
    jmp          wq
.h_w2:
    movzx        mxd, mxb
    sub          srcq, 2
    mova         xm2, [subpel_h_shuf2]
    vpbroadcastd xm3, [base+subpel_filters+mxq*8+2]
    pmovsxbw     xm3, xm3
.h_w2_loop:
    movu         xm0, [srcq+ssq*0]
    movu         xm1, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    pshufb       xm0, xm2
    pshufb       xm1, xm2
    pmaddwd      xm0, xm3
    pmaddwd      xm1, xm3
    phaddd       xm0, xm1
    paddd        xm0, xm4
    psrad        xm0, 6
    packusdw     xm0, xm0
    pminsw       xm0, xm5
    movd         [dstq+dsq*0], xm0
    pextrd       [dstq+dsq*1], xm0, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movzx        mxd, mxb
    sub          srcq, 2
    pmovsxbw     xm3, [base+subpel_filters+mxq*8]
    WIN64_SPILL_XMM 8
    vbroadcasti128 m6, [subpel_h_shufA]
    vbroadcasti128 m7, [subpel_h_shufB]
    pshufd       xm3, xm3, q2211
    vpbroadcastq m2, xm3
    vpermq       m3, m3, q1111
.h_w4_loop:
    movu         xm1, [srcq+ssq*0]
    vinserti128  m1, [srcq+ssq*1], 1
    lea          srcq, [srcq+ssq*2]
    pshufb       m0, m1, m6 ; 0 1 1 2 2 3 3 4
    pshufb       m1, m7     ; 2 3 3 4 4 5 5 6
    pmaddwd      m0, m2
    pmaddwd      m1, m3
    paddd        m0, m4
    paddd        m0, m1
    psrad        m0, 6
    vextracti128 xm1, m0, 1
    packusdw     xm0, xm1
    pminsw       xm0, xm5
    movq         [dstq+dsq*0], xm0
    movhps       [dstq+dsq*1], xm0
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w4_loop
    RET
.h:
    test         myd, 0xf00
    jnz .hv
    mov          r7d, r8m
    vpbroadcastw m5, r8m
    shr          r7d, 11
    vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
    cmp          wd, 4
    je .h_w4
    jl .h_w2
    WIN64_SPILL_XMM 11
    shr          mxd, 16
    sub          srcq, 4
    vpbroadcastq m0, [base+subpel_filters+1+mxq*8]
    vbroadcasti128 m6, [base+subpel_h_shufA]
    punpcklbw    m0, m0
    psraw        m0, 8 ; sign-extend
    pshufd       m7, m0, q0000
    pshufd       m8, m0, q1111
    pshufd       m9, m0, q2222
    sub          wd, 16
    jge .h_w16
.h_w8:
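; 6-tap horizontal core: subpel_h_shufA replicates the source into dwords
; of adjacent word pairs (01 12 23 34), so each pmaddwd applies one pair
; of filter taps; m7/m8/m9 hold tap pairs 0-1/2-3/4-5 and the a*/b*
; partial sums below cover the low/high output halves.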
%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb       m%1, m6        ; 01 12 23 34
    pshufb       m%2, m6        ; 45 56 67 78
    pmaddwd      m%4, m7, m%1   ; a0
    pshufb       m%3, m6        ; 89 9a ab bc
    pmaddwd      m%5, m9, m%2   ; a2
    shufpd       m%1, m%2, 0x05 ; 23 34 45 56
    paddd        m%4, m%5       ; a0+a2
    pmaddwd      m%5, m7, m%2   ; b0
    shufpd       m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd      m%3, m9        ; b2
    pmaddwd      m%1, m8        ; a1
    pmaddwd      m%2, m8        ; b1
    paddd        m%3, m%5       ; b0+b2
    paddd        m%4, m4
    paddd        m%3, m4
    paddd        m%1, m%4
    paddd        m%2, m%3
    psrad        m%1, 6
    psrad        m%2, 6
    packusdw     m%1, m%2
    pminsw       m%1, m5
%endmacro
    movu         xm0, [srcq+ssq*0+ 0]
    vinserti128  m0, [srcq+ssq*1+ 0], 1
    movu         xm2, [srcq+ssq*0+16]
    vinserti128  m2, [srcq+ssq*1+16], 1
    shufpd       m1, m0, m2, 0x05
    lea          srcq, [srcq+ssq*2]
    PUT_6TAP_H   0, 1, 2, 3, 10
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w8
    RET
.h_w16:
    mov          r6d, wd
.h_w16_loop:
    movu         m0, [srcq+r6*2+ 0]
    movu         m1, [srcq+r6*2+ 8]
    movu         m2, [srcq+r6*2+16]
    PUT_6TAP_H   0, 1, 2, 3, 10
    mova         [dstq+r6*2], m0
    sub          r6d, 16
    jge .h_w16_loop
    add          srcq, ssq
    add          dstq, dsq
    dec          hd
    jg .h_w16
    RET
.v:
    movzx        mxd, myb
    shr          myd, 16
    cmp          hd, 6
    cmovs        myd, mxd
    vpbroadcastq m0, [base+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 10, 12
    vpbroadcastd m5, [pd_32]
    vpbroadcastw m6, r8m
    punpcklbw    m0, m0
    mov          r6, ssq
    psraw        m0, 8 ; sign-extend
    neg          r6
    pshufd       m7, m0, q0000
    pshufd       m8, m0, q1111
    pshufd       m9, m0, q2222
    cmp          wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    movd         xm2, [srcq+r6 *2]
    pinsrd       xm2, [srcq+r6 *1], 1
    pinsrd       xm2, [srcq+ssq*0], 2
    pinsrd       xm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea          srcq, [srcq+ssq*2]
    movd         xm0, [srcq+ssq*0]
    palignr      xm3, xm0, xm2, 4 ; 1 2 3 4
    punpcklwd    xm1, xm2, xm3    ; 01 12
    punpckhwd    xm2, xm3         ; 23 34
.v_w2_loop:
    movd         xm3, [srcq+ssq*1]
    pmaddwd      xm4, xm7, xm1 ; a0 b0
    mova         xm1, xm2
    pmaddwd      xm2, xm8      ; a1 b1
    lea          srcq, [srcq+ssq*2]
    paddd        xm4, xm2
    punpckldq    xm2, xm0, xm3 ; 4 5
    movd         xm0, [srcq+ssq*0]
    punpckldq    xm3, xm0      ; 5 6
    punpcklwd    xm2, xm3      ; 45 56
    pmaddwd      xm3, xm9, xm2 ; a2 b2
    paddd        xm4, xm5
    paddd        xm4, xm3
    psrad        xm4, 6
    packusdw     xm4, xm4
    pminsw       xm4, xm6
    movd         [dstq+dsq*0], xm4
    pextrd       [dstq+dsq*1], xm4, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq         xm1, [srcq+r6 *2]
    vpbroadcastq m3, [srcq+r6 *1]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd     m1, m3, 0x30
    vpblendd     m3, m2, 0x30
    punpcklwd    m1, m3 ; 01 12
    vpblendd     m2, m4, 0x30
    vpblendd     m4, m0, 0x30
    punpcklwd    m2, m4 ; 23 34
.v_w4_loop:
    vpbroadcastq m3, [srcq+ssq*1]
    pmaddwd      m4, m7, m1 ; a0 b0
    mova         m1, m2
    pmaddwd      m2, m8     ; a1 b1
    lea          srcq, [srcq+ssq*2]
    paddd        m4, m2
    vpblendd     m2, m0, m3, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd     m3, m0, 0x30
    punpcklwd    m2, m3     ; 45 56
    pmaddwd      m3, m9, m2 ; a2 b2
    paddd        m4, m5
    paddd        m4, m3
    psrad        m4, 6
    vextracti128 xm3, m4, 1
    packusdw     xm4, xm3
    pminsw       xm4, xm6
    movq         [dstq+dsq*0], xm4
    movhps       [dstq+dsq*1], xm4
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w4_loop
    RET
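; .v_w8 walks the block in 8-pixel-wide columns and packs both loop
; counters into wd = h + w*32 - 256: movzx hd, wb restores the row count
; for each column and sub wd, 1<<8 counts down the remaining columns.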
.v_w8:
    shl          wd, 5
    WIN64_PUSH_XMM 12
    lea          wd, [hq+wq-256]
.v_w8_loop0:
    vbroadcasti128 m3, [srcq+r6 *2]
    vbroadcasti128 m4, [srcq+r6 *1]
    lea          r7, [srcq+ssq*2]
    vbroadcasti128 m0, [srcq+ssq*0]
    vbroadcasti128 m1, [srcq+ssq*1]
    mov          r8, dstq
    vbroadcasti128 m2, [r7+ssq*0]
    shufpd       m3, m0, 0x0c
    shufpd       m4, m1, 0x0c
    punpcklwd    m1, m3, m4 ; 01
    punpckhwd    m3, m4     ; 23
    shufpd       m0, m2, 0x0c
    punpcklwd    m2, m4, m0 ; 12
    punpckhwd    m4, m0     ; 34
.v_w8_loop:
    vbroadcasti128 m5, [r7+ssq*1]
    pmaddwd      m10, m7, m1 ; a0
    lea          r7, [r7+ssq*2]
    pmaddwd      m11, m7, m2 ; b0
    mova         m1, m3
    pmaddwd      m3, m8      ; a1
    mova         m2, m4
    pmaddwd      m4, m8      ; b1
    paddd        m10, m3
    vbroadcasti128 m3, [r7+ssq*0]
    paddd        m11, m4
    shufpd       m4, m0, m5, 0x0d
    shufpd       m0, m5, m3, 0x0c
    punpcklwd    m3, m4, m0 ; 45
    punpckhwd    m4, m0     ; 56
    pmaddwd      m5, m9, m3 ; a2
    paddd        m10, m5
    pmaddwd      m5, m9, m4 ; b2
    paddd        m5, m11
    psrad        m10, 5
    psrad        m5, 5
    packusdw     m10, m5
    pxor         m5, m5
    pavgw        m5, m10
    pminsw       m5, m6
    vpermq       m5, m5, q3120
    mova         [r8+dsq*0], xm5
    vextracti128 [r8+dsq*1], m5, 1
    lea          r8, [r8+dsq*2]
    sub          hd, 2
    jg .v_w8_loop
    add          srcq, 16
    add          dstq, 16
    movzx        hd, wb
    sub          wd, 1<<8
    jg .v_w8_loop0
    RET
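; 6-tap hv: horizontal results are packed to words after >> 10 and the
; vertical pass accumulates in dwords with another >> 10; the 12-bit
; path moves 2 bits of precision from the horizontal coefficients into
; the vertical ones (psraw/psllw below) so the word-packed intermediates
; have the same scale at either bitdepth.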
.hv:
    WIN64_SPILL_XMM 12, 16
    vpbroadcastd m10, [pd_512]
    vpbroadcastw m11, r8m
    cmp          wd, 4
    jg .hv_w8
    movzx        mxd, mxb
    vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
    movzx        mxd, myb
    shr          myd, 16
    cmp          hd, 6
    cmovs        myd, mxd
    vpbroadcastq m1, [base+subpel_filters+1+myq*8]
    mov          r6, ssq
    sub          srcq, 2
    neg          r6
    pxor         m6, m6
    punpcklbw    m6, m0
    punpcklbw    m1, m1
    psraw        m1, 8 ; sign-extend
    test         dword r8m, 0x800
    jz .hv_10bit
    psraw        m6, 2
    psllw        m1, 2
.hv_10bit:
    pshufd       m7, m1, q0000
    pshufd       m8, m1, q1111
    pshufd       m9, m1, q2222
    cmp          wd, 4
    je .hv_w4
    vbroadcasti128 m5, [subpel_h_shuf2]
    vbroadcasti128 m0, [srcq+ssq*0]
    vinserti128  m2, m0, [srcq+r6*2], 1 ; 2 0
    movu         xm1, [srcq+ssq*1]
    vinserti128  m1, [srcq+r6 *1], 1    ; 3 1
    lea          srcq, [srcq+ssq*2]
    vinserti128  m0, [srcq+ssq*0], 0    ; 4 2
    REPX {pshufb x, m5}, m2, m1, m0
    REPX {pmaddwd x, m6}, m2, m1, m0
    phaddd       m2, m1
    phaddd       m1, m0
    paddd        m2, m10
    paddd        m1, m10
    psrad        m2, 10
    psrad        m1, 10
    packssdw     m2, m1 ; 2 3 3 4 0 1 1 2
    punpckhqdq   m0, m2, m2
    punpcklwd    m2, m0     ; 23 34
    vextracti128 xm1, m2, 1 ; 01 12
.hv_w2_loop:
    movu         xm3, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    movu         xm4, [srcq+ssq*0]
    pshufb       xm3, xm5
    pshufb       xm4, xm5
    pmaddwd      xm3, xm6
    pmaddwd      xm4, xm6
    phaddd       xm3, xm4
    pmaddwd      xm4, xm7, xm1 ; a0 b0
    mova         xm1, xm2
    pmaddwd      xm2, xm8      ; a1 b1
    paddd        xm4, xm2
    paddd        xm3, xm10
    psrad        xm3, 10
    packssdw     xm3, xm3
    palignr      xm2, xm3, xm0, 12
    mova         xm0, xm3
    punpcklwd    xm2, xm0      ; 45 56
    pmaddwd      xm3, xm9, xm2 ; a2 b2
    paddd        xm4, xm10
    paddd        xm4, xm3
    psrad        xm4, 10
    packusdw     xm4, xm4
    pminsw       xm4, xm11
    movd         [dstq+dsq*0], xm4
    pextrd       [dstq+dsq*1], xm4, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    WIN64_PUSH_XMM 14
    vbroadcasti128 m12, [subpel_h_shufA]
    pshufd       m5, m6, q0000
    vbroadcasti128 m13, [subpel_h_shufB]
    pshufd       m6, m6, q1111
    movu         xm2, [srcq+r6 *2]
    vinserti128  m2, [srcq+r6 *1], 1 ; 0 1
    movu         xm0, [srcq+ssq*0]
    vinserti128  m0, [srcq+ssq*1], 1 ; 2 3
    lea          srcq, [srcq+ssq*2]
    movu         xm3, [srcq+ssq*0]   ; 4
    pshufb       m1, m2, m12
    pmaddwd      m1, m5
    pshufb       m2, m13
    pmaddwd      m2, m6
    pshufb       m4, m0, m12
    pmaddwd      m4, m5
    pshufb       m0, m13
    pmaddwd      m0, m6
    paddd        m2, m1
    pshufb       xm1, xm3, xm12
    pmaddwd      xm1, xm5
    pshufb       xm3, xm13
    pmaddwd      xm3, xm6
    paddd        m0, m4
    paddd        m2, m10
    paddd        xm1, xm10
    paddd        m0, m10
    paddd        xm3, xm1
    REPX {psrad x, 10}, m2, m0, xm3
    packssdw     m2, m0   ; 0 2 1 3
    packssdw     xm0, xm3 ; 2 4
    vperm2i128   m0, m2, 0x03
    punpcklwd    m1, m2, m0 ; 01 12
    punpckhwd    m2, m0     ; 23 34
.hv_w4_loop:
    movu         xm3, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    vinserti128  m3, [srcq+ssq*0], 1
    pmaddwd      m4, m7, m1 ; a0 b0
    mova         m1, m2
    pmaddwd      m2, m8     ; a1 b1
    paddd        m4, m2
    pshufb       m2, m3, m12
    pmaddwd      m2, m5
    pshufb       m3, m13
    pmaddwd      m3, m6
    paddd        m2, m10
    paddd        m3, m2
    psrad        m3, 10
    packssdw     m3, m3 ; 5 5 6 6
    vperm2i128   m2, m0, m3, 0x21
    mova         m0, m3
    punpckhwd    m2, m3     ; 45 56
    pmaddwd      m3, m9, m2 ; a2 b2
    paddd        m4, m10
    paddd        m4, m3
    psrad        m4, 10
    vextracti128 xm3, m4, 1
    packusdw     xm4, xm3
    pminsw       xm4, xm11
    movq         [dstq+dsq*0], xm4
    movhps       [dstq+dsq*1], xm4
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    WIN64_PUSH_XMM 16, 12
    shr          mxd, 16
    vbroadcasti128 m12, [subpel_h_shufA]
    vpbroadcastq m2, [base+subpel_filters+1+mxq*8]
    movzx        mxd, myb
    shr          myd, 16
    cmp          hd, 6
    cmovs        myd, mxd
    pmovsxbw     xm1, [base+subpel_filters+1+myq*8]
    shl          wd, 5
    mov          r6, ssq
    sub          srcq, 4
    pxor         m0, m0
    neg          r6
    punpcklbw    m0, m2
    lea          wd, [hq+wq-256]
    test         dword r8m, 0x800
    jz .hv_w8_10bit
    psraw        m0, 2
    psllw        xm1, 2
.hv_w8_10bit:
    pshufd       m7, m0, q0000
    pshufd       m8, m0, q1111
%if WIN64
    %define v_mul (rsp+stack_offset+40) ; r4m
%else
    %define v_mul (rsp+stack_offset+ 8) ; r6m
%endif
    mova         [v_mul], xm1
    pshufd       m9, m0, q2222
.hv_w8_loop0:
    vbroadcasti128 m0, [srcq+ssq*0+ 0]
    vinserti128  m3, m0, [srcq+r6*2+ 0], 0
    lea          r7, [srcq+ssq*2]
    vbroadcasti128 m2, [srcq+ssq*0+16]
    vinserti128  m1, m2, [srcq+r6*2+16], 0
    mov          r8, dstq
    vinserti128  m0, [r7 +ssq*0+ 0], 1
    vinserti128  m2, [r7 +ssq*0+16], 1
    shufpd       m4, m3, m1, 0x05
%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb       m%1, m12       ; 01 12 23 34
    pshufb       m%2, m12       ; 45 56 67 78
    pmaddwd      m%4, m7, m%1   ; a0
    pshufb       m%3, m12       ; 89 9a ab bc
    pmaddwd      m%5, m9, m%2   ; a2
    shufpd       m%1, m%2, 0x05 ; 23 34 45 56
    paddd        m%4, m%5       ; a0+a2
    pmaddwd      m%5, m7, m%2   ; b0
    shufpd       m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd      m%3, m9        ; b2
    pmaddwd      m%1, m8        ; a1
    pmaddwd      m%2, m8        ; b1
    paddd        m%3, m%5       ; b0+b2
    paddd        m%4, m10
    paddd        m%3, m10
    paddd        m%1, m%4
    paddd        m%2, m%3
    psrad        m%1, 10
    psrad        m%2, 10
    packssdw     m%1, m%2
%endmacro
    PUT_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2
    movu         xm4, [srcq+r6 *1+ 0]
    vinserti128  m4, [srcq+ssq*1+ 0], 1
    shufpd       m1, m0, m2, 0x05
    PUT_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4
    movu         xm2, [srcq+r6 *1+16]
    vinserti128  m2, [srcq+ssq*1+16], 1
    shufpd       m1, m4, m2, 0x05
    PUT_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3
    vpermq       m3, m3, q3120
    vpermq       m4, m4, q3120
    vpermq       m0, m0, q3120
    punpcklwd    m1, m3, m4 ; 01
    punpckhwd    m3, m4     ; 23
    punpcklwd    m2, m4, m0 ; 12
    punpckhwd    m4, m0     ; 34
.hv_w8_loop:
    vpbroadcastd m15, [v_mul+4*0]
    vpbroadcastd m13, [v_mul+4*1]
    movu         xm5, [r7+ssq*1+ 0]
    movu         xm6, [r7+ssq*1+16]
    lea          r7, [r7+ssq*2]
    pmaddwd      m14, m15, m1 ; a0
    pmaddwd      m15, m2      ; b0
    vinserti128  m5, [r7+ssq*0+ 0], 1
    vinserti128  m6, [r7+ssq*0+16], 1
    mova         m1, m3
    pmaddwd      m3, m13 ; a1
    mova         m2, m4
    pmaddwd      m4, m13 ; b1
    paddd        m14, m3
    shufpd       m3, m5, m6, 0x05
    paddd        m15, m4
    PUT_6TAP_HV_H 5, 3, 6, 4, 13 ; 5 6
    vpbroadcastd m6, [v_mul+4*2]
    vpermq       m5, m5, q3120
    shufpd       m4, m0, m5, 0x05
    mova         m0, m5
    punpcklwd    m3, m4, m5 ; 45
    punpckhwd    m4, m5     ; 56
    pmaddwd      m5, m6, m3 ; a2
    pmaddwd      m6, m4     ; b2
    paddd        m14, m10
    paddd        m15, m10
    paddd        m5, m14
    paddd        m6, m15
    psrad        m5, 10
    psrad        m6, 10
    packusdw     m5, m6
    pminsw       m5, m11
    vpermq       m5, m5, q3120
    mova         [r8+dsq*0], xm5
    vextracti128 [r8+dsq*1], m5, 1
    lea          r8, [r8+dsq*2]
    sub          hd, 2
    jg .hv_w8_loop
    add          srcq, 16
    add          dstq, 16
    movzx        hd, wb
    sub          wd, 1<<8
    jg .hv_w8_loop0
    RET

PUT_8TAP_FN smooth_sharp,  SMOOTH,  SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_smooth,  SHARP,   SMOOTH,  put_8tap_16bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_regular, SHARP,   REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp,         SHARP,   SHARP

cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
    imul         mxd, mxm, 0x010101
    add          mxd, t0d ; 8tap_h, mx, 4tap_h
    imul         myd, mym, 0x010101
    add          myd, t1d ; 8tap_v, my, 4tap_v
    lea          r8, [put_avx2]
    movifnidn    wd, wm
    movifnidn    hd, hm
    test         mxd, 0xf00
    jnz .h
    test         myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
.v:
    movzx        mxd, myb
    shr          myd, 16
    cmp          hd, 6
    cmovs        myd, mxd
    vpbroadcastq m0, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM 12, 15
    vpbroadcastd m6, [pd_32]
    vpbroadcastw m7, r8m
    lea          r6, [ssq*3]
    sub          srcq, r6
    punpcklbw    m0, m0
    psraw        m0, 8 ; sign-extend
    pshufd       m8, m0, q0000
    pshufd       m9, m0, q1111
    pshufd       m10, m0, q2222
    pshufd       m11, m0, q3333
    cmp          wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    movd         xm2, [srcq+ssq*0]
    pinsrd       xm2, [srcq+ssq*1], 1
    pinsrd       xm2, [srcq+ssq*2], 2
    pinsrd       xm2, [srcq+r6 ], 3 ; 0 1 2 3
    lea          srcq, [srcq+ssq*4]
    movd         xm3, [srcq+ssq*0]
    vpbroadcastd xm1, [srcq+ssq*1]
    vpbroadcastd xm0, [srcq+ssq*2]
    add          srcq, r6
    vpblendd     xm3, xm1, 0x02   ; 4 5
    vpblendd     xm1, xm0, 0x02   ; 5 6
    palignr      xm4, xm3, xm2, 4 ; 1 2 3 4
    punpcklwd    xm3, xm1         ; 45 56
    punpcklwd    xm1, xm2, xm4    ; 01 12
    punpckhwd    xm2, xm4         ; 23 34
.v_w2_loop:
    vpbroadcastd xm4, [srcq+ssq*0]
    pmaddwd      xm5, xm8, xm1 ; a0 b0
    mova         xm1, xm2
    pmaddwd      xm2, xm9      ; a1 b1
    paddd        xm5, xm6
    paddd        xm5, xm2
    mova         xm2, xm3
    pmaddwd      xm3, xm10     ; a2 b2
    paddd        xm5, xm3
    vpblendd     xm3, xm0, xm4, 0x02 ; 6 7
    vpbroadcastd xm0, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    vpblendd     xm4, xm0, 0x02 ; 7 8
    punpcklwd    xm3, xm4       ; 67 78
    pmaddwd      xm4, xm11, xm3 ; a3 b3
    paddd        xm5, xm4
    psrad        xm5, 6
    packusdw     xm5, xm5
    pminsw       xm5, xm7
    movd         [dstq+dsq*0], xm5
    pextrd       [dstq+dsq*1], xm5, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w2_loop
    RET
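; 8-tap vertical: rows are kept as interleaved pairs (01 12 / 23 34 /
; 45 56) so each pmaddwd applies two taps to two output rows; per
; iteration the pairs shift down one slot and only the new 67 78 pair
; is built from freshly loaded rows.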
.v_w4:
    movq         xm1, [srcq+ssq*0]
    vpbroadcastq m0, [srcq+ssq*1]
    vpbroadcastq m2, [srcq+ssq*2]
    vpbroadcastq m4, [srcq+r6 ]
    lea          srcq, [srcq+ssq*4]
    vpbroadcastq m3, [srcq+ssq*0]
    vpbroadcastq m5, [srcq+ssq*1]
    vpblendd     m1, m0, 0x30
    vpblendd     m0, m2, 0x30
    punpcklwd    m1, m0 ; 01 12
    vpbroadcastq m0, [srcq+ssq*2]
    add          srcq, r6
    vpblendd     m2, m4, 0x30
    vpblendd     m4, m3, 0x30
    punpcklwd    m2, m4 ; 23 34
    vpblendd     m3, m5, 0x30
    vpblendd     m5, m0, 0x30
    punpcklwd    m3, m5 ; 45 56
.v_w4_loop:
    vpbroadcastq m4, [srcq+ssq*0]
    pmaddwd      m5, m8, m1 ; a0 b0
    mova         m1, m2
    pmaddwd      m2, m9     ; a1 b1
    paddd        m5, m6
    paddd        m5, m2
    mova         m2, m3
    pmaddwd      m3, m10    ; a2 b2
    paddd        m5, m3
    vpblendd     m3, m0, m4, 0x30
    vpbroadcastq m0, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    vpblendd     m4, m0, 0x30
    punpcklwd    m3, m4      ; 67 78
    pmaddwd      m4, m11, m3 ; a3 b3
    paddd        m5, m4
    psrad        m5, 6
    vextracti128 xm4, m5, 1
    packusdw     xm5, xm4
    pminsw       xm5, xm7
    movq         [dstq+dsq*0], xm5
    movhps       [dstq+dsq*1], xm5
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    shl          wd, 5
    WIN64_PUSH_XMM 15
    lea          wd, [hq+wq-256]
.v_w8_loop0:
    vbroadcasti128 m4, [srcq+ssq*0]
    vbroadcasti128 m5, [srcq+ssq*1]
    lea          r7, [srcq+ssq*4]
    vbroadcasti128 m0, [srcq+r6 ]
    vbroadcasti128 m6, [srcq+ssq*2]
    mov          r8, dstq
    vbroadcasti128 m1, [r7+ssq*0]
    vbroadcasti128 m2, [r7+ssq*1]
    vbroadcasti128 m3, [r7+ssq*2]
    add          r7, r6
    shufpd       m4, m0, 0x0c
    shufpd       m5, m1, 0x0c
    punpcklwd    m1, m4, m5 ; 01
    punpckhwd    m4, m5     ; 34
    shufpd       m6, m2, 0x0c
    punpcklwd    m2, m5, m6 ; 12
    punpckhwd    m5, m6     ; 45
    shufpd       m0, m3, 0x0c
    punpcklwd    m3, m6, m0 ; 23
    punpckhwd    m6, m0     ; 56
.v_w8_loop:
    vbroadcasti128 m14, [r7+ssq*0]
    pmaddwd      m12, m8, m1 ; a0
    pmaddwd      m13, m8, m2 ; b0
    mova         m1, m3
    mova         m2, m4
    pmaddwd      m3, m9 ; a1
    pmaddwd      m4, m9 ; b1
    paddd        m12, m3
    paddd        m13, m4
    mova         m3, m5
    mova         m4, m6
    pmaddwd      m5, m10 ; a2
    pmaddwd      m6, m10 ; b2
    paddd        m12, m5
    vbroadcasti128 m5, [r7+ssq*1]
    lea          r7, [r7+ssq*2]
    paddd        m13, m6
    shufpd       m6, m0, m14, 0x0d
    shufpd       m0, m14, m5, 0x0c
    punpcklwd    m5, m6, m0 ; 67
    punpckhwd    m6, m0     ; 78
    pmaddwd      m14, m11, m5 ; a3
    paddd        m12, m14
    pmaddwd      m14, m11, m6 ; b3
    paddd        m13, m14
    psrad        m12, 5
    psrad        m13, 5
    packusdw     m12, m13
    pxor         m13, m13
    pavgw        m12, m13
    pminsw       m12, m7
    vpermq       m12, m12, q3120
    mova         [r8+dsq*0], xm12
    vextracti128 [r8+dsq*1], m12, 1
    lea          r8, [r8+dsq*2]
    sub          hd, 2
    jg .v_w8_loop
    add          srcq, 16
    add          dstq, 16
    movzx        hd, wb
    sub          wd, 1<<8
    jg .v_w8_loop0
    RET
.h:
    RESET_STACK_STATE
    test         myd, 0xf00
    jnz .hv
    mov          r7d, r8m
    vpbroadcastw m5, r8m
    shr          r7d, 11
    vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
    cmp          wd, 4
    jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
    je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
    WIN64_SPILL_XMM 13
    shr          mxd, 16
    sub          srcq, 6
    vpbroadcastq m0, [base+subpel_filters+mxq*8]
    vbroadcasti128 m6, [subpel_h_shufA]
    vbroadcasti128 m7, [subpel_h_shufB]
    punpcklbw    m0, m0
    psraw        m0, 8 ; sign-extend
    pshufd       m8, m0, q0000
    pshufd       m9, m0, q1111
    pshufd       m10, m0, q2222
    pshufd       m11, m0, q3333
    sub          wd, 16
    jge .h_w16
.h_w8:
%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb       m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
    pshufb       m%1, m6        ; 0 1 1 2 2 3 3 4
    pmaddwd      m%5, m9, m%4   ; abcd1
    pmaddwd      m%1, m8        ; abcd0
    pshufb       m%2, m7        ; 6 7 7 8 8 9 9 a
    shufpd       m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd        m%5, m4
    paddd        m%1, m%5
    pmaddwd      m%5, m11, m%2  ; abcd3
    paddd        m%1, m%5
    pmaddwd      m%5, m10, m%4  ; abcd2
    pshufb       m%3, m7        ; a b b c c d d e
    pmaddwd      m%4, m8        ; efgh0
    paddd        m%1, m%5
    pmaddwd      m%5, m9, m%2   ; efgh1
    shufpd       m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd      m%3, m11       ; efgh3
    pmaddwd      m%2, m10       ; efgh2
    paddd        m%4, m4
    paddd        m%4, m%5
    paddd        m%3, m%4
    paddd        m%2, m%3
    psrad        m%1, 6
    psrad        m%2, 6
    packusdw     m%1, m%2
    pminsw       m%1, m5
%endmacro
    movu         xm0, [srcq+ssq*0+ 0]
    vinserti128  m0, [srcq+ssq*1+ 0], 1
    movu         xm2, [srcq+ssq*0+16]
    vinserti128  m2, [srcq+ssq*1+16], 1
    lea          srcq, [srcq+ssq*2]
    shufpd       m1, m0, m2, 0x05
    PUT_8TAP_H   0, 1, 2, 3, 12
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .h_w8
    RET
.h_w16:
    mov          r6d, wd
.h_w16_loop:
    movu         m0, [srcq+r6*2+ 0]
    movu         m1, [srcq+r6*2+ 8]
    movu         m2, [srcq+r6*2+16]
    PUT_8TAP_H   0, 1, 2, 3, 12
    mova         [dstq+r6*2], m0
    sub          r6d, 16
    jge .h_w16_loop
    add          srcq, ssq
    add          dstq, dsq
    dec          hd
    jg .h_w16
    RET
.hv:
    WIN64_SPILL_XMM 16
    vpbroadcastw m15, r8m
    cmp          wd, 4
    jg .hv_w8
    movzx        mxd, mxb
    vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
    movzx        mxd, myb
    shr          myd, 16
    cmp          hd, 6
    cmovs        myd, mxd
    vpbroadcastq m1, [base+subpel_filters+myq*8]
    vpbroadcastd m6, [pd_512]
    lea          r6, [ssq*3]
    sub          srcq, 2
    sub          srcq, r6
    pxor         m7, m7
    punpcklbw    m7, m0
    punpcklbw    m1, m1
    psraw        m1, 8 ; sign-extend
    test         dword r8m, 0x800
    jz .hv_10bit
    psraw        m7, 2
    psllw        m1, 2
.hv_10bit:
    pshufd       m11, m1, q0000
    pshufd       m12, m1, q1111
    pshufd       m13, m1, q2222
    pshufd       m14, m1, q3333
    cmp          wd, 4
    je .hv_w4
    vbroadcasti128 m9, [subpel_h_shuf2]
    vbroadcasti128 m1, [srcq+r6 ] ; 3 3
    movu         xm3, [srcq+ssq*2]
    movu         xm0, [srcq+ssq*0]
    movu         xm2, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*4]
    vinserti128  m3, [srcq+ssq*0], 1 ; 2 4
    vinserti128  m0, [srcq+ssq*1], 1 ; 0 5
    vinserti128  m2, [srcq+ssq*2], 1 ; 1 6
    add          srcq, r6
    pshufb       m1, m9
    pshufb       m3, m9
    pshufb       m0, m9
    pshufb       m2, m9
    pmaddwd      m1, m7
    pmaddwd      m3, m7
    pmaddwd      m0, m7
    pmaddwd      m2, m7
    phaddd       m1, m3
    phaddd       m0, m2
    paddd        m1, m6
    paddd        m0, m6
    psrad        m1, 10
    psrad        m0, 10
    packssdw     m1, m0          ; 3 2 0 1
    vextracti128 xm0, m1, 1      ; 3 4 5 6
    pshufd       xm2, xm1, q1301 ; 2 3 1 2
    pshufd       xm3, xm0, q2121 ; 4 5 4 5
    punpckhwd    xm1, xm2 ; 01 12
    punpcklwd    xm2, xm0 ; 23 34
    punpckhwd    xm3, xm0 ; 45 56
.hv_w2_loop:
    movu         xm4, [srcq+ssq*0]
    movu         xm5, [srcq+ssq*1]
    lea          srcq, [srcq+ssq*2]
    pshufb       xm4, xm9
    pshufb       xm5, xm9
    pmaddwd      xm4, xm7
    pmaddwd      xm5, xm7
    phaddd       xm4, xm5
    pmaddwd      xm5, xm11, xm1 ; a0 b0
    mova         xm1, xm2
    pmaddwd      xm2, xm12      ; a1 b1
    paddd        xm5, xm2
    mova         xm2, xm3
    pmaddwd      xm3, xm13      ; a2 b2
    paddd        xm5, xm3
    paddd        xm4, xm6
    psrad        xm4, 10
    packssdw     xm4, xm4
    palignr      xm3, xm4, xm0, 12
    mova         xm0, xm4
    punpcklwd    xm3, xm0       ; 67 78
    pmaddwd      xm4, xm14, xm3 ; a3 b3
    paddd        xm5, xm6
    paddd        xm5, xm4
    psrad        xm5, 10
    packusdw     xm5, xm5
    pminsw       xm5, xm15
    movd         [dstq+dsq*0], xm5
    pextrd       [dstq+dsq*1], xm5, 1
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    vbroadcasti128 m9, [subpel_h_shufA]
    vbroadcasti128 m10, [subpel_h_shufB]
    pshufd       m8, m7, q1111
    pshufd       m7, m7, q0000
    movu         xm1, [srcq+ssq*0]
    vinserti128  m1, [srcq+ssq*1], 1 ; 0 1
    vbroadcasti128 m0, [srcq+r6 ]
    vinserti128  m2, m0, [srcq+ssq*2], 0 ; 2 3
    lea          srcq, [srcq+ssq*4]
    vinserti128  m0, [srcq+ssq*0], 1 ; 3 4
    movu         xm3, [srcq+ssq*1]
    vinserti128  m3, [srcq+ssq*2], 1 ; 5 6
    add          srcq, r6
    pshufb       m4, m1, m9
    pshufb       m1, m10
    pmaddwd      m4, m7
    pmaddwd      m1, m8
    pshufb       m5, m2, m9
    pshufb       m2, m10
    pmaddwd      m5, m7
    pmaddwd      m2, m8
    paddd        m4, m6
    paddd        m1, m4
    pshufb       m4, m0, m9
    pshufb       m0, m10
    pmaddwd      m4, m7
    pmaddwd      m0, m8
    paddd        m5, m6
    paddd        m2, m5
    pshufb       m5, m3, m9
    pshufb       m3, m10
    pmaddwd      m5, m7
    pmaddwd      m3, m8
    paddd        m4, m6
    paddd        m4, m0
    paddd        m5, m6
    paddd        m5, m3
    vperm2i128   m0, m1, m2, 0x21
    psrld        m1, 10
    psrld        m2, 10
    vperm2i128   m3, m4, m5, 0x21
    pslld        m4, 6
    pslld        m5, 6
    pblendw      m2, m4, 0xaa ; 23 34
    pslld        m0, 6
    pblendw      m1, m0, 0xaa ; 01 12
    psrld        m3, 10
    pblendw      m3, m5, 0xaa ; 45 56
    psrad        m0, m5, 16
.hv_w4_loop:
    movu         xm4, [srcq+ssq*0]
    vinserti128  m4, [srcq+ssq*1], 1
    lea          srcq, [srcq+ssq*2]
    pmaddwd      m5, m11, m1 ; a0 b0
    mova         m1, m2
    pmaddwd      m2, m12     ; a1 b1
    paddd        m5, m6
    paddd        m5, m2
    mova         m2, m3
    pmaddwd      m3, m13     ; a2 b2
    paddd        m5, m3
    pshufb       m3, m4, m9
    pshufb       m4, m10
    pmaddwd      m3, m7
    pmaddwd      m4, m8
    paddd        m3, m6
    paddd        m4, m3
    psrad        m4, 10
    packssdw     m0, m4       ; _ 7 6 8
    vpermq       m3, m0, q1122 ; _ 6 _ 7
    punpckhwd    m3, m0       ; 67 78
    mova         m0, m4
    pmaddwd      m4, m14, m3  ; a3 b3
    paddd        m4, m5
    psrad        m4, 10
    vextracti128 xm5, m4, 1
    packusdw     xm4, xm5
    pminsw       xm4, xm15
    movq         [dstq+dsq*0], xm4
    movhps       [dstq+dsq*1], xm4
    lea          dstq, [dstq+dsq*2]
    sub          hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr          mxd, 16
    vpbroadcastq m2, [base+subpel_filters+mxq*8]
    movzx        mxd, myb
    shr          myd, 16
    cmp          hd, 6
    cmovs        myd, mxd
    pmovsxbw     xm1, [base+subpel_filters+myq*8]
    shl          wd, 5
    lea          r6, [ssq*3]
    sub          srcq, 6
    pxor         m0, m0
    sub          srcq, r6
    punpcklbw    m0, m2
    lea          wd, [hq+wq-256]
    test         dword r8m, 0x800
    jz .hv_w8_10bit
    psraw        m0, 2
    psllw        xm1, 2
.hv_w8_10bit:
    pshufd       m11, m0, q0000
    pshufd       m12, m0, q1111
    mova         [v_mul], xm1
    pshufd       m13, m0, q2222
    pshufd       m14, m0, q3333
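; All 16 ymm registers are live in the 8-tap hv loop, so the vertical
; filter coefficients are spilled to the stack (v_mul) and reloaded
; with vpbroadcastd inside the loop.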
.hv_w8_loop0:
%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
    pshufb       m2, m%1, m9   ; 2 3 3 4 4 5 5 6
    pshufb       m%1, m8       ; 0 1 1 2 2 3 3 4
    pmaddwd      m3, m12, m2
    pmaddwd      m%1, m11
    pshufb       m%2, m9       ; 6 7 7 8 8 9 9 a
    shufpd       m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd        m3, m10
    paddd        m%1, m3
    pmaddwd      m3, m14, m%2
    paddd        m%1, m3
    pmaddwd      m3, m13, m2
    pshufb       m%3, m9       ; a b b c c d d e
    pmaddwd      m2, m11
    paddd        m%1, m3
    pmaddwd      m3, m12, m%2
    shufpd       m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd      m%3, m14
    pmaddwd      m%2, m13
    paddd        m2, m10
    paddd        m2, m3
    paddd        m%3, m2
    paddd        m%2, m%3
    psrad        m%1, 10
    psrad        m%2, 10
    packssdw     m%1, m%2
%endmacro
    movu         xm4, [srcq+r6 *1+ 0]
    vbroadcasti128 m8, [subpel_h_shufA]
    lea          r7, [srcq+ssq*4]
    movu         xm6, [srcq+r6 *1+ 8]
    vbroadcasti128 m9, [subpel_h_shufB]
    mov          r8, dstq
    movu         xm0, [srcq+r6 *1+16]
    vpbroadcastd m10, [pd_512]
    movu         xm5, [srcq+ssq*0+ 0]
    vinserti128  m5, [r7 +ssq*0+ 0], 1
    movu         xm1, [srcq+ssq*0+16]
    vinserti128  m1, [r7 +ssq*0+16], 1
    shufpd       m7, m5, m1, 0x05
INIT_XMM avx2
    PUT_8TAP_HV_H 4, 6, 0 ; 3
INIT_YMM avx2
    PUT_8TAP_HV_H 5, 7, 1 ; 0 4
    movu         xm0, [srcq+ssq*2+ 0]
    vinserti128  m0, [srcq+r6 *2+ 0], 1
    movu         xm1, [srcq+ssq*2+16]
    vinserti128  m1, [srcq+r6 *2+16], 1
    shufpd       m7, m0, m1, 0x05
    PUT_8TAP_HV_H 0, 7, 1 ; 2 6
    movu         xm6, [srcq+ssq*1+ 0]
    movu         xm1, [srcq+ssq*1+16]
    vinserti128  m6, [r7 +ssq*1+ 0], 1
    vinserti128  m1, [r7 +ssq*1+16], 1
    add          r7, r6
    shufpd       m7, m6, m1, 0x05
    PUT_8TAP_HV_H 6, 7, 1 ; 1 5
    vpermq       m4, m4, q1100
    vpermq       m5, m5, q3120
    vpermq       m6, m6, q3120
    vpermq       m7, m0, q3120
    punpcklwd    m3, m7, m4 ; 23
    punpckhwd    m4, m5     ; 34
    punpcklwd    m1, m5, m6 ; 01
    punpckhwd    m5, m6     ; 45
    punpcklwd    m2, m6, m7 ; 12
    punpckhwd    m6, m7     ; 56
.hv_w8_loop:
    vpbroadcastd m9, [v_mul+4*0]
    vpbroadcastd m7, [v_mul+4*1]
    vpbroadcastd m10, [v_mul+4*2]
    pmaddwd      m8, m9, m1 ; a0
    pmaddwd      m9, m2     ; b0
    mova         m1, m3
    mova         m2, m4
    pmaddwd      m3, m7 ; a1
    pmaddwd      m4, m7 ; b1
    paddd        m8, m3
    paddd        m9, m4
    mova         m3, m5
    mova         m4, m6
    pmaddwd      m5, m10 ; a2
    pmaddwd      m6, m10 ; b2
    paddd        m8, m5
    paddd        m9, m6
    movu         xm5, [r7+ssq*0]
    vinserti128  m5, [r7+ssq*1], 1
    vbroadcasti128 m7, [subpel_h_shufA]
    vbroadcasti128 m10, [subpel_h_shufB]
    movu         xm6, [r7+ssq*0+16]
    vinserti128  m6, [r7+ssq*1+16], 1
    vextracti128 [r8], m0, 1
    pshufb       m0, m5, m7 ; 01
    pshufb       m5, m10    ; 23
    pmaddwd      m0, m11
    pmaddwd      m5, m12
    paddd        m0, m5
    pshufb       m5, m6, m7 ; 89
    pshufb       m6, m10    ; ab
    pmaddwd      m5, m13
    pmaddwd      m6, m14
    paddd        m6, m5
    movu         xm5, [r7+ssq*0+8]
    vinserti128  m5, [r7+ssq*1+8], 1
    lea          r7, [r7+ssq*2]
    pshufb       m7, m5, m7
    pshufb       m5, m10
    pmaddwd      m10, m13, m7
    pmaddwd      m7, m11
    paddd        m0, m10
    vpbroadcastd m10, [pd_512]
    paddd        m6, m7
    pmaddwd      m7, m14, m5
    pmaddwd      m5, m12
    paddd        m0, m7
    paddd        m5, m6
    vbroadcasti128 m6, [r8]
    paddd        m8, m10
    paddd        m9, m10
    paddd        m0, m10
    paddd        m5, m10
    vpbroadcastd m10, [v_mul+4*3]
    psrad        m0, 10
    psrad        m5, 10
    packssdw     m0, m5
    vpermq       m7, m0, q3120 ; 7 8
    shufpd       m6, m7, 0x04  ; 6 7
    punpcklwd    m5, m6, m7 ; 67
    punpckhwd    m6, m7     ; 78
    pmaddwd      m7, m10, m5 ; a3
    pmaddwd      m10, m6     ; b3
    paddd        m7, m8
    paddd        m9, m10
    psrad        m7, 10
    psrad        m9, 10
    packusdw     m7, m9
    pminsw       m7, m15
    vpermq       m7, m7, q3120
    mova         [r8+dsq*0], xm7
    vextracti128 [r8+dsq*1], m7, 1
    lea          r8, [r8+dsq*2]
    sub          hd, 2
    jg .hv_w8_loop
    add          srcq, 16
    add          dstq, 16
    movzx        hd, wb
    sub          wd, 1<<8
    jg .hv_w8_loop0
    RET

%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR

cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx2
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r7, [prep_avx2]
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.prep:
    tzcnt wd, wd
    mov r6d, r7m ; bitdepth_max
    movzx wd, word [r7+wq*2+table_offset(prep,)]
    vpbroadcastd m5, [r7-prep_avx2+pw_8192]
    shr r6d, 11
    add wq, r7
    vpbroadcastd m4, [base+prep_mul+r6*4]
    lea r6, [ssq*3]
%if WIN64
    pop r7
%endif
    jmp wq
.h_w4:
    movzx mxd, mxb
    sub srcq, 2
    pmovsxbw xm0, [base+subpel_filters+mxq*8]
    vbroadcasti128 m3, [subpel_h_shufA]
    lea r6, [ssq*3]
    vbroadcasti128 m4, [subpel_h_shufB]
    WIN64_SPILL_XMM 8
    pshufd xm0, xm0, q2211
    test dword r7m, 0x800
    jnz .h_w4_12bpc
    psllw xm0, 2
.h_w4_12bpc:
    vpbroadcastq m6, xm0
    vpermq m7, m0, q1111
.h_w4_loop:
    movu xm1, [srcq+ssq*0]
    vinserti128 m1, [srcq+ssq*2], 1
    movu xm2, [srcq+ssq*1]
    vinserti128 m2, [srcq+r6 *1], 1
    lea srcq, [srcq+ssq*4]
    pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
    pshufb m1, m4 ; 2 3 3 4 4 5 5 6
    pmaddwd m0, m6
    pmaddwd m1, m7
    paddd m0, m5
    paddd m0, m1
    pshufb m1, m2, m3
    pshufb m2, m4
    pmaddwd m1, m6
    pmaddwd m2, m7
    paddd m1, m5
    paddd m1, m2
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    mova [tmpq], m0
    add tmpq, 32
    sub hd, 4
    jg .h_w4_loop
    RET
.h:
    test myd, 0xf00
    jnz .hv
    vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
    cmp wd, 4
    je .h_w4
    shr mxd, 16
    sub srcq, 4
    vpbroadcastq m0, [base+subpel_filters+1+mxq*8]
    WIN64_SPILL_XMM 10
    vbroadcasti128 m6, [subpel_h_shufA]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    test dword r7m, 0x800
    jnz .h_12bpc
    psllw m0, 2
.h_12bpc:
    pshufd m7, m0, q0000
    pshufd m8, m0, q1111
    pshufd m9, m0, q2222
    cmp wd, 8
    jg .h_w16
.h_w8:
    movu xm0, [srcq+ssq*0+ 0]
    vinserti128 m0, [srcq+ssq*1+ 0], 1
    movu xm2, [srcq+ssq*0+16]
    vinserti128 m2, [srcq+ssq*1+16], 1
    lea srcq, [srcq+ssq*2]
    shufpd m1, m0, m2, 0x05
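; PREP_6TAP_H: 6-tap horizontal filter, eight outputs per 128-bit lane
; (a = output pixels 0-3, b = 4-7). %1-%3 hold src+0/+8/+16, %4-%5 are
; scratch. Taps are applied pairwise with pmaddwd; the prep output is
; (sum + rnd) >> 4 with rnd = 8 - (8192 << 4), folding the rounding
; offset and the -8192 intermediate bias into a single constant.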
%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb m%1, m6 ; 01 12 23 34
    pshufb m%2, m6 ; 45 56 67 78
    pmaddwd m%4, m7, m%1 ; a0
    pshufb m%3, m6 ; 89 9a ab bc
    pmaddwd m%5, m9, m%2 ; a2
    shufpd m%1, m%2, 0x05 ; 23 34 45 56
    paddd m%4, m%5 ; a0+a2
    pmaddwd m%5, m7, m%2 ; b0
    shufpd m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd m%3, m9 ; b2
    pmaddwd m%1, m8 ; a1
    pmaddwd m%2, m8 ; b1
    paddd m%3, m%5 ; b0+b2
    paddd m%4, m5
    paddd m%3, m5
    paddd m%1, m%4
    paddd m%2, m%3
    psrad m%1, 4
    psrad m%2, 4
    packssdw m%1, m%2
%endmacro
    PREP_6TAP_H 0, 1, 2, 3, 4
    mova [tmpq], m0
    add tmpq, 32
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    add wd, wd
.h_w16_loop0:
    mov r6d, wd
.h_w16_loop:
    movu m0, [srcq+r6-32]
    movu m1, [srcq+r6-24]
    movu m2, [srcq+r6-16]
    PREP_6TAP_H 0, 1, 2, 3, 4
    mova [tmpq+r6-32], m0
    sub r6d, 32
    jg .h_w16_loop
    add srcq, ssq
    add tmpq, wq
    dec hd
    jg .h_w16_loop0
    RET
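; Vertical 6-tap prep. The signed-byte filter is expanded to words via
; punpcklbw + psraw 8 (sign extension); when the 12-bit flag in r7m is
; clear (10-bit content) the coefficients are scaled by 4 (psllw 2) so
; the fixed >> 4 prep shift yields a consistent intermediate range.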
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m0, [base+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 9, 12
    vpbroadcastd m5, [prep_8tap_1d_rnd]
    mov r6, ssq
    punpcklbw m0, m0
    neg r6
    psraw m0, 8 ; sign-extend
    test dword r7m, 0x800
    jnz .v_12bpc
    psllw m0, 2
.v_12bpc:
    pshufd m6, m0, q0000
    pshufd m7, m0, q1111
    pshufd m8, m0, q2222
    cmp wd, 4
    jg .v_w8
.v_w4:
    movq xm1, [srcq+r6 *2]
    vpbroadcastq m3, [srcq+r6 *1]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m1, m3, 0x30
    vpblendd m3, m2, 0x30
    punpcklwd m1, m3 ; 01 12
    vpblendd m2, m4, 0x30
    vpblendd m4, m0, 0x30
    punpcklwd m2, m4 ; 23 34
.v_w4_loop:
    vpbroadcastq m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m4, m6, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m7 ; a1 b1
    paddd m4, m2
    vpblendd m2, m0, m3, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m3, m0, 0x30
    punpcklwd m2, m3 ; 45 56
    pmaddwd m3, m8, m2 ; a2 b2
    paddd m4, m5
    paddd m4, m3
    psrad m4, 4
    vextracti128 xm3, m4, 1
    packssdw xm4, xm3
    mova [tmpq], xm4
    add tmpq, 16
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    WIN64_PUSH_XMM 12
%if WIN64
    push r8
%endif
    mov r8d, wd
    shl wd, 5
    lea wd, [hq+wq-256]
.v_w8_loop0:
    vbroadcasti128 m3, [srcq+r6 *2]
    vbroadcasti128 m4, [srcq+r6 *1]
    lea r5, [srcq+ssq*2]
    vbroadcasti128 m0, [srcq+ssq*0]
    vbroadcasti128 m1, [srcq+ssq*1]
    mov r7, tmpq
    vbroadcasti128 m2, [r5+ssq*0]
    shufpd m3, m0, 0x0c
    shufpd m4, m1, 0x0c
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    shufpd m0, m2, 0x0c
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.v_w8_loop:
    vbroadcasti128 m9, [r5+ssq*1]
    pmaddwd m10, m6, m1 ; a0
    lea r5, [r5+ssq*2]
    pmaddwd m11, m6, m2 ; b0
    mova m1, m3
    pmaddwd m3, m7 ; a1
    mova m2, m4
    pmaddwd m4, m7 ; b1
    paddd m10, m5
    paddd m11, m5
    paddd m10, m3
    vbroadcasti128 m3, [r5+ssq*0]
    paddd m11, m4
    shufpd m4, m0, m9, 0x0d
    shufpd m0, m9, m3, 0x0c
    punpcklwd m3, m4, m0 ; 45
    punpckhwd m4, m0 ; 56
    pmaddwd m9, m8, m3 ; a2
    paddd m10, m9
    pmaddwd m9, m8, m4 ; b2
    paddd m11, m9
    psrad m10, 4
    psrad m11, 4
    packssdw m10, m11
    vpermq m10, m10, q3120
    mova [r7+r8*0], xm10
    vextracti128 [r7+r8*2], m10, 1
    lea r7, [r7+r8*4]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 16
    add tmpq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w8_loop0
%if WIN64
    pop r8
%endif
    RET
.hv:
    WIN64_SPILL_XMM 13, 15
    vpbroadcastd m7, [prep_8tap_2d_rnd]
    vbroadcasti128 m8, [subpel_h_shufA]
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    pxor m6, m6
    neg r6
    punpcklbw m6, m0
    punpcklbw m1, m1
    psraw m6, 4
    psraw m1, 8
    test dword r7m, 0x800
    jz .hv_w4_10bit
    psraw m6, 2
.hv_w4_10bit:
    pshufd m10, m1, q0000
    pshufd m11, m1, q1111
    pshufd m12, m1, q2222
.hv_w4:
    movu xm2, [srcq+r6 *2]
    vinserti128 m2, [srcq+r6 *1], 1 ; 0 1
    pshufd m5, m6, q0000
    vbroadcasti128 m9, [base+subpel_h_shufB]
    movu xm0, [srcq+ssq*0]
    pshufd m6, m6, q1111
    vinserti128 m0, [srcq+ssq*1], 1 ; 2 3
    lea srcq, [srcq+ssq*2]
    movu xm3, [srcq+ssq*0] ; 4
    pshufb m1, m2, m8
    pmaddwd m1, m5
    pshufb m2, m9
    pmaddwd m2, m6
    pshufb m4, m0, m8
    pmaddwd m4, m5
    pshufb m0, m9
    pmaddwd m0, m6
    paddd m2, m1
    pshufb xm1, xm3, xm8
    pmaddwd xm1, xm5
    pshufb xm3, xm9
    pmaddwd xm3, xm6
    paddd m0, m4
    paddd m2, m7
    paddd xm1, xm7
    paddd m0, m7
    paddd xm3, xm1
    REPX {psrad x, 6}, m2, m0, xm3
    packssdw m2, m0 ; 0 2 1 3
    packssdw xm0, xm3 ; 2 4
    vperm2i128 m0, m2, 0x03
    punpcklwd m1, m2, m0 ; 01 12
    punpckhwd m2, m0 ; 23 34
.hv_w4_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti128 m3, [srcq+ssq*0], 1
    pmaddwd m4, m10, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m11 ; a1 b1
    paddd m4, m2
    pshufb m2, m3, m8
    pmaddwd m2, m5
    pshufb m3, m9
    pmaddwd m3, m6
    paddd m2, m7
    paddd m3, m2
    psrad m3, 6
    packssdw m3, m3 ; 5 5 6 6
    vperm2i128 m2, m0, m3, 0x21
    mova m0, m3
    punpckhwd m2, m3 ; 45 56
    pmaddwd m3, m12, m2 ; a2 b2
    paddd m4, m7
    paddd m4, m3
    psrad m4, 6
    vextracti128 xm3, m4, 1
    packssdw xm4, xm3
    mova [tmpq], xm4
    add tmpq, 16
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr mxd, 16
    vpbroadcastq m2, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    pmovsxbw xm1, [base+subpel_filters+1+myq*8]
    WIN64_PUSH_XMM 15
%if WIN64
    PUSH r8
%endif
    mov r8d, wd
    shl wd, 5
    mov r6, ssq
    sub srcq, 4
    neg r6
    lea wd, [hq+wq-256]
    pxor m0, m0
    punpcklbw m0, m2
    psraw m0, 4
    test dword r7m, 0x800
    jz .hv_w8_10bit
    psraw m0, 2
.hv_w8_10bit:
    pshufd m10, m0, q0000
    pshufd m11, m0, q1111
    mova [v_mul], xm1
    pshufd m12, m0, q2222
.hv_w8_loop0:
    vbroadcasti128 m0, [srcq+ssq*0+ 0]
    vinserti128 m3, m0, [srcq+r6*2+ 0], 0
    lea r5, [srcq+ssq*2]
    vbroadcasti128 m2, [srcq+ssq*0+16]
    vinserti128 m1, m2, [srcq+r6*2+16], 0
    mov r7, tmpq
    vinserti128 m0, [r5 +ssq*0+ 0], 1
    vinserti128 m2, [r5 +ssq*0+16], 1
    shufpd m4, m3, m1, 0x05
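; PREP_6TAP_HV_H: horizontal stage of the 6-tap 2D path, same layout as
; PREP_6TAP_H but shifting by 6 with prep_8tap_2d_rnd (32 - (8192 << 5)),
; so rounding and the sign bias are again folded into one constant; the
; vertical stage below adds the same constant and shifts by 6 once more.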
%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb m%1, m8 ; 01 12 23 34
    pshufb m%2, m8 ; 45 56 67 78
    pmaddwd m%4, m10, m%1 ; a0
    pshufb m%3, m8 ; 89 9a ab bc
    pmaddwd m%5, m12, m%2 ; a2
    shufpd m%1, m%2, 0x05 ; 23 34 45 56
    paddd m%4, m%5 ; a0+a2
    pmaddwd m%5, m10, m%2 ; b0
    shufpd m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd m%3, m12 ; b2
    pmaddwd m%1, m11 ; a1
    pmaddwd m%2, m11 ; b1
    paddd m%3, m%5 ; b0+b2
    paddd m%4, m7
    paddd m%3, m7
    paddd m%1, m%4
    paddd m%2, m%3
    psrad m%1, 6
    psrad m%2, 6
    packssdw m%1, m%2
%endmacro
    PREP_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2
    movu xm4, [srcq+r6 *1+ 0]
    vinserti128 m4, [srcq+ssq*1+ 0], 1
    shufpd m1, m0, m2, 0x05
    PREP_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4
    movu xm2, [srcq+r6 *1+16]
    vinserti128 m2, [srcq+ssq*1+16], 1
    shufpd m1, m4, m2, 0x05
    PREP_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3
    vpermq m3, m3, q3120
    vpermq m4, m4, q3120
    vpermq m0, m0, q3120
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.hv_w8_loop:
    vpbroadcastd m14, [v_mul+4*0]
    vpbroadcastd m9, [v_mul+4*1]
    movu xm5, [r5+ssq*1+ 0]
    movu xm6, [r5+ssq*1+16]
    lea r5, [r5+ssq*2]
    pmaddwd m13, m14, m1 ; a0
    pmaddwd m14, m2 ; b0
    vinserti128 m5, [r5+ssq*0+ 0], 1
    vinserti128 m6, [r5+ssq*0+16], 1
    mova m1, m3
    pmaddwd m3, m9 ; a1
    mova m2, m4
    pmaddwd m4, m9 ; b1
    paddd m13, m3
    shufpd m3, m5, m6, 0x05
    paddd m14, m4
    PREP_6TAP_HV_H 5, 3, 6, 4, 9 ; 5 6
    vpbroadcastd m6, [v_mul+4*2]
    vpermq m5, m5, q3120
    shufpd m4, m0, m5, 0x05
    mova m0, m5
    punpcklwd m3, m4, m5 ; 45
    punpckhwd m4, m5 ; 56
    pmaddwd m5, m6, m3 ; a2
    pmaddwd m6, m4 ; b2
    paddd m13, m7
    paddd m14, m7
    paddd m5, m13
    paddd m6, m14
    psrad m5, 6
    psrad m6, 6
    packssdw m5, m6
    vpermq m5, m5, q3120
    mova [r7+r8*0], xm5
    vextracti128 [r7+r8*2], m5, 1
    lea r7, [r7+r8*4]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 16
    add tmpq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w8_loop0
%if WIN64
    POP r8
%endif
    RET

PREP_8TAP_FN smooth_sharp,  SMOOTH,  SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth,  SHARP,   SMOOTH,  prep_8tap_16bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_regular, SHARP,   REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp,         SHARP,   SHARP

cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx2
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r7, [prep_avx2]
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m0, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM 12, 15
    vpbroadcastd m7, [prep_8tap_1d_rnd]
    lea r6, [strideq*3]
    punpcklbw m0, m0
    sub srcq, r6
    psraw m0, 8 ; sign-extend
    test dword r7m, 0x800
    jnz .v_12bpc
    psllw m0, 2
.v_12bpc:
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
    cmp wd, 4
    jg .v_w8
.v_w4:
    movq xm1, [srcq+strideq*0]
    vpbroadcastq m0, [srcq+strideq*1]
    vpbroadcastq m2, [srcq+strideq*2]
    vpbroadcastq m4, [srcq+r6 ]
    lea srcq, [srcq+strideq*4]
    vpbroadcastq m3, [srcq+strideq*0]
    vpbroadcastq m5, [srcq+strideq*1]
    vpblendd m1, m0, 0x30
    vpblendd m0, m2, 0x30
    punpcklwd m1, m0 ; 01 12
    vpbroadcastq m0, [srcq+strideq*2]
    add srcq, r6
    vpblendd m2, m4, 0x30
    vpblendd m4, m3, 0x30
    punpcklwd m2, m4 ; 23 34
    vpblendd m3, m5, 0x30
    vpblendd m5, m0, 0x30
    punpcklwd m3, m5 ; 45 56
.v_w4_loop:
    vpbroadcastq m4, [srcq+strideq*0]
    pmaddwd m5, m8, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m9 ; a1 b1
    paddd m5, m7
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m10 ; a2 b2
    paddd m5, m3
    vpblendd m3, m0, m4, 0x30
    vpbroadcastq m0, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    vpblendd m4, m0, 0x30
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m11, m3 ; a3 b3
    paddd m5, m4
    psrad m5, 4
    vextracti128 xm4, m5, 1
    packssdw xm5, xm4
    mova [tmpq], xm5
    add tmpq, 16
    sub hd, 2
    jg .v_w4_loop
    RET
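; Wide blocks are processed in 8-pixel columns, as in the other _w8 paths:
; r8d keeps the output stride, and wd is repacked as (w << 5) + h - 256 so
; that movzx hd, wb restores the row count at the top of each column and
; sub wd, 1 << 8 counts down the remaining columns.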
.v_w8:
%if WIN64
    WIN64_PUSH_XMM 15
    push r8
%endif
    mov r8d, wd
    shl wd, 5
    lea wd, [hq+wq-256]
.v_w8_loop0:
    vbroadcasti128 m4, [srcq+strideq*0]
    vbroadcasti128 m5, [srcq+strideq*1]
    lea r5, [srcq+strideq*4]
    vbroadcasti128 m0, [srcq+r6 ]
    vbroadcasti128 m6, [srcq+strideq*2]
    mov r7, tmpq
    vbroadcasti128 m1, [r5+strideq*0]
    vbroadcasti128 m2, [r5+strideq*1]
    vbroadcasti128 m3, [r5+strideq*2]
    add r5, r6
    shufpd m4, m0, 0x0c
    shufpd m5, m1, 0x0c
    punpcklwd m1, m4, m5 ; 01
    punpckhwd m4, m5 ; 34
    shufpd m6, m2, 0x0c
    punpcklwd m2, m5, m6 ; 12
    punpckhwd m5, m6 ; 45
    shufpd m0, m3, 0x0c
    punpcklwd m3, m6, m0 ; 23
    punpckhwd m6, m0 ; 56
.v_w8_loop:
    vbroadcasti128 m14, [r5+strideq*0]
    pmaddwd m12, m8, m1 ; a0
    pmaddwd m13, m8, m2 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m9 ; a1
    pmaddwd m4, m9 ; b1
    paddd m12, m7
    paddd m13, m7
    paddd m12, m3
    paddd m13, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m10 ; a2
    pmaddwd m6, m10 ; b2
    paddd m12, m5
    vbroadcasti128 m5, [r5+strideq*1]
    lea r5, [r5+strideq*2]
    paddd m13, m6
    shufpd m6, m0, m14, 0x0d
    shufpd m0, m14, m5, 0x0c
    punpcklwd m5, m6, m0 ; 67
    punpckhwd m6, m0 ; 78
    pmaddwd m14, m11, m5 ; a3
    paddd m12, m14
    pmaddwd m14, m11, m6 ; b3
    paddd m13, m14
    psrad m12, 4
    psrad m13, 4
    packssdw m12, m13
    vpermq m12, m12, q3120
    mova [r7+r8*0], xm12
    vextracti128 [r7+r8*2], m12, 1
    lea r7, [r7+r8*4]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 16
    add tmpq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w8_loop0
%if WIN64
    pop r8
%endif
    RET
.h:
    test myd, 0xf00
    jnz .hv
    vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
    cmp wd, 4
    je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4
    shr mxd, 16
    sub srcq, 6
    vpbroadcastq m0, [base+subpel_filters+mxq*8]
    WIN64_SPILL_XMM 12
    vbroadcasti128 m6, [subpel_h_shufA]
    vbroadcasti128 m7, [subpel_h_shufB]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    test dword r7m, 0x800
    jnz .h_12bpc
    psllw m0, 2
.h_12bpc:
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
    cmp wd, 8
    jg .h_w16
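; PREP_8TAP_H: full 8-tap horizontal filter; abcd = output pixels 0-3 and
; efgh = 4-7 of each row. Same structure as PREP_6TAP_H plus the fourth
; tap pair (m11), with the identical (sum + rnd) >> 4 prep rounding.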
.h_w8:
%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
    pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
    pmaddwd m%5, m9, m%4 ; abcd1
    pmaddwd m%1, m8 ; abcd0
    pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
    shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd m%5, m5
    paddd m%1, m%5
    pmaddwd m%5, m11, m%2 ; abcd3
    paddd m%1, m%5
    pmaddwd m%5, m10, m%4 ; abcd2
    pshufb m%3, m7 ; a b b c c d d e
    pmaddwd m%4, m8 ; efgh0
    paddd m%1, m%5
    pmaddwd m%5, m9, m%2 ; efgh1
    shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd m%3, m11 ; efgh3
    pmaddwd m%2, m10 ; efgh2
    paddd m%4, m5
    paddd m%4, m%5
    paddd m%3, m%4
    paddd m%2, m%3
    psrad m%1, 4
    psrad m%2, 4
    packssdw m%1, m%2
%endmacro
    movu xm0, [srcq+strideq*0+ 0]
    vinserti128 m0, [srcq+strideq*1+ 0], 1
    movu xm2, [srcq+strideq*0+16]
    vinserti128 m2, [srcq+strideq*1+16], 1
    lea srcq, [srcq+strideq*2]
    shufpd m1, m0, m2, 0x05
    PREP_8TAP_H 0, 1, 2, 3, 4
    mova [tmpq], m0
    add tmpq, 32
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    add wd, wd
.h_w16_loop0:
    mov r6d, wd
.h_w16_loop:
    movu m0, [srcq+r6-32]
    movu m1, [srcq+r6-24]
    movu m2, [srcq+r6-16]
    PREP_8TAP_H 0, 1, 2, 3, 4
    mova [tmpq+r6-32], m0
    sub r6d, 32
    jg .h_w16_loop
    add srcq, strideq
    add tmpq, wq
    dec hd
    jg .h_w16_loop0
    RET
.hv:
    WIN64_SPILL_XMM 16
    vpbroadcastd m15, [prep_8tap_2d_rnd]
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m1, [base+subpel_filters+myq*8]
    lea r6, [strideq*3]
    sub srcq, 2
    pxor m7, m7
    sub srcq, r6
    punpcklbw m7, m0
    punpcklbw m1, m1
    psraw m7, 4
    psraw m1, 8
    test dword r7m, 0x800
    jz .hv_w4_10bit
    psraw m7, 2
.hv_w4_10bit:
    pshufd m11, m1, q0000
    pshufd m12, m1, q1111
    pshufd m13, m1, q2222
    pshufd m14, m1, q3333
.hv_w4:
    vbroadcasti128 m9, [subpel_h_shufA]
    vbroadcasti128 m10, [subpel_h_shufB]
    pshufd m8, m7, q1111
    pshufd m7, m7, q0000
    movu xm1, [srcq+strideq*0]
    vinserti128 m1, [srcq+strideq*1], 1 ; 0 1
    vbroadcasti128 m0, [srcq+r6 ]
    vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3
    lea srcq, [srcq+strideq*4]
    vinserti128 m0, [srcq+strideq*0], 1 ; 3 4
    movu xm3, [srcq+strideq*1]
    vinserti128 m3, [srcq+strideq*2], 1 ; 5 6
    add srcq, r6
    pshufb m4, m1, m9
    pshufb m1, m10
    pmaddwd m4, m7
    pmaddwd m1, m8
    pshufb m5, m2, m9
    pshufb m2, m10
    pmaddwd m5, m7
    pmaddwd m2, m8
    paddd m4, m15
    paddd m1, m4
    pshufb m4, m0, m9
    pshufb m0, m10
    pmaddwd m4, m7
    pmaddwd m0, m8
    paddd m5, m15
    paddd m2, m5
    pshufb m5, m3, m9
    pshufb m3, m10
    pmaddwd m5, m7
    pmaddwd m3, m8
    paddd m4, m15
    paddd m4, m0
    paddd m5, m15
    paddd m5, m3
    vperm2i128 m0, m1, m2, 0x21
    psrld m1, 6
    psrld m2, 6
    vperm2i128 m3, m4, m5, 0x21
    pslld m4, 10
    pslld m5, 10
    pblendw m2, m4, 0xaa ; 23 34
    pslld m0, 10
    pblendw m1, m0, 0xaa ; 01 12
    psrld m3, 6
    pblendw m3, m5, 0xaa ; 45 56
    psrad m0, m5, 16
.hv_w4_loop:
    movu xm4, [srcq+strideq*0]
    vinserti128 m4, [srcq+strideq*1], 1
    lea srcq, [srcq+strideq*2]
    pmaddwd m5, m11, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m12 ; a1 b1
    paddd m5, m15
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m13 ; a2 b2
    paddd m5, m3
    pshufb m3, m4, m9
    pshufb m4, m10
    pmaddwd m3, m7
    pmaddwd m4, m8
    paddd m3, m15
    paddd m4, m3
    psrad m4, 6
    packssdw m0, m4 ; _ 7 6 8
    vpermq m3, m0, q1122 ; _ 6 _ 7
    punpckhwd m3, m0 ; 67 78
    mova m0, m4
    pmaddwd m4, m14, m3 ; a3 b3
    paddd m4, m5
    psrad m4, 6
    vextracti128 xm5, m4, 1
    packssdw xm4, xm5
    mova [tmpq], xm4
    add tmpq, 16
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr mxd, 16
    vpbroadcastq m2, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    pmovsxbw xm1, [base+subpel_filters+myq*8]
%if WIN64
    PUSH r8
%endif
    mov r8d, wd
    shl wd, 5
    lea r6, [strideq*3]
    sub srcq, 6
    sub srcq, r6
    lea wd, [hq+wq-256]
    pxor m0, m0
    punpcklbw m0, m2
    psraw m0, 4
    test dword r7m, 0x800
    jz .hv_w8_10bit
    psraw m0, 2
.hv_w8_10bit:
    pshufd m11, m0, q0000
    pshufd m12, m0, q1111
    mova [v_mul], xm1
    pshufd m13, m0, q2222
    pshufd m14, m0, q3333
.hv_w8_loop0:
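; The 8-tap 2D setup filters history rows 0-6 in the order 3, 0/4, 2/6,
; 1/5 so that two rows can share each ymm register (INIT_XMM temporarily
; drops to xmm width for the single row 3), then interleaves them into
; the 01/12/... word pairs consumed by the vertical loop.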
%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
    pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
    pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
    pmaddwd m3, m12, m2
    pmaddwd m%1, m11
    pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
    shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd m3, m15
    paddd m%1, m3
    pmaddwd m3, m14, m%2
    paddd m%1, m3
    pmaddwd m3, m13, m2
    pshufb m%3, m9 ; a b b c c d d e
    pmaddwd m2, m11
    paddd m%1, m3
    pmaddwd m3, m12, m%2
    shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd m%3, m14
    pmaddwd m%2, m13
    paddd m2, m15
    paddd m2, m3
    paddd m2, m%3
    paddd m2, m%2
    psrad m%1, 6
    psrad m2, 6
    packssdw m%1, m2
%endmacro
    movu xm4, [srcq+r6 + 0]
    vbroadcasti128 m8, [subpel_h_shufA]
    lea r5, [srcq+strideq*4]
    movu xm6, [srcq+r6 + 8]
    vbroadcasti128 m9, [subpel_h_shufB]
    mov r7, tmpq
    movu xm0, [srcq+r6 +16]
    movu xm5, [srcq+strideq*0+ 0]
    vinserti128 m5, [r5 +strideq*0+ 0], 1
    movu xm1, [srcq+strideq*0+16]
    vinserti128 m1, [r5 +strideq*0+16], 1
    shufpd m7, m5, m1, 0x05
INIT_XMM avx2
    PREP_8TAP_HV_H 4, 6, 0 ; 3
INIT_YMM avx2
    PREP_8TAP_HV_H 5, 7, 1 ; 0 4
    movu xm0, [srcq+strideq*2+ 0]
    vinserti128 m0, [srcq+r6 *2+ 0], 1
    movu xm1, [srcq+strideq*2+16]
    vinserti128 m1, [srcq+r6 *2+16], 1
    shufpd m7, m0, m1, 0x05
    PREP_8TAP_HV_H 0, 7, 1 ; 2 6
    movu xm6, [srcq+strideq*1+ 0]
    movu xm1, [srcq+strideq*1+16]
    vinserti128 m6, [r5 +strideq*1+ 0], 1
    vinserti128 m1, [r5 +strideq*1+16], 1
    add r5, r6
    shufpd m7, m6, m1, 0x05
    PREP_8TAP_HV_H 6, 7, 1 ; 1 5
    vpermq m4, m4, q1100
    vpermq m5, m5, q3120
    vpermq m6, m6, q3120
    vpermq m7, m0, q3120
    punpcklwd m3, m7, m4 ; 23
    punpckhwd m4, m5 ; 34
    punpcklwd m1, m5, m6 ; 01
    punpckhwd m5, m6 ; 45
    punpcklwd m2, m6, m7 ; 12
    punpckhwd m6, m7 ; 56
.hv_w8_loop:
    vpbroadcastd m9, [v_mul+4*0]
    vpbroadcastd m7, [v_mul+4*1]
    vpbroadcastd m10, [v_mul+4*2]
    pmaddwd m8, m9, m1 ; a0
    pmaddwd m9, m2 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m7 ; a1
    pmaddwd m4, m7 ; b1
    paddd m8, m15
    paddd m9, m15
    paddd m8, m3
    paddd m9, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m10 ; a2
    pmaddwd m6, m10 ; b2
    paddd m8, m5
    paddd m9, m6
    movu xm5, [r5+strideq*0]
    vinserti128 m5, [r5+strideq*1], 1
    vbroadcasti128 m7, [subpel_h_shufA]
    vbroadcasti128 m10, [subpel_h_shufB]
    movu xm6, [r5+strideq*0+16]
    vinserti128 m6, [r5+strideq*1+16], 1
    vextracti128 [r7], m0, 1
    pshufb m0, m5, m7 ; 01
    pshufb m5, m10 ; 23
    pmaddwd m0, m11
    pmaddwd m5, m12
    paddd m0, m15
    paddd m0, m5
    pshufb m5, m6, m7 ; 89
    pshufb m6, m10 ; ab
    pmaddwd m5, m13
    pmaddwd m6, m14
    paddd m5, m15
    paddd m6, m5
    movu xm5, [r5+strideq*0+8]
    vinserti128 m5, [r5+strideq*1+8], 1
    lea r5, [r5+strideq*2]
    pshufb m7, m5, m7
    pshufb m5, m10
    pmaddwd m10, m13, m7
    pmaddwd m7, m11
    paddd m0, m10
    paddd m6, m7
    pmaddwd m7, m14, m5
    pmaddwd m5, m12
    paddd m0, m7
    paddd m5, m6
    vbroadcasti128 m6, [r7]
    vpbroadcastd m10, [v_mul+4*3]
    psrad m0, 6
    psrad m5, 6
    packssdw m0, m5
    vpermq m7, m0, q3120 ; 7 8
    shufpd m6, m7, 0x04 ; 6 7
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7 ; 78
    pmaddwd m7, m10, m5 ; a3
    pmaddwd m10, m6 ; b3
    paddd m7, m8
    paddd m9, m10
    psrad m7, 6
    psrad m9, 6
    packssdw m7, m9
    vpermq m7, m7, q3120
    mova [r7+r8*0], xm7
    vextracti128 [r7+r8*2], m7, 1
    lea r7, [r7+r8*4]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 16
    add tmpq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w8_loop0
%if WIN64
    POP r8
%endif
    RET

%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

%macro REMAP_REG 2
 %xdefine r%1 r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %xdefine r14_save r14
  %assign %%i 14
  %rep 14
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %rep 13
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  %xdefine r14 r14_save
  %undef r14_save
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro
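; MC_8TAP_SCALED_H: horizontal pass for the scaled paths. Each of the 8
; output columns has its own x offset (r4/r6/r7/r9/r10/r11/r13/rX) and
; its own 8-tap filter in m12-m15; phaddd folds the per-column products,
; then (sum + m10) >> xm11 packs two rows to words. The optional 9th
; argument reloads the rounding constant from [rsp+0x00] for call sites
; where m10 doubles as scratch.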
%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
    movu xm%1, [srcq+ r4*2]
    movu xm%2, [srcq+ r6*2]
    movu xm%3, [srcq+ r7*2]
    movu xm%4, [srcq+ r9*2]
    vinserti128 m%1, [srcq+r10*2], 1
    vinserti128 m%2, [srcq+r11*2], 1
    vinserti128 m%3, [srcq+r13*2], 1
    vinserti128 m%4, [srcq+ rX*2], 1
    add srcq, ssq
    movu xm%5, [srcq+ r4*2]
    movu xm%6, [srcq+ r6*2]
    movu xm%7, [srcq+ r7*2]
    movu xm%8, [srcq+ r9*2]
    vinserti128 m%5, [srcq+r10*2], 1
    vinserti128 m%6, [srcq+r11*2], 1
    vinserti128 m%7, [srcq+r13*2], 1
    vinserti128 m%8, [srcq+ rX*2], 1
    add srcq, ssq
    pmaddwd m%1, m12
    pmaddwd m%2, m13
    pmaddwd m%3, m14
    pmaddwd m%4, m15
    pmaddwd m%5, m12
    pmaddwd m%6, m13
    pmaddwd m%7, m14
    pmaddwd m%8, m15
    phaddd m%1, m%2
 %if %9
    mova m10, [rsp+0x00]
 %endif
    phaddd m%3, m%4
    phaddd m%5, m%6
    phaddd m%7, m%8
    phaddd m%1, m%3
    phaddd m%5, m%7
    paddd m%1, m10
    paddd m%5, m10
    psrad m%1, xm11
    psrad m%5, xm11
    packssdw m%1, m%5
%endmacro
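; MC_8TAP_SCALED emits both put_8tap_scaled and prep_8tap_scaled. x/y
; positions step in 1/1024-pixel units; dy == 1024 (exactly one source
; row per output row) and dy == 2048 (two rows) branch to the dedicated
; .dy1/.dy2 code, while the generic path handles arbitrary dy. For prep,
; the REMAP_REG machinery shifts every register name so the shared body
; lines up with prep's argument layout.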
%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isput 1
 %assign isprep 0
cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
 %xdefine base_reg r12
    mov r7d, pxmaxm
%else
 %assign isput 0
 %assign isprep 1
cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
 %define tmp_stridem qword [rsp+0xd0]
 %xdefine base_reg r11
%endif
    lea base_reg, [%1_8tap_scaled_16bpc_avx2]
%define base base_reg-%1_8tap_scaled_16bpc_avx2
    tzcnt wd, wm
    vpbroadcastd m8, dxm
%if isprep && UNIX64
    movd xm10, mxd
    vpbroadcastd m10, xm10
    mov r5d, t0d
 DECLARE_REG_TMP 5, 7
    mov r6d, pxmaxm
%else
    vpbroadcastd m10, mxm
 %if isput
    vpbroadcastw m11, pxmaxm
 %else
    mov r6d, pxmaxm
 %endif
%endif
    mov dyd, dym
%if isput
 %if WIN64
    mov r8d, hm
    DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm r5m
  %define dxm r8m
 %else
    DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm r6m
 %endif
 %define dsm [rsp+0x98]
 %define rX r1
 %define rXd r1d
%else ; prep
 %if WIN64
    mov r7d, hm
    DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm r4m
  %define dxm r7m
 %else
    DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %define hm [rsp+0x98]
 %endif
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define rX r14
 %define rXd r14d
%endif
    shr r7d, 11
    vpbroadcastd m6, [base+pd_0x3ff]
    vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4]
    movd xm7, [base+s_8tap_h_sh+r7*4]
%if isput
    vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4]
    pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2
%else
    vpbroadcastd m13, [base+pd_m524256]
%endif
    pxor m9, m9
    lea ss3q, [ssq*3]
    movzx r7d, t1b
    shr t1d, 16
    cmp hd, 6
    cmovs t1d, r7d
    sub srcq, ss3q
    cmp dyd, 1024
    je .dy1
    cmp dyd, 2048
    je .dy2
    movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
    add wq, base_reg
    jmp wq
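; Width-specific entry points follow. For each column, bits 6-9 of the
; accumulated x position select the subpel filter (pd_0x3ff mask, >> 6)
; and bits 10+ give the integer source offset; where the fraction is
; zero, pblendvb swaps in the unit filter from pq_0x40000000 (a single
; tap of 64), since the filter table has no entry for subpel 0.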
%if isput
.w2:
    mov myd, mym
    movzx t0d, t0b
    sub srcq, 2
    movd xm15, t0d
    punpckldq m8, m9, m8
    paddd m10, m8 ; mx+dx*[0,1]
    vpbroadcastd xm14, [base+pq_0x40000000+2]
    vpbroadcastd xm15, xm15
    pand xm8, xm10, xm6
    psrld xm8, 6
    paddd xm15, xm8
    movd r4d, xm15
    pextrd r6d, xm15, 1
    vbroadcasti128 m5, [base+bdct_lb_q]
    vbroadcasti128 m6, [base+subpel_s_shuf2]
    vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
    vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
    pcmpeqd xm8, xm9
    psrld m10, 10
    paddd m10, m10
    movu xm0, [srcq+ssq*0]
    movu xm1, [srcq+ssq*1]
    movu xm2, [srcq+ssq*2]
    movu xm3, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    pshufb m10, m5
    paddb m10, m6
    vpblendd xm15, xm4, 0xa
    pblendvb xm15, xm14, xm8
    pmovsxbw m15, xm15
    vinserti128 m0, [srcq+ssq*0], 1 ; 0 4
    vinserti128 m1, [srcq+ssq*1], 1 ; 1 5
    vinserti128 m2, [srcq+ssq*2], 1 ; 2 6
    vinserti128 m3, [srcq+ss3q ], 1 ; 3 7
    lea srcq, [srcq+ssq*4]
    REPX {pshufb x, m10}, m0, m1, m2, m3
    REPX {pmaddwd x, m15}, m0, m1, m2, m3
    phaddd m0, m1
    phaddd m2, m3
    paddd m0, m12
    paddd m2, m12
    psrad m0, xm7
    psrad m2, xm7
    packssdw m0, m2 ; 0 1 2 3 4 5 6 7
    vextracti128 xm1, m0, 1
    palignr xm2, xm1, xm0, 4 ; 1 2 3 4
    punpcklwd xm3, xm0, xm2 ; 01 12
    punpckhwd xm0, xm2 ; 23 34
    pshufd xm4, xm1, q0321 ; 5 6 7 _
    punpcklwd xm2, xm1, xm4 ; 45 56
    punpckhwd xm4, xm1, xm4 ; 67 __
.w2_loop:
    and myd, 0x3ff
    mov r6d, 64 << 24
    mov r4d, myd
    shr r4d, 6
    lea r4d, [t1+r4]
    cmovnz r6q, [base+subpel_filters+r4*8]
    movq xm14, r6q
    pmovsxbw xm14, xm14
    pshufd xm8, xm14, q0000
    pshufd xm9, xm14, q1111
    pmaddwd xm5, xm3, xm8
    pmaddwd xm6, xm0, xm9
    pshufd xm8, xm14, q2222
    pshufd xm14, xm14, q3333
    paddd xm5, xm6
    pmaddwd xm6, xm2, xm8
    pmaddwd xm8, xm4, xm14
    psrldq xm9, xm7, 8
    paddd xm5, xm6
    paddd xm5, xm13
    paddd xm5, xm8
    psrad xm5, xm9
    packusdw xm5, xm5
    pminsw xm5, xm11
    movd [dstq], xm5
    add dstq, dsq
    dec hd
    jz .ret
    add myd, dyd
    test myd, ~0x3ff
    jz .w2_loop
    movu xm5, [srcq]
    test myd, 0x400
    jz .w2_skip_line
    add srcq, ssq
    shufps xm3, xm0, q1032 ; 01 12
    shufps xm0, xm2, q1032 ; 23 34
    shufps xm2, xm4, q1032 ; 45 56
    pshufb xm5, xm10
    pmaddwd xm5, xm15
    phaddd xm5, xm5
    paddd xm5, xm12
    psrad xm5, xm7
    packssdw xm5, xm5
    palignr xm1, xm5, xm1, 12
    punpcklqdq xm1, xm1 ; 6 7 6 7
    punpcklwd xm4, xm1, xm5 ; 67 __
    jmp .w2_loop
.w2_skip_line:
    movu xm6, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova xm3, xm0 ; 01 12
    mova xm0, xm2 ; 23 34
    pshufb xm5, xm10
    pshufb xm6, xm10
    pmaddwd xm5, xm15
    pmaddwd xm6, xm15
    phaddd xm5, xm6
    paddd xm5, xm12
    psrad xm5, xm7
    packssdw xm5, xm5 ; 6 7 6 7
    palignr xm1, xm5, xm1, 8 ; 4 5 6 7
    pshufd xm5, xm1, q0321 ; 5 6 7 _
    punpcklwd xm2, xm1, xm5 ; 45 56
    punpckhwd xm4, xm1, xm5 ; 67 __
    jmp .w2_loop
%endif
.w4:
    mov myd, mym
    mova [rsp+0x00], m12
%if isput
    mova [rsp+0x20], xm13
%else
    SWAP m11, m13
%endif
    mova [rsp+0x30], xm7
    vbroadcasti128 m7, [base+rescale_mul]
    movzx t0d, t0b
    sub srcq, 2
    movd xm15, t0d
    pmaddwd m8, m7
    vpbroadcastq m2, [base+pq_0x40000000+1]
    vpbroadcastd xm15, xm15
    SWAP m13, m10
    paddd m13, m8 ; mx+dx*[0-3]
    pand m6, m13
    psrld m6, 6
    paddd xm15, xm6
    movd r4d, xm15
    pextrd r6d, xm15, 1
    pextrd r11d, xm15, 2
    pextrd r13d, xm15, 3
    vbroadcasti128 m5, [base+bdct_lb_q+ 0]
    vbroadcasti128 m1, [base+bdct_lb_q+16]
    vbroadcasti128 m0, [base+subpel_s_shuf2]
    vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
    vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
    vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
    vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
    pcmpeqd m6, m9
    punpckldq m10, m6, m6
    punpckhdq m6, m6
    psrld m13, 10
    paddd m13, m13
    vpblendd xm14, xm7, 0xa
    vpblendd xm15, xm8, 0xa
    pmovsxbw m14, xm14
    pmovsxbw m15, xm15
    pblendvb m14, m2, m10
    pblendvb m15, m2, m6
    pextrd r4, xm13, 2
    pshufb m12, m13, m5
    pshufb m13, m1
    lea r6, [r4+ssq*1]
    lea r11, [r4+ssq*2]
    lea r13, [r4+ss3q ]
    movu xm7, [srcq+ssq*0]
    movu xm9, [srcq+ssq*1]
    movu xm8, [srcq+ssq*2]
    movu xm10, [srcq+ss3q ]
    movu xm1, [srcq+r4 ]
    movu xm3, [srcq+r6 ]
    movu xm2, [srcq+r11 ]
    movu xm4, [srcq+r13 ]
    lea srcq, [srcq+ssq*4]
    vinserti128 m7, [srcq+ssq*0], 1
    vinserti128 m9, [srcq+ssq*1], 1
    vinserti128 m8, [srcq+ssq*2], 1
    vinserti128 m10, [srcq+ss3q ], 1
    vinserti128 m1, [srcq+r4 ], 1
    vinserti128 m3, [srcq+r6 ], 1
    vinserti128 m2, [srcq+r11 ], 1
    vinserti128 m4, [srcq+r13 ], 1
    lea srcq, [srcq+ssq*4]
    vpbroadcastb m5, xm13
    psubb m13, m5
    paddb m12, m0
    paddb m13, m0
    REPX {pshufb x, m12}, m7, m9, m8, m10
    REPX {pmaddwd x, m14}, m7, m9, m8, m10
    REPX {pshufb x, m13}, m1, m2, m3, m4
    REPX {pmaddwd x, m15}, m1, m2, m3, m4
    mova m5, [rsp+0x00]
    movd xm6, [rsp+0x30]
    phaddd m7, m1
    phaddd m9, m3
    phaddd m8, m2
    phaddd m10, m4
    REPX {paddd x, m5}, m7, m9, m8, m10
    REPX {psrad x, xm6}, m7, m9, m8, m10
    packssdw m7, m9 ; 0 1 4 5
    packssdw m8, m10 ; 2 3 6 7
    vextracti128 xm9, m7, 1 ; 4 5
    vextracti128 xm3, m8, 1 ; 6 7
    shufps xm4, xm7, xm8, q1032 ; 1 2
    shufps xm5, xm8, xm9, q1032 ; 3 4
    shufps xm6, xm9, xm3, q1032 ; 5 6
    psrldq xm10, xm3, 8 ; 7 _
    punpcklwd xm0, xm7, xm4 ; 01
    punpckhwd xm7, xm4 ; 12
    punpcklwd xm1, xm8, xm5 ; 23
    punpckhwd xm8, xm5 ; 34
    punpcklwd xm2, xm9, xm6 ; 45
    punpckhwd xm9, xm6 ; 56
    punpcklwd xm3, xm10 ; 67
    mova [rsp+0x40], xm7
    mova [rsp+0x50], xm8
    mova [rsp+0x60], xm9
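; .w4 vertical loop: part of the row-pair history lives on the stack at
; [rsp+0x40..0x60]. my accumulates dy per output row; when it crosses
; into a new source row the history is rotated and one freshly filtered
; row (or two, via .w4_skip_line) is shifted in.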
.w4_loop:
    and myd, 0x3ff
    mov r11d, 64 << 24
    mov r13d, myd
    shr r13d, 6
    lea r13d, [t1+r13]
    cmovnz r11q, [base+subpel_filters+r13*8]
    movq xm9, r11q
    pmovsxbw xm9, xm9
    pshufd xm7, xm9, q0000
    pshufd xm8, xm9, q1111
    pmaddwd xm4, xm0, xm7
    pmaddwd xm5, xm1, xm8
    pshufd xm7, xm9, q2222
    pshufd xm9, xm9, q3333
    pmaddwd xm6, xm2, xm7
    pmaddwd xm8, xm3, xm9
%if isput
    mova xm7, [rsp+0x20]
    movd xm9, [rsp+0x38]
%else
    SWAP m7, m11
%endif
    paddd xm4, xm5
    paddd xm6, xm8
    paddd xm4, xm6
    paddd xm4, xm7
%if isput
    psrad xm4, xm9
    packusdw xm4, xm4
    pminuw xm4, xm11
    movq [dstq], xm4
    add dstq, dsq
%else
    SWAP m11, m7
    psrad xm4, 6
    packssdw xm4, xm4
    movq [tmpq], xm4
    add tmpq, 8
%endif
    dec hd
    jz .ret
    add myd, dyd
    test myd, ~0x3ff
    jz .w4_loop
    mova xm8, [rsp+0x00]
    movd xm9, [rsp+0x30]
    movu xm4, [srcq]
    movu xm5, [srcq+r4]
    test myd, 0x400
    jz .w4_skip_line
    mova xm0, [rsp+0x40]
    mova [rsp+0x40], xm1
    mova xm1, [rsp+0x50]
    mova [rsp+0x50], xm2
    mova xm2, [rsp+0x60]
    mova [rsp+0x60], xm3
    pshufb xm4, xm12
    pshufb xm5, xm13
    pmaddwd xm4, xm14
    pmaddwd xm5, xm15
    phaddd xm4, xm5
    paddd xm4, xm8
    psrad xm4, xm9
    packssdw xm4, xm4
    punpcklwd xm3, xm10, xm4
    mova xm10, xm4
    add srcq, ssq
    jmp .w4_loop
.w4_skip_line:
    movu xm6, [srcq+ssq*1]
    movu xm7, [srcq+r6]
    movu m0, [rsp+0x50]
    pshufb xm4, xm12
    pshufb xm6, xm12
    pshufb xm5, xm13
    pshufb xm7, xm13
    pmaddwd xm4, xm14
    pmaddwd xm6, xm14
    pmaddwd xm5, xm15
    pmaddwd xm7, xm15
    mova [rsp+0x40], m0
    phaddd xm4, xm5
    phaddd xm6, xm7
    paddd xm4, xm8
    paddd xm6, xm8
    psrad xm4, xm9
    psrad xm6, xm9
    packssdw xm4, xm6
    punpcklwd xm9, xm10, xm4
    mova [rsp+0x60], xm9
    psrldq xm10, xm4, 8
    mova xm0, xm1
    mova xm1, xm2
    mova xm2, xm3
    punpcklwd xm3, xm4, xm10
    lea srcq, [srcq+ssq*2]
    jmp .w4_loop
    SWAP m10, m13
%if isprep
    SWAP m13, m11
%endif
.w8:
    mov dword [rsp+0x80], 1
    movifprep tmp_stridem, 16
    jmp .w_start
.w16:
    mov dword [rsp+0x80], 2
    movifprep tmp_stridem, 32
    jmp .w_start
.w32:
    mov dword [rsp+0x80], 4
    movifprep tmp_stridem, 64
    jmp .w_start
.w64:
    mov dword [rsp+0x80], 8
    movifprep tmp_stridem, 128
    jmp .w_start
.w128:
    mov dword [rsp+0x80], 16
    movifprep tmp_stridem, 256
.w_start:
    SWAP m10, m12, m1
    SWAP m11, m7
    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
%if isput
    movifnidn dsm, dsq
    mova [rsp+0xb0], xm7
%endif
    mova [rsp+0x00], m10
    mova [rsp+0x20], m13
    shr t0d, 16
    sub srcq, 6
    pmaddwd m8, [base+rescale_mul2]
    movd xm15, t0d
    mov [rsp+0x84], t0d
    mov [rsp+0x88], srcq
    mov [rsp+0x90], r0q ; dstq / tmpq
%if UNIX64
    mov hm, hd
%endif
    shl dword dxm, 3 ; dx*8
    vpbroadcastd m15, xm15
    paddd m1, m8 ; mx+dx*[0-7]
    jmp .hloop
.hloop_prep:
    dec dword [rsp+0x80]
    jz .ret
    add qword [rsp+0x90], 16
    mov hd, hm
    vpbroadcastd m8, dxm
    vpbroadcastd m6, [base+pd_0x3ff]
    paddd m1, m8, [rsp+0x40]
    vpbroadcastd m15, [rsp+0x84]
    pxor m9, m9
    mov srcq, [rsp+0x88]
    mov r0q, [rsp+0x90] ; dstq / tmpq
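; .hloop handles one 8-pixel column strip per trip through .hloop_prep:
; [rsp+0x80] counts remaining strips, [rsp+0x88]/[rsp+0x90] hold the
; saved src and dst/tmp base pointers, and dx was pre-multiplied by 8 so
; m1 can step all eight per-column x positions at once.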
.hloop:
    vpbroadcastq xm2, [base+pq_0x40000000]
    pand m5, m1, m6
    psrld m5, 6
    paddd m15, m5
    pcmpeqd m5, m9
    vextracti128 xm7, m15, 1
    movq r6, xm15
    pextrq r9, xm15, 1
    movq r11, xm7
    pextrq rX, xm7, 1
    mov r4d, r6d
    shr r6, 32
    mov r7d, r9d
    shr r9, 32
    mov r10d, r11d
    shr r11, 32
    mov r13d, rXd
    shr rX, 32
    mova [rsp+0x40], m1
    movq xm12, [base+subpel_filters+ r4*8]
    movq xm13, [base+subpel_filters+ r6*8]
    movhps xm12, [base+subpel_filters+ r7*8]
    movhps xm13, [base+subpel_filters+ r9*8]
    movq xm14, [base+subpel_filters+r10*8]
    movq xm15, [base+subpel_filters+r11*8]
    movhps xm14, [base+subpel_filters+r13*8]
    movhps xm15, [base+subpel_filters+ rX*8]
    psrld m1, 10
    vextracti128 xm7, m1, 1
    vextracti128 xm6, m5, 1
    movq [rsp+0xa0], xm1
    movq [rsp+0xa8], xm7
    movq r6, xm1
    pextrq r11, xm1, 1
    movq r9, xm7
    pextrq rX, xm7, 1
    mov r4d, r6d
    shr r6, 32
    mov r10d, r11d
    shr r11, 32
    mov r7d, r9d
    shr r9, 32
    mov r13d, rXd
    shr rX, 32
    pshufd xm4, xm5, q2200
    pshufd xm5, xm5, q3311
    pshufd xm7, xm6, q2200
    pshufd xm6, xm6, q3311
    pblendvb xm12, xm2, xm4
    pblendvb xm13, xm2, xm5
    pblendvb xm14, xm2, xm7
    pblendvb xm15, xm2, xm6
    pmovsxbw m12, xm12
    pmovsxbw m13, xm13
    pmovsxbw m14, xm14
    pmovsxbw m15, xm15
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
    mova [rsp+0x60], m0
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
    mova m0, [rsp+0x60]
    vbroadcasti128 m9, [base+subpel_s_shuf8]
    mov myd, mym
    mov dyd, dym
    pshufb m0, m9 ; 01a 01b
    pshufb m1, m9 ; 23a 23b
    pshufb m2, m9 ; 45a 45b
    pshufb m3, m9 ; 67a 67b
.vloop:
    and myd, 0x3ff
    mov r6d, 64 << 24
    mov r4d, myd
    shr r4d, 6
    lea r4d, [t1+r4]
    cmovnz r6q, [base+subpel_filters+r4*8]
    movq xm9, r6q
    punpcklqdq xm9, xm9
    pmovsxbw m9, xm9
    pshufd m8, m9, q0000
    pshufd m7, m9, q1111
    pmaddwd m4, m0, m8
    pmaddwd m5, m1, m7
    pshufd m8, m9, q2222
    pshufd m9, m9, q3333
    pmaddwd m6, m2, m8
    pmaddwd m7, m3, m9
%if isput
    psrldq xm8, xm11, 8
%endif
    paddd m4, [rsp+0x20]
    paddd m6, m7
    paddd m4, m5
    paddd m4, m6
%if isput
    psrad m4, xm8
    vextracti128 xm5, m4, 1
    packusdw xm4, xm5
    pminsw xm4, [rsp+0xb0]
    mova [dstq], xm4
    add dstq, dsm
%else
    psrad m4, 6
    vextracti128 xm5, m4, 1
    packssdw xm4, xm5
    mova [tmpq], xm4
    add tmpq, tmp_stridem
%endif
    dec hd
    jz .hloop_prep
    add myd, dyd
    test myd, ~0x3ff
    jz .vloop
    test myd, 0x400
    mov [rsp+0x60], myd
    mov r4d, [rsp+0xa0]
    mov r6d, [rsp+0xa4]
    mov r7d, [rsp+0xa8]
    mov r9d, [rsp+0xac]
    jz .skip_line
    vbroadcasti128 m9, [base+wswap]
    movu xm4, [srcq+ r4*2]
    movu xm5, [srcq+ r6*2]
    movu xm6, [srcq+ r7*2]
    movu xm7, [srcq+ r9*2]
    vinserti128 m4, [srcq+r10*2], 1
    vinserti128 m5, [srcq+r11*2], 1
    vinserti128 m6, [srcq+r13*2], 1
    vinserti128 m7, [srcq+ rX*2], 1
    add srcq, ssq
    mov myd, [rsp+0x60]
    mov dyd, dym
    pshufb m0, m9
    pshufb m1, m9
    pshufb m2, m9
    pshufb m3, m9
    pmaddwd m4, m12
    pmaddwd m5, m13
    pmaddwd m6, m14
    pmaddwd m7, m15
    phaddd m4, m5
    phaddd m6, m7
    phaddd m4, m6
    paddd m4, m10
    psrad m4, xm11
    pslld m4, 16
    pblendw m0, m1, 0xaa
    pblendw m1, m2, 0xaa
    pblendw m2, m3, 0xaa
    pblendw m3, m4, 0xaa
    jmp .vloop
.skip_line:
    mova m0, m1
    mova m1, m2
    mova m2, m3
    MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1
    vbroadcasti128 m9, [base+subpel_s_shuf8]
    mov myd, [rsp+0x60]
    mov dyd, dym
    pshufb m3, m9
    jmp .vloop
    SWAP m1, m12, m10
    SWAP m7, m11
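; dy == 1024: exactly one new source row per output row, so the vertical
; filter phase never changes; the y filter is resolved once up front and
; kept in registers (or at [rsp+0x50] in the wide paths) instead of being
; reloaded on every row as in the generic .vloop above.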
.dy1:
    movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
    add wq, base_reg
    jmp wq
%if isput
.dy1_w2:
    mov myd, mym
    movzx t0d, t0b
    sub srcq, 2
    movd xm15, t0d
    punpckldq m8, m9, m8
    paddd m10, m8 ; mx+dx*[0-1]
    vpbroadcastd xm14, [base+pq_0x40000000+2]
    vpbroadcastd xm15, xm15
    pand xm8, xm10, xm6
    psrld xm8, 6
    paddd xm15, xm8
    movd r4d, xm15
    pextrd r6d, xm15, 1
    vbroadcasti128 m5, [base+bdct_lb_q]
    vbroadcasti128 m6, [base+subpel_s_shuf2]
    vpbroadcastd m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd m4, [base+subpel_filters+r6*8+2]
    pcmpeqd xm8, xm9
    psrld m10, 10
    paddd m10, m10
    movu xm0, [srcq+ssq*0]
    movu xm1, [srcq+ssq*1]
    movu xm2, [srcq+ssq*2]
    movu xm3, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    pshufb m10, m5
    paddb m10, m6
    vpblendd xm15, xm4, 0xa
    pblendvb xm15, xm14, xm8
    pmovsxbw m15, xm15
    vinserti128 m0, [srcq+ssq*0], 1
    vinserti128 m1, [srcq+ssq*1], 1
    vinserti128 m2, [srcq+ssq*2], 1
    add srcq, ss3q
    movq xm6, r4q
    pmovsxbw xm6, xm6
    pshufd xm8, xm6, q0000
    pshufd xm9, xm6, q1111
    pshufd xm14, xm6, q2222
    pshufd xm6, xm6, q3333
    REPX {pshufb x, m10}, m0, m1, m2
    pshufb xm3, xm10
    REPX {pmaddwd x, m15}, m0, m1, m2
    pmaddwd xm3, xm15
    phaddd m0, m1
    phaddd m2, m3
    paddd m0, m12
    paddd m2, m12
    psrad m0, xm7
    psrad m2, xm7
    packssdw m0, m2
    vextracti128 xm1, m0, 1
    palignr xm2, xm1, xm0, 4
    pshufd xm4, xm1, q2121
    punpcklwd xm3, xm0, xm2 ; 01 12
    punpckhwd xm0, xm2 ; 23 34
    punpcklwd xm2, xm1, xm4 ; 45 56
.dy1_w2_loop:
    movu xm1, [srcq+ssq*0]
    movu xm5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm1, xm10
    pshufb xm5, xm10
    pmaddwd xm1, xm15
    pmaddwd xm5, xm15
    phaddd xm1, xm5
    pmaddwd xm5, xm3, xm8
    mova xm3, xm0
    pmaddwd xm0, xm9
    paddd xm1, xm12
    psrad xm1, xm7
    packssdw xm1, xm1
    paddd xm5, xm0
    mova xm0, xm2
    pmaddwd xm2, xm14
    paddd xm5, xm2
    palignr xm2, xm1, xm4, 12
    punpcklwd xm2, xm1 ; 67 78
    pmaddwd xm4, xm2, xm6
    paddd xm5, xm13
    paddd xm5, xm4
    mova xm4, xm1
    psrldq xm1, xm7, 8
    psrad xm5, xm1
    packusdw xm5, xm5
    pminsw xm5, xm11
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .dy1_w2_loop
    RET
%endif
.dy1_w4:
    mov myd, mym
%if isput
    mova [rsp+0x50], xm11
%endif
    mova [rsp+0x00], m12
    mova [rsp+0x20], m13
    mova [rsp+0x40], xm7
    vbroadcasti128 m7, [base+rescale_mul]
    movzx t0d, t0b
    sub srcq, 2
    movd xm15, t0d
    pmaddwd m8, m7
    vpbroadcastq m2, [base+pq_0x40000000+1]
    vpbroadcastd xm15, xm15
    SWAP m13, m10
    paddd m13, m8 ; mx+dx*[0-3]
    pand m6, m13
    psrld m6, 6
    paddd xm15, xm6
    movd r4d, xm15
    pextrd r6d, xm15, 1
    pextrd r11d, xm15, 2
    pextrd r13d, xm15, 3
    vbroadcasti128 m5, [base+bdct_lb_q+ 0]
    vbroadcasti128 m1, [base+bdct_lb_q+16]
    vbroadcasti128 m4, [base+subpel_s_shuf2]
    vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
    vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
    vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
    vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
    pcmpeqd m6, m9
    punpckldq m10, m6, m6
    punpckhdq m6, m6
    psrld m13, 10
    paddd m13, m13
    vpblendd xm14, xm7, 0xa
    vpblendd xm15, xm8, 0xa
    pmovsxbw m14, xm14
    pmovsxbw m15, xm15
    pblendvb m14, m2, m10
    pblendvb m15, m2, m6
    pextrd r4, xm13, 2
    pshufb m12, m13, m5
    pshufb m13, m1
    lea r6, [r4+ssq*2]
    lea r11, [r4+ssq*1]
    lea r13, [r4+ss3q ]
    movu xm0, [srcq+ssq*0]
    movu xm7, [srcq+r4 ]
    movu xm1, [srcq+ssq*2]
    movu xm8, [srcq+r6 ]
    vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
    vinserti128 m7, [srcq+r11 ], 1
    vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
    vinserti128 m8, [srcq+r13 ], 1
    lea srcq, [srcq+ssq*4]
    movu xm2, [srcq+ssq*0]
    movu xm9, [srcq+r4 ]
    movu xm3, [srcq+ssq*2] ; 6 _
    movu xm10, [srcq+r6 ]
    vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
    vinserti128 m9, [srcq+r11 ], 1
    lea srcq, [srcq+ss3q ]
    vpbroadcastb m5, xm13
    psubb m13, m5
    paddb m12, m4
    paddb m13, m4
    mova m5, [rsp+0x00]
    movd xm6, [rsp+0x40]
    pshufb m0, m12
    pshufb m1, m12
    pmaddwd m0, m14
    pmaddwd m1, m14
    pshufb m7, m13
    pshufb m8, m13
    pmaddwd m7, m15
    pmaddwd m8, m15
    pshufb m2, m12
    pshufb xm3, xm12
    pmaddwd m2, m14
    pmaddwd xm3, xm14
    pshufb m9, m13
    pshufb xm10, xm13
    pmaddwd m9, m15
    pmaddwd xm10, xm15
    phaddd m0, m7
    phaddd m1, m8
    phaddd m2, m9
    phaddd xm3, xm10
    paddd m0, m5
    paddd m1, m5
    paddd m2, m5
    paddd xm3, xm5
    psrad m0, xm6
    psrad m1, xm6
    psrad m2, xm6
    psrad xm3, xm6
    vperm2i128 m4, m0, m1, 0x21 ; 1 2
    vperm2i128 m5, m1, m2, 0x21 ; 3 4
    vperm2i128 m6, m2, m3, 0x21 ; 5 6
    shr myd, 6
    mov r13d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r13q, [base+subpel_filters+myq*8]
    pslld m4, 16
    pslld m5, 16
    pslld m6, 16
    pblendw m0, m4, 0xaa ; 01 12
    pblendw m1, m5, 0xaa ; 23 34
    pblendw m2, m6, 0xaa ; 45 56
    movq xm10, r13q
    punpcklqdq xm10, xm10
    pmovsxbw m10, xm10
    pshufd m7, m10, q0000
    pshufd m8, m10, q1111
    pshufd m9, m10, q2222
    pshufd m10, m10, q3333
.dy1_w4_loop:
    movu xm11, [srcq+ssq*0]
    movu xm6, [srcq+r4 ]
    vinserti128 m11, [srcq+ssq*1], 1
    vinserti128 m6, [srcq+r11 ], 1
    lea srcq, [srcq+ssq*2]
    pmaddwd m4, m0, m7
    pmaddwd m5, m1, m8
    pshufb m11, m12
    pshufb m6, m13
    pmaddwd m11, m14
    pmaddwd m6, m15
    paddd m4, [rsp+0x20]
    phaddd m11, m6
    pmaddwd m6, m2, m9
    paddd m11, [rsp+0x00]
    psrad m11, [rsp+0x40]
    mova m0, m1
    mova m1, m2
    paddd m5, m6
    paddd m4, m5
    vinserti128 m2, m3, xm11, 1
    pslld m3, m11, 16
    pblendw m2, m3, 0xaa ; 67 78
    pmaddwd m5, m2, m10
    vextracti128 xm3, m11, 1
    paddd m4, m5
%if isput
    psrad m4, [rsp+0x48]
    vextracti128 xm5, m4, 1
    packusdw xm4, xm5
    pminsw xm4, [rsp+0x50]
    movq [dstq+dsq*0], xm4
    movhps [dstq+dsq*1], xm4
    lea dstq, [dstq+dsq*2]
%else
    psrad m4, 6
    vextracti128 xm5, m4, 1
    packssdw xm4, xm5
    mova [tmpq], xm4
    add tmpq, 16
%endif
    sub hd, 2
    jg .dy1_w4_loop
    MC_8TAP_SCALED_RET
    SWAP m10, m13
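; The wide dy1 blocks share one strip loop: only the strip count in
; [rsp+0xa0] and the prep tmp stride differ per width.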
.dy1_w8:
    mov dword [rsp+0xa0], 1
    movifprep tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov dword [rsp+0xa0], 2
    movifprep tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov dword [rsp+0xa0], 4
    movifprep tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov dword [rsp+0xa0], 8
    movifprep tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov dword [rsp+0xa0], 16
    movifprep tmp_stridem, 256
.dy1_w_start:
    SWAP m10, m12, m1
    SWAP m11, m7
    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
    mov myd, mym
%if isput
 %define dsm [rsp+0xb8]
    movifnidn dsm, dsq
    mova [rsp+0xc0], xm7
%else
 %if UNIX64
  %define hm [rsp+0xb8]
 %endif
%endif
    mova [rsp+0x00], m10
    mova [rsp+0x20], m13
    mova [rsp+0x40], xm11
    shr t0d, 16
    sub srcq, 6
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    pmaddwd m8, [base+rescale_mul2]
    movd xm15, t0d
    mov [rsp+0xa4], t0d
    mov [rsp+0xa8], srcq
    mov [rsp+0xb0], r0q ; dstq / tmpq
%if UNIX64
    mov hm, hd
%endif
    shl dword dxm, 3 ; dx*8
    vpbroadcastd m15, xm15
    paddd m1, m8 ; mx+dx*[0-7]
    movq xm0, r4q
    pmovsxbw xm0, xm0
    mova [rsp+0x50], xm0
    jmp .dy1_hloop
.dy1_hloop_prep:
    dec dword [rsp+0xa0]
    jz .ret
    add qword [rsp+0xb0], 16
    mov hd, hm
    vpbroadcastd m8, dxm
    vpbroadcastd m6, [base+pd_0x3ff]
    paddd m1, m8, [rsp+0x60]
    vpbroadcastd m15, [rsp+0xa4]
    pxor m9, m9
    mov srcq, [rsp+0xa8]
    mov r0q, [rsp+0xb0] ; dstq / tmpq
    mova m10, [rsp+0x00]
    mova xm11, [rsp+0x40]
.dy1_hloop:
    vpbroadcastq xm2, [base+pq_0x40000000]
    pand m5, m1, m6
    psrld m5, 6
    paddd m15, m5
    pcmpeqd m5, m9
    vextracti128 xm7, m15, 1
    movq r6, xm15
    pextrq r9, xm15, 1
    movq r11, xm7
    pextrq rX, xm7, 1
    mov r4d, r6d
    shr r6, 32
    mov r7d, r9d
    shr r9, 32
    mov r10d, r11d
    shr r11, 32
    mov r13d, rXd
    shr rX, 32
    mova [rsp+0x60], m1
    movq xm12, [base+subpel_filters+ r4*8]
    movq xm13, [base+subpel_filters+ r6*8]
    movhps xm12, [base+subpel_filters+ r7*8]
    movhps xm13, [base+subpel_filters+ r9*8]
    movq xm14, [base+subpel_filters+r10*8]
    movq xm15, [base+subpel_filters+r11*8]
    movhps xm14, [base+subpel_filters+r13*8]
    movhps xm15, [base+subpel_filters+ rX*8]
    psrld m1, 10
    vextracti128 xm7, m1, 1
    vextracti128 xm6, m5, 1
    movq r6, xm1
    pextrq r11, xm1, 1
    movq r9, xm7
    pextrq rX, xm7, 1
    mov r4d, r6d
    shr r6, 32
    mov r10d, r11d
    shr r11, 32
    mov r7d, r9d
    shr r9, 32
    mov r13d, rXd
    shr rX, 32
    pshufd xm4, xm5, q2200
    pshufd xm5, xm5, q3311
    pshufd xm7, xm6, q2200
    pshufd xm6, xm6, q3311
    pblendvb xm12, xm2, xm4
    pblendvb xm13, xm2, xm5
    pblendvb xm14, xm2, xm7
    pblendvb xm15, xm2, xm6
    pmovsxbw m12, xm12
    pmovsxbw m13, xm13
    pmovsxbw m14, xm14
    pmovsxbw m15, xm15
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
    mova [rsp+0x80], m0
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
    mova m0, [rsp+0x80]
    vbroadcasti128 m7, [base+subpel_s_shuf8]
    vpbroadcastd m8, [rsp+0x50]
    vpbroadcastd m9, [rsp+0x54]
    vpbroadcastd m10, [rsp+0x58]
    vpbroadcastd m11, [rsp+0x5c]
    pshufb m0, m7 ; 01a 01b
    pshufb m1, m7 ; 23a 23b
    pshufb m2, m7 ; 45a 45b
    pshufb m3, m7 ; 67a 67b
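; .dy1_vloop: after each stored row, the 01/23/45/67 pairs are rotated
; one source row forward: wswap swaps the word halves of every dword so
; the older row drops out, and the freshly filtered row is blended into
; the 67 pair (pslld 16 + pblendw 0xaa).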
.dy1_vloop:
    pmaddwd m4, m0, m8
    pmaddwd m5, m1, m9
    pmaddwd m6, m2, m10
    pmaddwd m7, m3, m11
    paddd m4, [rsp+0x20]
    paddd m6, m7
    paddd m4, m5
    paddd m4, m6
%if isput
    psrad m4, [rsp+0x48]
    vextracti128 xm5, m4, 1
    packusdw xm4, xm5
    pminsw xm4, [rsp+0xc0]
    mova [dstq], xm4
    add dstq, dsm
%else
    psrad m4, 6
    vextracti128 xm5, m4, 1
    packssdw xm4, xm5
    mova [tmpq], xm4
    add tmpq, tmp_stridem
%endif
    dec hd
    jz .dy1_hloop_prep
    vbroadcasti128 m7, [base+wswap]
    pshufb m0, m7
    pshufb m1, m7
    pshufb m2, m7
    pshufb m3, m7
    movu xm4, [srcq+ r4*2]
    movu xm5, [srcq+ r6*2]
    movu xm6, [srcq+ r7*2]
    movu xm7, [srcq+ r9*2]
    vinserti128 m4, [srcq+r10*2], 1
    vinserti128 m5, [srcq+r11*2], 1
    vinserti128 m6, [srcq+r13*2], 1
    vinserti128 m7, [srcq+ rX*2], 1
    add srcq, ssq
    pmaddwd m4, m12
    pmaddwd m5, m13
    pmaddwd m6, m14
    pmaddwd m7, m15
    phaddd m4, m5
    phaddd m6, m7
    phaddd m4, m6
    paddd m4, [rsp+0x00]
    psrad m4, [rsp+0x40]
    pslld m4, 16
    pblendw m0, m1, 0xaa
    pblendw m1, m2, 0xaa
    pblendw m2, m3, 0xaa
    pblendw m3, m4, 0xaa
    jmp .dy1_vloop
    SWAP m1, m12, m10
    SWAP m7, m11
.dy2:
    movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
    add wq, base_reg
    jmp wq
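; dy == 2048: two source rows per output row, so the vertical history
; advances in whole pairs (mova m0,m1 / m1,m2 / m2,m3) and each
; iteration filters two new rows and interleaves them before the blend.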
%if isput
.dy2_w2:
    mov myd, mym
    movzx t0d, t0b
    sub srcq, 2
    movd xm15, t0d
    punpckldq m8, m9, m8
    paddd m10, m8 ; mx+dx*[0-1]
    vpbroadcastd xm14, [base+pq_0x40000000+2]
    vpbroadcastd xm15, xm15
    pand xm8, xm10, xm6
    psrld xm8, 6
    paddd xm15, xm8
    movd r4d, xm15
    pextrd r6d, xm15, 1
    vbroadcasti128 m5, [base+bdct_lb_q]
    vbroadcasti128 m6, [base+subpel_s_shuf2]
    vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
    vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
    pcmpeqd xm8, xm9
    psrld m10, 10
    paddd m10, m10
    movu xm0, [srcq+ssq*0]
    movu xm1, [srcq+ssq*2]
    movu xm2, [srcq+ssq*4]
    pshufb m10, m5
    paddb m10, m6
    vpblendd xm15, xm4, 0xa
    pblendvb xm15, xm14, xm8
    pmovsxbw m15, xm15
    vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
    vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
    lea srcq, [srcq+ssq*4]
    vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
    lea srcq, [srcq+ssq*2]
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    pshufb m0, m10
    pshufb m1, m10
    pshufb m2, m10
    pmaddwd m0, m15
    pmaddwd m1, m15
    pmaddwd m2, m15
    movq xm6, r4q
    pmovsxbw xm6, xm6
    phaddd m0, m1
    phaddd m1, m2
    paddd m0, m12
    paddd m1, m12
    psrad m0, xm7
    psrad m1, xm7
    packssdw m0, m1 ; 0 2 2 4 1 3 3 5
    vextracti128 xm1, m0, 1
    pshufd xm8, xm6, q0000
    pshufd xm9, xm6, q1111
    pshufd xm14, xm6, q2222
    pshufd xm6, xm6, q3333
    punpcklwd xm2, xm0, xm1 ; 01 23
    punpckhwd xm1, xm0, xm1 ; 23 45
.dy2_w2_loop:
    movu xm3, [srcq+ssq*0]
    movu xm5, [srcq+ssq*2]
    vinserti128 m3, [srcq+ssq*1], 1 ; 6 7
    vinserti128 m5, [srcq+ss3q ], 1 ; 8 9
    lea srcq, [srcq+ssq*4]
    pmaddwd xm4, xm2, xm8
    pmaddwd xm1, xm9
    pshufb m3, m10
    pshufb m5, m10
    pmaddwd m3, m15
    pmaddwd m5, m15
    phaddd m3, m5
    paddd xm4, xm1
    paddd m3, m12
    psrad m3, xm7
    packssdw m3, m3
    pshufd m3, m3, q2100
    palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9
    vextracti128 xm1, m0, 1
    punpcklwd xm2, xm0, xm1 ; 45 67
    punpckhwd xm1, xm0, xm1 ; 67 89
    pmaddwd xm3, xm2, xm14
    pmaddwd xm5, xm1, xm6
    paddd xm4, xm13
    paddd xm4, xm3
    psrldq xm3, xm7, 8
    paddd xm4, xm5
    psrad xm4, xm3
    packusdw xm4, xm4
    pminsw xm4, xm11
    movd [dstq+dsq*0], xm4
    pextrd [dstq+dsq*1], xm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .dy2_w2_loop
    RET
%endif
.dy2_w4:
    mov myd, mym
%if isput
    mova [rsp+0x50], xm11
%endif
    mova [rsp+0x00], m12
    mova [rsp+0x20], m13
    mova [rsp+0x40], xm7
    vbroadcasti128 m7, [base+rescale_mul]
    movzx t0d, t0b
    sub srcq, 2
    movd xm15, t0d
    pmaddwd m8, m7
    vpbroadcastq m2, [base+pq_0x40000000+1]
    vpbroadcastd xm15, xm15
    SWAP m13, m10
    paddd m13, m8 ; mx+dx*[0-3]
    pand m6, m13
    psrld m6, 6
    paddd xm15, xm6
    movd r4d, xm15
    pextrd r6d, xm15, 1
    pextrd r11d, xm15, 2
    pextrd r13d, xm15, 3
    vbroadcasti128 m5, [base+bdct_lb_q+ 0]
    vbroadcasti128 m1, [base+bdct_lb_q+16]
    vbroadcasti128 m4, [base+subpel_s_shuf2]
    vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
    vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
    vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
    vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
    shr myd, 6
    mov r13d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r13q, [base+subpel_filters+myq*8]
    pcmpeqd m6, m9
    punpckldq m11, m6, m6
    punpckhdq m6, m6
    psrld m13, 10
    paddd m13, m13
    vpblendd xm14, xm7, 0xa
    vpblendd xm15, xm8, 0xa
    pmovsxbw m14, xm14
    pmovsxbw m15, xm15
    movq xm10, r13q
    pblendvb m14, m2, m11
    pblendvb m15, m2, m6
    pextrd r4, xm13, 2
    pshufb m12, m13, m5
    pshufb m13, m1
    lea r6, [r4+ssq*1]
    lea r11, [r4+ssq*2]
    lea r13, [r4+ss3q ]
    movu xm0, [srcq+ssq*0]
    movu xm7, [srcq+r4 ]
    movu xm1, [srcq+ssq*1]
    movu xm8, [srcq+r6 ]
    vinserti128 m0, [srcq+ssq*2], 1 ; 0 2
    vinserti128 m7, [srcq+r11 ], 1
    vinserti128 m1, [srcq+ss3q ], 1 ; 1 3
    vinserti128 m8, [srcq+r13 ], 1
    lea srcq, [srcq+ssq*4]
    movu xm2, [srcq+ssq*0]
    movu xm9, [srcq+r4 ]
    vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
    vinserti128 m9, [srcq+r6 ], 1
    lea srcq, [srcq+ssq*2]
    vpbroadcastb m5, xm13
    psubb m13, m5
    paddb m12, m4
    paddb m13, m4
    mova m5, [rsp+0x00]
    movd xm6, [rsp+0x40]
    pshufb m0, m12
    pshufb m1, m12
    pshufb m2, m12
    pmaddwd m0, m14
    pmaddwd m1, m14
    pmaddwd m2, m14
    pshufb m7, m13
    pshufb m8, m13
    pshufb m9, m13
    pmaddwd m7, m15
    pmaddwd m8, m15
    pmaddwd m9, m15
    punpcklqdq xm10, xm10
    pmovsxbw m10, xm10
    phaddd m0, m7
    phaddd m1, m8
    phaddd m2, m9
    paddd m0, m5
    paddd m1, m5
    paddd m2, m5
    psrad m0, xm6
    psrad m1, xm6
    psrad m2, xm6
    vperm2i128 m3, m0, m2, 0x21 ; 2 4
    vperm2i128 m2, m1, 0x13 ; 3 5
    pshufd m7, m10, q0000
    pshufd m8, m10, q1111
    pshufd m9, m10, q2222
    pshufd m10, m10, q3333
    packssdw m0, m3 ; 0 2 2 4
    packssdw m1, m2 ; 1 3 3 5
    punpckhwd m2, m0, m1 ; 23 45
    punpcklwd m0, m1 ; 01 23
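; .dy2_w4 keeps the row history packed two pairs per register (01 23 and
; 23 45); the loop below fetches rows 6-9 (two output rows' worth) in one
; pass, even rows in the low ymm lane and odd rows in the high lane,
; producing the 67 89 pair.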
.dy2_w4_loop:
    movu           xm1, [srcq+ssq*0]
    movu           xm6, [srcq+r4 ]
    movu           xm3, [srcq+ssq*1]
    movu           xm11, [srcq+r6 ]
    vinserti128    m1, [srcq+ssq*2], 1 ; 6 8
    vinserti128    m6, [srcq+r11 ], 1
    vinserti128    m3, [srcq+ss3q ], 1 ; 7 9
    vinserti128    m11, [srcq+r13 ], 1
    lea            srcq, [srcq+ssq*4]
    pmaddwd        m4, m0, m7
    pmaddwd        m5, m2, m8
    pshufb         m1, m12
    pshufb         m3, m12
    pmaddwd        m1, m14
    pmaddwd        m3, m14
    mova           m0, [rsp+0x00]
    pshufb         m6, m13
    pshufb         m11, m13
    pmaddwd        m6, m15
    pmaddwd        m11, m15
    paddd          m4, m5
    movd           xm5, [rsp+0x40]
    phaddd         m1, m6
    phaddd         m3, m11
    paddd          m1, m0
    paddd          m3, m0
    psrad          m1, xm5
    psrad          m3, xm5
    pslld          m3, 16
    pblendw        m1, m3, 0xaa ; 67 89
    vperm2i128     m0, m2, m1, 0x21 ; 45 67
    paddd          m4, [rsp+0x20]
    mova           m2, m1
    pmaddwd        m5, m0, m9
    pmaddwd        m6, m2, m10
    paddd          m4, m5
    paddd          m4, m6
%if isput
    psrad          m4, [rsp+0x48]
    vextracti128   xm5, m4, 1
    packusdw       xm4, xm5
    pminsw         xm4, [rsp+0x50]
    movq           [dstq+dsq*0], xm4
    movhps         [dstq+dsq*1], xm4
    lea            dstq, [dstq+dsq*2]
%else
    psrad          m4, 6
    vextracti128   xm5, m4, 1
    packssdw       xm4, xm5
    mova           [tmpq], xm4
    add            tmpq, 16
%endif
    sub            hd, 2
    jg             .dy2_w4_loop
    MC_8TAP_SCALED_RET
    SWAP           m10, m13
.dy2_w8:
    mov            dword [rsp+0xa0], 1
    movifprep      tmp_stridem, 16
    jmp            .dy2_w_start
.dy2_w16:
    mov            dword [rsp+0xa0], 2
    movifprep      tmp_stridem, 32
    jmp            .dy2_w_start
.dy2_w32:
    mov            dword [rsp+0xa0], 4
    movifprep      tmp_stridem, 64
    jmp            .dy2_w_start
.dy2_w64:
    mov            dword [rsp+0xa0], 8
    movifprep      tmp_stridem, 128
    jmp            .dy2_w_start
.dy2_w128:
    mov            dword [rsp+0xa0], 16
    movifprep      tmp_stridem, 256
.dy2_w_start:
    SWAP           m10, m12, m1
    SWAP           m11, m7
    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
    mov            myd, mym
%if isput
    movifnidn      dsm, dsq
    mova           [rsp+0xc0], xm7
%endif
    mova           [rsp+0x00], m10
    mova           [rsp+0x20], m13
    mova           [rsp+0x40], xm11
    shr            t0d, 16
    sub            srcq, 6
    shr            myd, 6
    mov            r4d, 64 << 24
    lea            myd, [t1+myq]
    cmovnz         r4q, [base+subpel_filters+myq*8]
    pmaddwd        m8, [base+rescale_mul2]
    movd           xm15, t0d
    mov            [rsp+0xa4], t0d
    mov            [rsp+0xa8], srcq
    mov            [rsp+0xb0], r0q ; dstq / tmpq
%if UNIX64
    mov            hm, hd
%endif
    shl            dword dxm, 3 ; dx*8
    vpbroadcastd   m15, xm15
    paddd          m1, m8 ; mx+dx*[0-7]
    movq           xm0, r4q
    pmovsxbw       xm0, xm0
    mova           [rsp+0x50], xm0
    jmp            .dy2_hloop
.dy2_hloop_prep:
    dec            dword [rsp+0xa0]
    jz             .ret
    add            qword [rsp+0xb0], 16
    mov            hd, hm
    vpbroadcastd   m8, dxm
    vpbroadcastd   m6, [base+pd_0x3ff]
    paddd          m1, m8, [rsp+0x60]
    vpbroadcastd   m15, [rsp+0xa4]
    pxor           m9, m9
    mov            srcq, [rsp+0xa8]
    mov            r0q, [rsp+0xb0] ; dstq / tmpq
    mova           m10, [rsp+0x00]
    mova           xm11, [rsp+0x40]
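; For w >= 8 the dy2 path works on 8-column slices: .dy2_hloop gathers the
; per-column 8-tap filters selected by the dword indices in m15, runs the
; horizontal phase through MC_8TAP_SCALED_H, then drops into .dy2_vloop.
; .dy2_hloop_prep above reloads the per-slice state between slices; the
; dword at [rsp+0xa0] counts the remaining slices and [rsp+0xb0] tracks
; the output pointer, advanced by 16 bytes per slice.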
.dy2_hloop:
    vpbroadcastq   xm2, [base+pq_0x40000000]
    pand           m5, m1, m6
    psrld          m5, 6
    paddd          m15, m5
    pcmpeqd        m5, m9
    vextracti128   xm7, m15, 1
    movq           r6, xm15
    pextrq         r9, xm15, 1
    movq           r11, xm7
    pextrq         rX, xm7, 1
    mov            r4d, r6d
    shr            r6, 32
    mov            r7d, r9d
    shr            r9, 32
    mov            r10d, r11d
    shr            r11, 32
    mov            r13d, rXd
    shr            rX, 32
    mova           [rsp+0x60], m1
    movq           xm12, [base+subpel_filters+ r4*8]
    movq           xm13, [base+subpel_filters+ r6*8]
    movhps         xm12, [base+subpel_filters+ r7*8]
    movhps         xm13, [base+subpel_filters+ r9*8]
    movq           xm14, [base+subpel_filters+r10*8]
    movq           xm15, [base+subpel_filters+r11*8]
    movhps         xm14, [base+subpel_filters+r13*8]
    movhps         xm15, [base+subpel_filters+ rX*8]
    psrld          m1, 10
    vextracti128   xm7, m1, 1
    vextracti128   xm6, m5, 1
    movq           r6, xm1
    pextrq         r11, xm1, 1
    movq           r9, xm7
    pextrq         rX, xm7, 1
    mov            r4d, r6d
    shr            r6, 32
    mov            r10d, r11d
    shr            r11, 32
    mov            r7d, r9d
    shr            r9, 32
    mov            r13d, rXd
    shr            rX, 32
    pshufd         xm4, xm5, q2200
    pshufd         xm5, xm5, q3311
    pshufd         xm7, xm6, q2200
    pshufd         xm6, xm6, q3311
    pblendvb       xm12, xm2, xm4
    pblendvb       xm13, xm2, xm5
    pblendvb       xm14, xm2, xm7
    pblendvb       xm15, xm2, xm6
    pmovsxbw       m12, xm12
    pmovsxbw       m13, xm13
    pmovsxbw       m14, xm14
    pmovsxbw       m15, xm15
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
    mova           [rsp+0x80], m0
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
    mova           m0, [rsp+0x80]
    vbroadcasti128 m7, [base+subpel_s_shuf8]
    vpbroadcastd   m8, [rsp+0x50]
    vpbroadcastd   m9, [rsp+0x54]
    vpbroadcastd   m10, [rsp+0x58]
    vpbroadcastd   m11, [rsp+0x5c]
    pshufb         m0, m7 ; 01a 01b
    pshufb         m1, m7 ; 23a 23b
    pshufb         m2, m7 ; 45a 45b
    pshufb         m3, m7 ; 67a 67b
.dy2_vloop:
    pmaddwd        m4, m0, m8
    pmaddwd        m5, m1, m9
    pmaddwd        m6, m2, m10
    pmaddwd        m7, m3, m11
    paddd          m4, [rsp+0x20]
    paddd          m6, m7
    paddd          m4, m5
    paddd          m4, m6
%if isput
    psrad          m4, [rsp+0x48]
    vextracti128   xm5, m4, 1
    packusdw       xm4, xm5
    pminsw         xm4, [rsp+0xc0]
    mova           [dstq], xm4
    add            dstq, dsm
%else
    psrad          m4, 6
    vextracti128   xm5, m4, 1
    packssdw       xm4, xm5
    mova           [tmpq], xm4
    add            tmpq, tmp_stridem
%endif
    dec            hd
    jz             .dy2_hloop_prep
    mova           m0, m1
    mova           m1, m2
    mova           m2, m3
    movu           xm3, [srcq+ r4*2]
    movu           xm4, [srcq+ r6*2]
    movu           xm5, [srcq+ r7*2]
    movu           xm6, [srcq+ r9*2]
    vinserti128    m3, [srcq+r10*2], 1
    vinserti128    m4, [srcq+r11*2], 1
    vinserti128    m5, [srcq+r13*2], 1
    vinserti128    m6, [srcq+ rX*2], 1
    add            srcq, ssq
    pmaddwd        m3, m12
    pmaddwd        m4, m13
    pmaddwd        m5, m14
    pmaddwd        m6, m15
    phaddd         m3, m4
    phaddd         m5, m6
    phaddd         m3, m5
    movu           xm4, [srcq+ r4*2]
    movu           xm5, [srcq+ r6*2]
    movu           xm6, [srcq+ r7*2]
    movu           xm7, [srcq+ r9*2]
    vinserti128    m4, [srcq+r10*2], 1
    vinserti128    m5, [srcq+r11*2], 1
    vinserti128    m6, [srcq+r13*2], 1
    vinserti128    m7, [srcq+ rX*2], 1
    add            srcq, ssq
    pmaddwd        m4, m12
    pmaddwd        m5, m13
    pmaddwd        m6, m14
    pmaddwd        m7, m15
    phaddd         m4, m5
    phaddd         m6, m7
    mova           m5, [rsp+0x00]
    movd           xm7, [rsp+0x40]
    phaddd         m4, m6
    paddd          m3, m5
    paddd          m4, m5
    psrad          m3, xm7
    psrad          m4, xm7
    pslld          m4, 16
    pblendw        m3, m4, 0xaa
    jmp            .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%undef isput
%undef isprep
%endmacro

%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_16bpc
    mov            t0d, (5*15 << 16) | 5*15
    mov            t1d, t0d
    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

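; Entry-point stubs for every horizontal/vertical filter combination. Each
; *_8TAP_SCALED_FN line expands the shared FN macro to record the filter
; types in t0d/t1d before falling into the common body emitted by
; MC_8TAP_SCALED below; BILIN_SCALED_FN routes scaled bilinear calls into
; the same body with a fixed filter-type constant.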
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

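; WARP_V performs one output row of the warp vertical pass: it loads four
; pairs of 8-tap filters (a..h, indexed by my, which steps by delta per
; column and by gamma per row), interleaves them against the zero register
; m11, and accumulates four pmaddwd partial sums into m%1 while rotating
; the 01/23/45/67 row-pair registers forward.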
%macro WARP_V 5 ; dst, 01, 23, 45, 67
    lea            tmp1d, [myq+deltaq*4]
    lea            tmp2d, [myq+deltaq*1]
    shr            myd, 10
    shr            tmp1d, 10
    movq           xm8, [filterq+myq *8]
    vinserti128    m8, [filterq+tmp1q*8], 1 ; a e
    lea            tmp1d, [tmp2q+deltaq*4]
    lea            myd, [tmp2q+deltaq*1]
    shr            tmp2d, 10
    shr            tmp1d, 10
    movq           xm0, [filterq+tmp2q*8]
    vinserti128    m0, [filterq+tmp1q*8], 1 ; b f
    lea            tmp1d, [myq+deltaq*4]
    lea            tmp2d, [myq+deltaq*1]
    shr            myd, 10
    shr            tmp1d, 10
    movq           xm9, [filterq+myq *8]
    vinserti128    m9, [filterq+tmp1q*8], 1 ; c g
    lea            tmp1d, [tmp2q+deltaq*4]
    lea            myd, [tmp2q+gammaq] ; my += gamma
    punpcklwd      m8, m0
    shr            tmp2d, 10
    shr            tmp1d, 10
    movq           xm0, [filterq+tmp2q*8]
    vinserti128    m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd      m0, m9, m0
    punpckldq      m9, m8, m0
    punpckhdq      m0, m8, m0
    punpcklbw      m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    punpckhbw      m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    pmaddwd        m%2, m8
    pmaddwd        m9, m%3
    punpcklbw      m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    punpckhbw      m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd        m8, m%4
    pmaddwd        m0, m%5
    paddd          m9, m%2
    mova           m%2, m%3
    paddd          m0, m8
    mova           m%3, m%4
    mova           m%4, m%5
    paddd          m%1, m0, m9
%endmacro

cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
    mov            r6d, r7m
    lea            r9, [$$]
    shr            r6d, 11
    vpbroadcastd   m13, [r9-$$+warp8x8_shift+r6*4]
    vpbroadcastd   m14, [warp8x8t_rnd]
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
    jmp            .start
.loop:
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
    lea            tmpq, [tmpq+tsq*4]
.start:
    paddd          m7, m14
    paddd          m0, m14
    psrad          m7, 15
    psrad          m0, 15
    packssdw       m7, m0
    vpermq         m7, m7, q3120
    mova           [tmpq+tsq*0], xm7
    vextracti128   [tmpq+tsq*2], m7, 1
    dec            r4d
    jg             .loop
.end:
    RET

cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
                                          alpha, beta, filter, tmp1, delta, \
                                          my, gamma
    mov            r6d, r7m
    lea            filterq, [$$]
    shr            r6d, 11
    vpbroadcastd   m13, [filterq-$$+warp8x8_shift+r6*4]
    vpbroadcastd   m14, [filterq-$$+warp8x8_rnd +r6*4]
    vpbroadcastw   m15, r7m ; pixel_max
    call           .main
    jmp            .start
.loop:
    call           .main2
    lea            dstq, [dstq+dsq*2]
.start:
    psrad          m7, 16
    psrad          m0, 16
    packusdw       m7, m0
    pmulhrsw       m7, m14
    pminsw         m7, m15
    vpermq         m7, m7, q3120
    mova           [dstq+dsq*0], xm7
    vextracti128   [dstq+dsq*1], m7, 1
    dec            r4d
    jg             .loop
.end:
    RET
ALIGN function_align
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov            abcdq, r5m
    mov            mxd, r6m
%endif
    movsx          alphad, word [abcdq+2*0]
    movsx          betad, word [abcdq+2*1]
    vpbroadcastd   m12, [pd_32768]
    pxor           m11, m11
    add            filterq, mc_warp_filter-$$
    lea            tmp1q, [ssq*3]
    add            mxd, 512+(64<<10)
    lea            tmp2d, [alphaq*3]
    sub            srcq, tmp1q  ; src -= src_stride*3
    sub            betad, tmp2d ; beta -= alpha*3
    mov            myd, r7m
    call           .h
    psrld          m1, m0, 16
    call           .h
    pblendw        m1, m0, 0xaa ; 01
    psrld          m2, m0, 16
    call           .h
    pblendw        m2, m0, 0xaa ; 12
    psrld          m3, m0, 16
    call           .h
    pblendw        m3, m0, 0xaa ; 23
    psrld          m4, m0, 16
    call           .h
    pblendw        m4, m0, 0xaa ; 34
    psrld          m5, m0, 16
    call           .h
    pblendw        m5, m0, 0xaa ; 45
    psrld          m6, m0, 16
    call           .h
    pblendw        m6, m0, 0xaa ; 56
    movsx          deltad, word [abcdq+2*2]
    movsx          gammad, word [abcdq+2*3]
    add            myd, 512+(64<<10)
    mov            r4d, 4
    lea            tmp1d, [deltaq*3]
    sub            gammad, tmp1d ; gamma -= delta*3
.main2:
    call           .h
    psrld          m7, m6, 16
    pblendw        m7, m0, 0xaa ; 67
    WARP_V         7, 1, 3, 5, 7
    call           .h
    psrld          m10, m5, 16
    pblendw        m10, m0, 0xaa ; 78
    WARP_V         0, 2, 4, 6, 10
    ret
ALIGN function_align
.h:
    lea            tmp1d, [mxq+alphaq*4]
    lea            tmp2d, [mxq+alphaq*1]
    movu           xm10, [srcq-6]
    vinserti128    m10, [srcq+2], 1
    shr            mxd, 10   ; 0
    shr            tmp1d, 10 ; 4
    movq           xm0, [filterq+mxq *8]
    vinserti128    m0, [filterq+tmp1q*8], 1
    lea            tmp1d, [tmp2q+alphaq*4]
    lea            mxd, [tmp2q+alphaq*1]
    movu           xm8, [srcq-4]
    vinserti128    m8, [srcq+4], 1
    shr            tmp2d, 10 ; 1
    shr            tmp1d, 10 ; 5
    movq           xm9, [filterq+tmp2q*8]
    vinserti128    m9, [filterq+tmp1q*8], 1
    lea            tmp1d, [mxq+alphaq*4]
    lea            tmp2d, [mxq+alphaq*1]
    shr            mxd, 10   ; 2
    shr            tmp1d, 10 ; 6
    punpcklbw      m0, m11, m0
    pmaddwd        m0, m10
    movu           xm10, [srcq-2]
    vinserti128    m10, [srcq+6], 1
    punpcklbw      m9, m11, m9
    pmaddwd        m9, m8
    movq           xm8, [filterq+mxq *8]
    vinserti128    m8, [filterq+tmp1q*8], 1
    lea            tmp1d, [tmp2q+alphaq*4]
    lea            mxd, [tmp2q+betaq] ; mx += beta
    phaddd         m0, m9 ; 0 1 4 5
    movu           xm9, [srcq+0]
    vinserti128    m9, [srcq+8], 1
    shr            tmp2d, 10 ; 3
    shr            tmp1d, 10 ; 7
    punpcklbw      m8, m11, m8
    pmaddwd        m8, m10
    movq           xm10, [filterq+tmp2q*8]
    vinserti128    m10, [filterq+tmp1q*8], 1
    punpcklbw      m10, m11, m10
    pmaddwd        m9, m10
    add            srcq, ssq
    phaddd         m8, m9 ; 2 3 6 7
    phaddd         m0, m8 ; 0 1 2 3 4 5 6 7
    vpsllvd        m0, m13
    paddd          m0, m12 ; rounded 14-bit result in upper 16 bits of dword
    ret

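; BIDIR_FN emits the per-width store loops shared by the bidirectional
; blend functions below (avg, w_avg, mask): each .main call yields four
; registers of finished pixels, and the blocks here only scatter them to
; dst, so the functions differ solely in their .main bodies.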
%macro BIDIR_FN 0
    call           .main
    lea            stride3q, [strideq*3]
    jmp            wq
.w4:
    movq           [dstq          ], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti128   xm0, m0, 1
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q ], xm0
    cmp            hd, 4
    je             .ret
    lea            dstq, [dstq+strideq*4]
    movq           [dstq          ], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti128   xm1, m1, 1
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q ], xm1
    cmp            hd, 8
    je             .ret
    lea            dstq, [dstq+strideq*4]
    movq           [dstq          ], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti128   xm2, m2, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q ], xm2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq          ], xm3
    movhps         [dstq+strideq*1], xm3
    vextracti128   xm3, m3, 1
    movq           [dstq+strideq*2], xm3
    movhps         [dstq+stride3q ], xm3
.ret:
    RET
.w8:
    mova           [dstq+strideq*0], xm0
    vextracti128   [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], xm1
    vextracti128   [dstq+stride3q ], m1, 1
    cmp            hd, 4
    jne            .w8_loop_start
    RET
.w8_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    mova           [dstq+strideq*0], xm0
    vextracti128   [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], xm1
    vextracti128   [dstq+stride3q ], m1, 1
.w8_loop_start:
    lea            dstq, [dstq+strideq*4]
    mova           [dstq+strideq*0], xm2
    vextracti128   [dstq+strideq*1], m2, 1
    mova           [dstq+strideq*2], xm3
    vextracti128   [dstq+stride3q ], m3, 1
    sub            hd, 8
    jg             .w8_loop
    RET
.w16_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
.w16:
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m1
    mova           [dstq+strideq*2], m2
    mova           [dstq+stride3q ], m3
    sub            hd, 4
    jg             .w16_loop
    RET
.w32_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
.w32:
    mova           [dstq+strideq*0+32*0], m0
    mova           [dstq+strideq*0+32*1], m1
    mova           [dstq+strideq*1+32*0], m2
    mova           [dstq+strideq*1+32*1], m3
    sub            hd, 2
    jg             .w32_loop
    RET
.w64_loop:
    call           .main
    add            dstq, strideq
.w64:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    mova           [dstq+32*2], m2
    mova           [dstq+32*3], m3
    dec            hd
    jg             .w64_loop
    RET
.w128_loop:
    call           .main
    add            dstq, strideq
.w128:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    mova           [dstq+32*2], m2
    mova           [dstq+32*3], m3
    call           .main
    mova           [dstq+32*4], m0
    mova           [dstq+32*5], m1
    mova           [dstq+32*6], m2
    mova           [dstq+32*7], m3
    dec            hd
    jg             .w128_loop
    RET
%endmacro

%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx2_table
    lea            r6, [avg_avx2_table]
    tzcnt          wd, wm
    mov            t0d, r6m ; pixel_max
    movsxd         wq, [r6+wq*4]
    shr            t0d, 11
    vpbroadcastd   m4, [base+bidir_rnd+t0*4]
    vpbroadcastd   m5, [base+bidir_mul+t0*4]
    movifnidn      hd, hm
    add            wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    mova           m0, [tmp1q+32*0]
    paddsw         m0, [tmp2q+32*0]
    mova           m1, [tmp1q+32*1]
    paddsw         m1, [tmp2q+32*1]
    mova           m2, [tmp1q+32*2]
    paddsw         m2, [tmp2q+32*2]
    mova           m3, [tmp1q+32*3]
    paddsw         m3, [tmp2q+32*3]
    add            tmp1q, 32*4
    add            tmp2q, 32*4
    pmaxsw         m0, m4
    pmaxsw         m1, m4
    pmaxsw         m2, m4
    pmaxsw         m3, m4
    psubsw         m0, m4
    psubsw         m1, m4
    psubsw         m2, m4
    psubsw         m3, m4
    pmulhw         m0, m5
    pmulhw         m1, m5
    pmulhw         m2, m5
    pmulhw         m3, m5
    ret

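; w_avg computes (t1*weight + t2*(16-weight) + rnd) >> 8 via pmaddwd on
; word-interleaved inputs; m6 holds (16-weight, weight) word pairs. When
; r7m bit 11 is set (12 bpc) the pair is pre-scaled by 4 (the rorx << 2),
; which presumably compensates for the two fewer fractional bits in the
; 12 bpc prep intermediates (prep_mul is 16 for 10 bpc vs 4 for 12 bpc),
; so the same fixed >> 8 applies to both bit depths.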
cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
    lea            r6, [w_avg_avx2_table]
    tzcnt          wd, wm
    mov            t0d, r6m ; weight
    vpbroadcastw   m8, r7m  ; pixel_max
    vpbroadcastd   m7, [r6-w_avg_avx2_table+pd_65538]
    movsxd         wq, [r6+wq*4]
    paddw          m7, m8
    add            wq, r6
    lea            r6d, [t0-16]
    shl            t0d, 16
    sub            t0d, r6d ; 16-weight, weight
    pslld          m7, 7
    rorx           r6d, t0d, 30 ; << 2
    test           dword r7m, 0x800
    cmovz          r6d, t0d
    movifnidn      hd, hm
    movd           xm6, r6d
    vpbroadcastd   m6, xm6
    BIDIR_FN
ALIGN function_align
.main:
    mova           m4, [tmp1q+32*0]
    mova           m0, [tmp2q+32*0]
    punpckhwd      m5, m0, m4
    punpcklwd      m0, m4
    mova           m4, [tmp1q+32*1]
    mova           m1, [tmp2q+32*1]
    pmaddwd        m5, m6
    pmaddwd        m0, m6
    paddd          m5, m7
    paddd          m0, m7
    psrad          m5, 8
    psrad          m0, 8
    packusdw       m0, m5
    punpckhwd      m5, m1, m4
    punpcklwd      m1, m4
    mova           m4, [tmp1q+32*2]
    mova           m2, [tmp2q+32*2]
    pmaddwd        m5, m6
    pmaddwd        m1, m6
    paddd          m5, m7
    paddd          m1, m7
    psrad          m5, 8
    psrad          m1, 8
    packusdw       m1, m5
    punpckhwd      m5, m2, m4
    punpcklwd      m2, m4
    mova           m4, [tmp1q+32*3]
    mova           m3, [tmp2q+32*3]
    add            tmp1q, 32*4
    add            tmp2q, 32*4
    pmaddwd        m5, m6
    pmaddwd        m2, m6
    paddd          m5, m7
    paddd          m2, m7
    psrad          m5, 8
    psrad          m2, 8
    packusdw       m2, m5
    punpckhwd      m5, m3, m4
    punpcklwd      m3, m4
    pmaddwd        m5, m6
    pmaddwd        m3, m6
    paddd          m5, m7
    paddd          m3, m7
    psrad          m5, 8
    psrad          m3, 8
    packusdw       m3, m5
    pminsw         m0, m8
    pminsw         m1, m8
    pminsw         m2, m8
    pminsw         m3, m8
    ret

cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx2_table
    lea            r7, [mask_avx2_table]
    tzcnt          wd, wm
    mov            r6d, r7m ; pixel_max
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m8, [base+pw_64]
    vpbroadcastd   m9, [base+bidir_rnd+r6*4]
    vpbroadcastd   m10, [base+bidir_mul+r6*4]
    mov            maskq, maskmp
    add            wq, r7
    BIDIR_FN
ALIGN function_align
.main:
%macro MASK 1
    pmovzxbw       m5, [maskq+16*%1]
    mova           m%1, [tmp1q+32*%1]
    mova           m6, [tmp2q+32*%1]
    punpckhwd      m4, m%1, m6
    punpcklwd      m%1, m6
    psubw          m7, m8, m5
    punpckhwd      m6, m5, m7 ; m, 64-m
    punpcklwd      m5, m7
    pmaddwd        m4, m6 ; tmp1 * m + tmp2 * (64-m)
    pmaddwd        m%1, m5
    psrad          m4, 5
    psrad          m%1, 5
    packssdw       m%1, m4
    pmaxsw         m%1, m9
    psubsw         m%1, m9
    pmulhw         m%1, m10
%endmacro
    MASK           0
    MASK           1
    MASK           2
    MASK           3
    add            maskq, 16*4
    add            tmp1q, 32*4
    add            tmp2q, 32*4
    ret

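; The w_mask_* functions blend like mask_16bpc but derive the 6-bit mask
; from the sample difference itself, roughly m = min(38 + (|t1-t2| >> sh),
; 64); pw_27615 (((64-38) << 10) + 1023 - 32) is the constant behind the
; psubusw/psrlw pair in W_MASK below. w_mask_420 additionally sums each
; 2x2 mask quad and stores (sum + 2 - sign) >> 2 as one byte per chroma
; position.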
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea            r7, [w_mask_420_avx2_table]
    tzcnt          wd, wm
    mov            r6d, r8m ; pixel_max
    movd           xm0, r7m ; sign
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd   m11, [base+pw_64]
    vpbroadcastd   m12, [base+bidir_rnd+r6*4]
    vpbroadcastd   m13, [base+bidir_mul+r6*4]
    movd           xm14, [base+pw_2]
    mov            maskq, maskmp
    psubw          xm14, xm0
    vpbroadcastw   m14, xm14
    add            wq, r7
    call           .main
    lea            stride3q, [strideq*3]
    jmp            wq
.w4:
    phaddd         m4, m5
    paddw          m4, m14
    psrlw          m4, 2
    packuswb       m4, m4
    vextracti128   xm5, m4, 1
    punpcklwd      xm4, xm5
    movq           [dstq+strideq*0], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti128   xm0, m0, 1
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q ], xm0
    mova           [maskq], xm4
    cmp            hd, 8
    jl             .w4_end
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti128   xm1, m1, 1
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q ], xm1
    je             .w4_end
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti128   xm2, m2, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q ], xm2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm3
    movhps         [dstq+strideq*1], xm3
    vextracti128   xm3, m3, 1
    movq           [dstq+strideq*2], xm3
    movhps         [dstq+stride3q ], xm3
.w4_end:
    RET
.w8_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    add            maskq, 16
.w8:
    vperm2i128     m6, m4, m5, 0x21
    vpblendd       m4, m5, 0xf0
    paddw          m4, m14
    paddw          m4, m6
    psrlw          m4, 2
    vextracti128   xm5, m4, 1
    packuswb       xm4, xm5
    mova           [dstq+strideq*0], xm0
    vextracti128   [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], xm1
    vextracti128   [dstq+stride3q ], m1, 1
    mova           [maskq], xm4
    sub            hd, 8
    jl             .w8_end
    lea            dstq, [dstq+strideq*4]
    mova           [dstq+strideq*0], xm2
    vextracti128   [dstq+strideq*1], m2, 1
    mova           [dstq+strideq*2], xm3
    vextracti128   [dstq+stride3q ], m3, 1
    jg             .w8_loop
.w8_end:
    RET
.w16_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    add            maskq, 16
.w16:
    punpcklqdq     m6, m4, m5
    punpckhqdq     m4, m5
    paddw          m6, m14
    paddw          m4, m6
    psrlw          m4, 2
    vextracti128   xm5, m4, 1
    packuswb       xm4, xm5
    pshufd         xm4, xm4, q3120
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m1
    mova           [dstq+strideq*2], m2
    mova           [dstq+stride3q ], m3
    mova           [maskq], xm4
    sub            hd, 4
    jg             .w16_loop
    RET
.w32_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
    add            maskq, 32
.w32:
    paddw          m4, m14
    paddw          m4, m5
    psrlw          m15, m4, 2
    mova           [dstq+strideq*0+32*0], m0
    mova           [dstq+strideq*0+32*1], m1
    mova           [dstq+strideq*1+32*0], m2
    mova           [dstq+strideq*1+32*1], m3
    call           .main
    mova           m6, [deint_shuf]
    paddw          m4, m14
    paddw          m4, m5
    psrlw          m4, 2
    packuswb       m15, m4
    vpermd         m4, m6, m15
    mova           [dstq+strideq*2+32*0], m0
    mova           [dstq+strideq*2+32*1], m1
    mova           [dstq+stride3q +32*0], m2
    mova           [dstq+stride3q +32*1], m3
    mova           [maskq], m4
    sub            hd, 4
    jg             .w32_loop
    RET
.w64_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
    add            maskq, 32
.w64:
    paddw          m4, m14
    paddw          m15, m14, m5
    mova           [dstq+strideq*0+32*0], m0
    mova           [dstq+strideq*0+32*1], m1
    mova           [dstq+strideq*0+32*2], m2
    mova           [dstq+strideq*0+32*3], m3
    mova           [maskq], m4 ; no available registers
    call           .main
    paddw          m4, [maskq]
    mova           m6, [deint_shuf]
    paddw          m5, m15
    psrlw          m4, 2
    psrlw          m5, 2
    packuswb       m4, m5 ; 0 2 4 6 1 3 5 7
    vpermd         m4, m6, m4
    mova           [dstq+strideq*1+32*0], m0
    mova           [dstq+strideq*1+32*1], m1
    mova           [dstq+strideq*1+32*2], m2
    mova           [dstq+strideq*1+32*3], m3
    mova           [maskq], m4
    sub            hd, 2
    jg             .w64_loop
    RET
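; .w128 needs more mask accumulators than there are free vector registers,
; so intermediate mask sums are parked in [maskq] and in the not yet
; written next dst row at [dstq+strideq], then reloaded one .main call
; later; the same "no available registers" trick as in .w64 above.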
.w128_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
    add            maskq, 64
.w128:
    paddw          m4, m14
    paddw          m5, m14
    mova           [dstq+strideq*0+32*0], m0
    mova           [dstq+strideq*0+32*1], m1
    mova           [dstq+strideq*0+32*2], m2
    mova           [dstq+strideq*0+32*3], m3
    mova           [maskq+32*0], m4
    mova           [dstq+strideq], m5
    call           .main
    paddw          m4, m14
    paddw          m15, m14, m5
    mova           [dstq+strideq*0+32*4], m0
    mova           [dstq+strideq*0+32*5], m1
    mova           [dstq+strideq*0+32*6], m2
    mova           [dstq+strideq*0+32*7], m3
    mova           [maskq+32*1], m4
    call           .main
    paddw          m4, [maskq+32*0]
    paddw          m5, [dstq+strideq]
    mova           m6, [deint_shuf]
    psrlw          m4, 2
    psrlw          m5, 2
    packuswb       m4, m5
    vpermd         m4, m6, m4
    mova           [dstq+strideq*1+32*0], m0
    mova           [dstq+strideq*1+32*1], m1
    mova           [dstq+strideq*1+32*2], m2
    mova           [dstq+strideq*1+32*3], m3
    mova           [maskq+32*0], m4
    call           .main
    paddw          m4, [maskq+32*1]
    mova           m6, [deint_shuf]
    paddw          m5, m15
    psrlw          m4, 2
    psrlw          m5, 2
    packuswb       m4, m5
    vpermd         m4, m6, m4
    mova           [dstq+strideq*1+32*4], m0
    mova           [dstq+strideq*1+32*5], m1
    mova           [dstq+strideq*1+32*6], m2
    mova           [dstq+strideq*1+32*7], m3
    mova           [maskq+32*1], m4
    sub            hd, 2
    jg             .w128_loop
    RET
ALIGN function_align
.main:
%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
    mova           m%1, [tmp1q+32*%1]
    mova           m%2, [tmp2q+32*%1]
    punpcklwd      m8, m%2, m%1
    punpckhwd      m9, m%2, m%1
    psubsw         m%1, m%2
    pabsw          m%1, m%1
    psubusw        m7, m10, m%1
    psrlw          m7, 10       ; 64-m
    psubw          m%2, m%3, m7 ; m
    punpcklwd      m%1, m7, m%2
    punpckhwd      m7, m%2
    pmaddwd        m%1, m8
    pmaddwd        m7, m9
    psrad          m%1, 5
    psrad          m7, 5
    packssdw       m%1, m7
    pmaxsw         m%1, m%4
    psubsw         m%1, m%4
    pmulhw         m%1, m%5
%endmacro
    W_MASK         0, 4
    W_MASK         1, 5
    phaddw         m4, m5
    W_MASK         2, 5
    W_MASK         3, 6
    phaddw         m5, m6
    add            tmp1q, 32*4
    add            tmp2q, 32*4
    ret

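; w_mask_422 stores one mask byte per horizontal pixel pair: the phaddw'd
; pair sums are packed to bytes and averaged as (m0 + m1 + 1 - sign) >> 1
; via the psubb/pavgb-against-zero sequence in .main, then permuted with
; deint_shuf before the store.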
cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
    lea            r7, [w_mask_422_avx2_table]
    tzcnt          wd, wm
    mov            r6d, r8m ; pixel_max
    vpbroadcastb   m14, r7m ; sign
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m10, [base+pw_27615]
    vpbroadcastd   m11, [base+pw_64]
    vpbroadcastd   m12, [base+bidir_rnd+r6*4]
    vpbroadcastd   m13, [base+bidir_mul+r6*4]
    mova           m15, [base+deint_shuf]
    mov            maskq, maskmp
    add            wq, r7
    call           .main
    lea            stride3q, [strideq*3]
    jmp            wq
.w4:
    movq           [dstq+strideq*0], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti128   xm0, m0, 1
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q ], xm0
    cmp            hd, 8
    jl             .w4_end
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti128   xm1, m1, 1
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q ], xm1
    je             .w4_end
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm2
    movhps         [dstq+strideq*1], xm2
    vextracti128   xm2, m2, 1
    movq           [dstq+strideq*2], xm2
    movhps         [dstq+stride3q ], xm2
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm3
    movhps         [dstq+strideq*1], xm3
    vextracti128   xm3, m3, 1
    movq           [dstq+strideq*2], xm3
    movhps         [dstq+stride3q ], xm3
.w4_end:
    RET
.w8_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
.w8:
    mova           [dstq+strideq*0], xm0
    vextracti128   [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], xm1
    vextracti128   [dstq+stride3q ], m1, 1
    sub            hd, 8
    jl             .w8_end
    lea            dstq, [dstq+strideq*4]
    mova           [dstq+strideq*0], xm2
    vextracti128   [dstq+strideq*1], m2, 1
    mova           [dstq+strideq*2], xm3
    vextracti128   [dstq+stride3q ], m3, 1
    jg             .w8_loop
.w8_end:
    RET
.w16_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
.w16:
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m1
    mova           [dstq+strideq*2], m2
    mova           [dstq+stride3q ], m3
    sub            hd, 4
    jg             .w16_loop
    RET
.w32_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
.w32:
    mova           [dstq+strideq*0+32*0], m0
    mova           [dstq+strideq*0+32*1], m1
    mova           [dstq+strideq*1+32*0], m2
    mova           [dstq+strideq*1+32*1], m3
    sub            hd, 2
    jg             .w32_loop
    RET
.w64_loop:
    call           .main
    add            dstq, strideq
.w64:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    mova           [dstq+32*2], m2
    mova           [dstq+32*3], m3
    dec            hd
    jg             .w64_loop
    RET
.w128_loop:
    call           .main
    add            dstq, strideq
.w128:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    mova           [dstq+32*2], m2
    mova           [dstq+32*3], m3
    call           .main
    mova           [dstq+32*4], m0
    mova           [dstq+32*5], m1
    mova           [dstq+32*6], m2
    mova           [dstq+32*7], m3
    dec            hd
    jg             .w128_loop
    RET
ALIGN function_align
.main:
    W_MASK         0, 4
    W_MASK         1, 5
    phaddw         m4, m5
    W_MASK         2, 5
    W_MASK         3, 6
    phaddw         m5, m6
    add            tmp1q, 32*4
    add            tmp2q, 32*4
    packuswb       m4, m5
    pxor           m5, m5
    psubb          m4, m14
    pavgb          m4, m5
    vpermd         m4, m15, m4
    mova           [maskq], m4
    add            maskq, 32
    ret

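; w_mask_444 keeps the mask at full resolution: W_MASK already leaves the
; per-pixel masks in its second register, so .main just packs two batches
; to bytes and stores them as-is, with no downsampling or sign handling.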
cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
    lea            r7, [w_mask_444_avx2_table]
    tzcnt          wd, wm
    mov            r6d, r8m ; pixel_max
    movifnidn      hd, hm
    shr            r6d, 11
    movsxd         wq, [r7+wq*4]
    vpbroadcastd   m10, [base+pw_27615]
    vpbroadcastd   m4, [base+pw_64]
    vpbroadcastd   m5, [base+bidir_rnd+r6*4]
    vpbroadcastd   m6, [base+bidir_mul+r6*4]
    mov            maskq, maskmp
    add            wq, r7
    call           .main
    lea            stride3q, [strideq*3]
    jmp            wq
.w4:
    movq           [dstq+strideq*0], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti128   xm0, m0, 1
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q ], xm0
    cmp            hd, 8
    jl             .w4_end
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti128   xm1, m1, 1
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q ], xm1
    je             .w4_end
    call           .main
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm0
    movhps         [dstq+strideq*1], xm0
    vextracti128   xm0, m0, 1
    movq           [dstq+strideq*2], xm0
    movhps         [dstq+stride3q ], xm0
    lea            dstq, [dstq+strideq*4]
    movq           [dstq+strideq*0], xm1
    movhps         [dstq+strideq*1], xm1
    vextracti128   xm1, m1, 1
    movq           [dstq+strideq*2], xm1
    movhps         [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call           .main
    lea            dstq, [dstq+strideq*4]
.w8:
    mova           [dstq+strideq*0], xm0
    vextracti128   [dstq+strideq*1], m0, 1
    mova           [dstq+strideq*2], xm1
    vextracti128   [dstq+stride3q ], m1, 1
    sub            hd, 4
    jg             .w8_loop
.w8_end:
    RET
.w16_loop:
    call           .main
    lea            dstq, [dstq+strideq*2]
.w16:
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m1
    sub            hd, 2
    jg             .w16_loop
    RET
.w32_loop:
    call           .main
    add            dstq, strideq
.w32:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    dec            hd
    jg             .w32_loop
    RET
.w64_loop:
    call           .main
    add            dstq, strideq
.w64:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    call           .main
    mova           [dstq+32*2], m0
    mova           [dstq+32*3], m1
    dec            hd
    jg             .w64_loop
    RET
.w128_loop:
    call           .main
    add            dstq, strideq
.w128:
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    call           .main
    mova           [dstq+32*2], m0
    mova           [dstq+32*3], m1
    call           .main
    mova           [dstq+32*4], m0
    mova           [dstq+32*5], m1
    call           .main
    mova           [dstq+32*6], m0
    mova           [dstq+32*7], m1
    dec            hd
    jg             .w128_loop
    RET
ALIGN function_align
.main:
    W_MASK         0, 2, 4, 5, 6
    W_MASK         1, 3, 4, 5, 6
    packuswb       m2, m3
    vpermq         m2, m2, q3120
    add            tmp1q, 32*2
    add            tmp2q, 32*2
    mova           [maskq], m2
    add            maskq, 32
    ret

; (a * (64 - m) + b * m + 32) >> 6
; = (((b - a) * m + 32) >> 6) + a
; = (((b - a) * (m << 9) + 16384) >> 15) + a
; except m << 9 overflows int16_t when m == 64 (which is possible),
; but if we negate m it works out (-64 << 9 == -32768).
; = (((a - b) * (m * -512) + 16384) >> 15) + a
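; Sanity check of the negation trick with m == 64, a == 0, b == 100: the
; exact blend is (0*0 + 100*64 + 32) >> 6 == 100, and the pmulhrsw form
; gives (((0-100) * -32768 + 16384) >> 15) + 0 == 100 as well. A rough
; scalar model of one pixel (hypothetical helper, not dav1d's C code):
;
;   int blend_px(int a /* dst */, int b /* tmp */, int m /* 0..64 */) {
;       int16_t d = (int16_t)(a - b);       // psubw
;       int16_t w = (int16_t)(m * -512);    // pmullw with pw_m512
;       return a + ((d * w + 16384) >> 15); // pmulhrsw + paddw
;   }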
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
    lea            r6, [blend_avx2_table]
    tzcnt          wd, wm
    movifnidn      hd, hm
    movsxd         wq, [r6+wq*4]
    movifnidn      maskq, maskmp
    vpbroadcastd   m6, [base+pw_m512]
    add            wq, r6
    lea            r6, [dsq*3]
    jmp            wq
.w4:
    pmovzxbw       m3, [maskq]
    movq           xm0, [dstq+dsq*0]
    movhps         xm0, [dstq+dsq*1]
    vpbroadcastq   m1, [dstq+dsq*2]
    vpbroadcastq   m2, [dstq+r6 ]
    vpblendd       m0, m1, 0x30
    vpblendd       m0, m2, 0xc0
    psubw          m1, m0, [tmpq]
    add            maskq, 16
    add            tmpq, 32
    pmullw         m3, m6
    pmulhrsw       m1, m3
    paddw          m0, m1
    vextracti128   xm1, m0, 1
    movq           [dstq+dsq*0], xm0
    movhps         [dstq+dsq*1], xm0
    movq           [dstq+dsq*2], xm1
    movhps         [dstq+r6 ], xm1
    lea            dstq, [dstq+dsq*4]
    sub            hd, 4
    jg             .w4
    RET
.w8:
    pmovzxbw       m4, [maskq+16*0]
    pmovzxbw       m5, [maskq+16*1]
    mova           xm0, [dstq+dsq*0]
    vinserti128    m0, [dstq+dsq*1], 1
    mova           xm1, [dstq+dsq*2]
    vinserti128    m1, [dstq+r6 ], 1
    psubw          m2, m0, [tmpq+32*0]
    psubw          m3, m1, [tmpq+32*1]
    add            maskq, 16*2
    add            tmpq, 32*2
    pmullw         m4, m6
    pmullw         m5, m6
    pmulhrsw       m2, m4
    pmulhrsw       m3, m5
    paddw          m0, m2
    paddw          m1, m3
    mova           [dstq+dsq*0], xm0
    vextracti128   [dstq+dsq*1], m0, 1
    mova           [dstq+dsq*2], xm1
    vextracti128   [dstq+r6 ], m1, 1
    lea            dstq, [dstq+dsq*4]
    sub            hd, 4
    jg             .w8
    RET
.w16:
    pmovzxbw       m4, [maskq+16*0]
    pmovzxbw       m5, [maskq+16*1]
    mova           m0, [dstq+dsq*0]
    psubw          m2, m0, [tmpq+ 32*0]
    mova           m1, [dstq+dsq*1]
    psubw          m3, m1, [tmpq+ 32*1]
    add            maskq, 16*2
    add            tmpq, 32*2
    pmullw         m4, m6
    pmullw         m5, m6
    pmulhrsw       m2, m4
    pmulhrsw       m3, m5
    paddw          m0, m2
    paddw          m1, m3
    mova           [dstq+dsq*0], m0
    mova           [dstq+dsq*1], m1
    lea            dstq, [dstq+dsq*2]
    sub            hd, 2
    jg             .w16
    RET
.w32:
    pmovzxbw       m4, [maskq+16*0]
    pmovzxbw       m5, [maskq+16*1]
    mova           m0, [dstq+32*0]
    psubw          m2, m0, [tmpq+32*0]
    mova           m1, [dstq+32*1]
    psubw          m3, m1, [tmpq+32*1]
    add            maskq, 16*2
    add            tmpq, 32*2
    pmullw         m4, m6
    pmullw         m5, m6
    pmulhrsw       m2, m4
    pmulhrsw       m3, m5
    paddw          m0, m2
    paddw          m1, m3
    mova           [dstq+32*0], m0
    mova           [dstq+32*1], m1
    add            dstq, dsq
    dec            hd
    jg             .w32
    RET

INIT_XMM avx2
cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
%define base r5-blend_v_avx2_table
    lea            r5, [blend_v_avx2_table]
    tzcnt          wd, wm
    movifnidn      hd, hm
    movsxd         wq, [r5+wq*4]
    add            wq, r5
    jmp            wq
.w2:
    vpbroadcastd   m2, [base+obmc_masks_avx2+2*2]
.w2_loop:
    movd           m0, [dstq+dsq*0]
    pinsrd         m0, [dstq+dsq*1], 1
    movq           m1, [tmpq]
    add            tmpq, 4*2
    psubw          m1, m0, m1
    pmulhrsw       m1, m2
    paddw          m0, m1
    movd           [dstq+dsq*0], m0
    pextrd         [dstq+dsq*1], m0, 1
    lea            dstq, [dstq+dsq*2]
    sub            hd, 2
    jg             .w2_loop
    RET
.w4:
    vpbroadcastq   m2, [base+obmc_masks_avx2+4*2]
.w4_loop:
    movq           m0, [dstq+dsq*0]
    movhps         m0, [dstq+dsq*1]
    psubw          m1, m0, [tmpq]
    add            tmpq, 8*2
    pmulhrsw       m1, m2
    paddw          m0, m1
    movq           [dstq+dsq*0], m0
    movhps         [dstq+dsq*1], m0
    lea            dstq, [dstq+dsq*2]
    sub            hd, 2
    jg             .w4_loop
    RET
INIT_YMM avx2
.w8:
    vbroadcasti128 m2, [base+obmc_masks_avx2+8*2]
.w8_loop:
    mova           xm0, [dstq+dsq*0]
    vinserti128    m0, [dstq+dsq*1], 1
    psubw          m1, m0, [tmpq]
    add            tmpq, 16*2
    pmulhrsw       m1, m2
    paddw          m0, m1
    mova           [dstq+dsq*0], xm0
    vextracti128   [dstq+dsq*1], m0, 1
    lea            dstq, [dstq+dsq*2]
    sub            hd, 2
    jg             .w8_loop
    RET
.w16:
    mova           m4, [base+obmc_masks_avx2+16*2]
.w16_loop:
    mova           m0, [dstq+dsq*0]
    psubw          m2, m0, [tmpq+ 32*0]
    mova           m1, [dstq+dsq*1]
    psubw          m3, m1, [tmpq+ 32*1]
    add            tmpq, 32*2
    pmulhrsw       m2, m4
    pmulhrsw       m3, m4
    paddw          m0, m2
    paddw          m1, m3
    mova           [dstq+dsq*0], m0
    mova           [dstq+dsq*1], m1
    lea            dstq, [dstq+dsq*2]
    sub            hd, 2
    jg             .w16_loop
    RET
.w32:
%if WIN64
    movaps         [rsp+ 8], xmm6
    movaps         [rsp+24], xmm7
%endif
    mova           m6, [base+obmc_masks_avx2+32*2]
    vbroadcasti128 m7, [base+obmc_masks_avx2+32*3]
.w32_loop:
    mova           m0, [dstq+dsq*0+32*0]
    psubw          m3, m0, [tmpq +32*0]
    mova           xm2, [dstq+dsq*0+32*1]
    mova           xm5, [tmpq +32*1]
    mova           m1, [dstq+dsq*1+32*0]
    psubw          m4, m1, [tmpq +32*2]
    vinserti128    m2, [dstq+dsq*1+32*1], 1
    vinserti128    m5, [tmpq +32*3], 1
    add            tmpq, 32*4
    psubw          m5, m2, m5
    pmulhrsw       m3, m6
    pmulhrsw       m4, m6
    pmulhrsw       m5, m7
    paddw          m0, m3
    paddw          m1, m4
    paddw          m2, m5
    mova           [dstq+dsq*0+32*0], m0
    mova           [dstq+dsq*1+32*0], m1
    mova           [dstq+dsq*0+32*1], xm2
    vextracti128   [dstq+dsq*1+32*1], m2, 1
    lea            dstq, [dstq+dsq*2]
    sub            hd, 2
    jg             .w32_loop
%if WIN64
    movaps         xmm6, [rsp+ 8]
    movaps         xmm7, [rsp+24]
%endif
    RET

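; blend_h weights each row by an entry from obmc_masks_avx2 and only blends
; the first h*3/4 rows of the block (hence the h*3/4 computation below);
; hq runs as a negated row counter so [maskq+hq*2] walks the mask table
; upward until hq reaches zero.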
%macro BLEND_H_ROW 2-3 0 ; dst_off, tmp_off, inc_tmp
    mova           m0, [dstq+32*(%1+0)]
    psubw          m2, m0, [tmpq+32*(%2+0)]
    mova           m1, [dstq+32*(%1+1)]
    psubw          m3, m1, [tmpq+32*(%2+1)]
%if %3
    add            tmpq, 32*%3
%endif
    pmulhrsw       m2, m4
    pmulhrsw       m3, m4
    paddw          m0, m2
    paddw          m1, m3
    mova           [dstq+32*(%1+0)], m0
    mova           [dstq+32*(%1+1)], m1
%endmacro

INIT_XMM avx2
cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea            r5, [blend_h_avx2_table]
    tzcnt          wd, wm
    mov            hd, hm
    movsxd         wq, [r5+wq*4]
    add            wq, r5
    lea            maskq, [base+obmc_masks_avx2+hq*2]
    lea            hd, [hq*3]
    shr            hd, 2 ; h * 3/4
    lea            maskq, [maskq+hq*2]
    neg            hq
    jmp            wq
.w2:
    movd           m0, [dstq+dsq*0]
    pinsrd         m0, [dstq+dsq*1], 1
    movd           m2, [maskq+hq*2]
    movq           m1, [tmpq]
    add            tmpq, 4*2
    punpcklwd      m2, m2
    psubw          m1, m0, m1
    pmulhrsw       m1, m2
    paddw          m0, m1
    movd           [dstq+dsq*0], m0
    pextrd         [dstq+dsq*1], m0, 1
    lea            dstq, [dstq+dsq*2]
    add            hq, 2
    jl             .w2
    RET
.w4:
    mova           m3, [blend_shuf]
.w4_loop:
    movq           m0, [dstq+dsq*0]
    movhps         m0, [dstq+dsq*1]
    movd           m2, [maskq+hq*2]
    psubw          m1, m0, [tmpq]
    add            tmpq, 8*2
    pshufb         m2, m3
    pmulhrsw       m1, m2
    paddw          m0, m1
    movq           [dstq+dsq*0], m0
    movhps         [dstq+dsq*1], m0
    lea            dstq, [dstq+dsq*2]
    add            hq, 2
    jl             .w4_loop
    RET
INIT_YMM avx2
.w8:
    vbroadcasti128 m3, [blend_shuf]
    shufpd         m3, m3, 0x0c
.w8_loop:
    mova           xm0, [dstq+dsq*0]
    vinserti128    m0, [dstq+dsq*1], 1
    vpbroadcastd   m2, [maskq+hq*2]
    psubw          m1, m0, [tmpq]
    add            tmpq, 16*2
    pshufb         m2, m3
    pmulhrsw       m1, m2
    paddw          m0, m1
    mova           [dstq+dsq*0], xm0
    vextracti128   [dstq+dsq*1], m0, 1
    lea            dstq, [dstq+dsq*2]
    add            hq, 2
    jl             .w8_loop
    RET
.w16:
    vpbroadcastw   m4, [maskq+hq*2]
    vpbroadcastw   m5, [maskq+hq*2+2]
    mova           m0, [dstq+dsq*0]
    psubw          m2, m0, [tmpq+ 32*0]
    mova           m1, [dstq+dsq*1]
    psubw          m3, m1, [tmpq+ 32*1]
    add            tmpq, 32*2
    pmulhrsw       m2, m4
    pmulhrsw       m3, m5
    paddw          m0, m2
    paddw          m1, m3
    mova           [dstq+dsq*0], m0
    mova           [dstq+dsq*1], m1
    lea            dstq, [dstq+dsq*2]
    add            hq, 2
    jl             .w16
    RET
.w32:
    vpbroadcastw   m4, [maskq+hq*2]
    BLEND_H_ROW    0, 0, 2
    add            dstq, dsq
    inc            hq
    jl             .w32
    RET
.w64:
    vpbroadcastw   m4, [maskq+hq*2]
    BLEND_H_ROW    0, 0
    BLEND_H_ROW    2, 2, 4
    add            dstq, dsq
    inc            hq
    jl             .w64
    RET
.w128:
    vpbroadcastw   m4, [maskq+hq*2]
    BLEND_H_ROW    0, 0
    BLEND_H_ROW    2, 2, 8
    BLEND_H_ROW    4, -4
    BLEND_H_ROW    6, -2
    add            dstq, dsq
    inc            hq
    jl             .w128
    RET

cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                                   bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor            r12d, r12d
    lea            r10, [ihq-1]
    cmp            yq, ihq
    cmovs          r10, yq
    test           yq, yq
    cmovs          r10, r12
    imul           r10, sstrideq
    add            srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea            r10, [iwq-1]
    cmp            xq, iwq
    cmovs          r10, xq
    test           xq, xq
    cmovs          r10, r12
    lea            srcq, [srcq+r10*2]

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea            bottomextq, [yq+bhq]
    sub            bottomextq, ihq
    lea            r3, [bhq-1]
    cmovs          bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg            topextq
    cmovs          topextq, r12
    cmp            bottomextq, bhq
    cmovns         bottomextq, r3
    cmp            topextq, bhq
    cmovg          topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea            rightextq, [xq+bwq]
    sub            rightextq, iwq
    lea            r2, [bwq-1]
    cmovs          rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg            leftextq
    cmovs          leftextq, r12
    cmp            rightextq, bwq
    cmovns         rightextq, r2
    cmp            leftextq, bwq
    cmovns         leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea            r3, [bottomextq+topextq]
    sub            centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov            r2, topextq
    imul           r2, dstrideq
    add            dstq, r2
    mov            r9m, dstq

    ; center_w = bw - left_ext - right_ext
    mov            centerwq, bwq
    lea            r3, [rightextq+leftextq]
    sub            centerwq, r3

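; v_loop expands one specialization of the per-row copy: optionally
; broadcast the first source pixel across the left extension, bulk-copy
; center_w pixels, then optionally broadcast the last pixel across the
; right extension. A rough scalar model of one row (descriptive names,
; not dav1d's actual C code):
;
;   for (x = 0; x < left_ext; x++)  dst[x] = src[0];
;   for (x = 0; x < center_w; x++)  dst[left_ext + x] = src[x];
;   for (x = 0; x < right_ext; x++) dst[left_ext + center_w + x] = src[center_w - 1];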
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    xor            r3, r3
    vpbroadcastw   m0, [srcq]
.left_loop_%3:
    mova           [dstq+r3*2], m0
    add            r3, 16
    cmp            r3, leftextq
    jl             .left_loop_%3

    ; body
    lea            r12, [dstq+leftextq*2]
%endif
    xor            r3, r3
.body_loop_%3:
    movu           m0, [srcq+r3*2]
%if %1
    movu           [r12+r3*2], m0
%else
    movu           [dstq+r3*2], m0
%endif
    add            r3, 16
    cmp            r3, centerwq
    jl             .body_loop_%3

%if %2
    ; right extension
%if %1
    lea            r12, [r12+centerwq*2]
%else
    lea            r12, [dstq+centerwq*2]
%endif
    xor            r3, r3
    vpbroadcastw   m0, [srcq+centerwq*2-2]
.right_loop_%3:
    movu           [r12+r3*2], m0
    add            r3, 16
    cmp            r3, rightextq
    jl             .right_loop_%3

%endif
    add            dstq, dstrideq
    add            srcq, sstrideq
    dec            centerhq
    jg             .v_loop_%3
%endmacro

    test           leftextq, leftextq
    jnz            .need_left_ext
    test           rightextq, rightextq
    jnz            .need_right_ext
    v_loop         0, 0, 0
    jmp            .body_done

.need_left_ext:
    test           rightextq, rightextq
    jnz            .need_left_right_ext
    v_loop         1, 0, 1
    jmp            .body_done

.need_left_right_ext:
    v_loop         1, 1, 2
    jmp            .body_done

.need_right_ext:
    v_loop         0, 1, 3

.body_done:
    ; bottom edge extension
    test           bottomextq, bottomextq
    jz             .top
    mov            srcq, dstq
    sub            srcq, dstrideq
    xor            r1, r1
.bottom_x_loop:
    mova           m0, [srcq+r1*2]
    lea            r3, [dstq+r1*2]
    mov            r4, bottomextq
.bottom_y_loop:
    mova           [r3], m0
    add            r3, dstrideq
    dec            r4
    jg             .bottom_y_loop
    add            r1, 16
    cmp            r1, bwq
    jl             .bottom_x_loop

.top:
    ; top edge extension
    test           topextq, topextq
    jz             .end
    mov            srcq, r9m
    mov            dstq, dstm
    xor            r1, r1
.top_x_loop:
    mova           m0, [srcq+r1*2]
    lea            r3, [dstq+r1*2]
    mov            r4, topextq
.top_y_loop:
    mova           [r3], m0
    add            r3, dstrideq
    dec            r4
    jg             .top_y_loop
    add            r1, 16
    cmp            r1, bwq
    jl             .top_x_loop

.end:
    RET

cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub            dword mx0m, 4<<14
    sub            dword src_wm, 8
    vpbroadcastd   m5, dxm
    vpbroadcastd   m8, mx0m
    vpbroadcastd   m6, src_wm
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA            r7, $$
%define base r7-$$
    vpbroadcastd   m3, [base+pd_64]
    vpbroadcastw   xm7, pxmaxm
    pmaddwd        m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld          m5, 3  ; dx*8
    pslld          m6, 14
    paddd          m8, m2 ; mx+[0..7]*dx
.loop_y:
    xor            xd, xd
    mova           m4, m8 ; per-line working version of mx
.loop_x:
    vpbroadcastd   m10, [base+pd_63]
    pxor           m2, m2
    pmaxsd         m0, m4, m2
    psrad          m9, m4, 8  ; filter offset (unmasked)
    pminsd         m0, m6     ; iclip(mx, 0, src_w-8)
    psubd          m1, m4, m0 ; pshufb offset
    psrad          m0, 14     ; clipped src_x offset
    psrad          m1, 14     ; pshufb edge_emu offset
    pand           m9, m10    ; filter offset (masked)
    ; load source pixels
    movd           r8d, xm0
    pextrd         r9d, xm0, 1
    pextrd         r10d, xm0, 2
    pextrd         r11d, xm0, 3
    vextracti128   xm0, m0, 1
    movu           xm10, [srcq+r8*2]
    movu           xm11, [srcq+r9*2]
    movu           xm12, [srcq+r10*2]
    movu           xm13, [srcq+r11*2]
    movd           r8d, xm0
    pextrd         r9d, xm0, 1
    pextrd         r10d, xm0, 2
    pextrd         r11d, xm0, 3
    vinserti128    m10, [srcq+r8*2], 1
    vinserti128    m11, [srcq+r9*2], 1
    vinserti128    m12, [srcq+r10*2], 1
    vinserti128    m13, [srcq+r11*2], 1
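    ; If any lane's mx had to be clamped, m1 is nonzero and holds per-lane
    ; pshufb offsets: the loads below fetch re-alignment masks from
    ; resize_shuf, which replicate the edge pixels of the 8-pixel window;
    ; otherwise jump straight to the filter stage.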
    ptest          m1, m1
    jz             .filter
    movq           r9, xm1
    pextrq         r11, xm1, 1
    movsxd         r8, r9d
    sar            r9, 32
    movsxd         r10, r11d
    sar            r11, 32
    vextracti128   xm1, m1, 1
    movu           xm14, [base+resize_shuf+8+r8*2]
    movu           xm15, [base+resize_shuf+8+r9*2]
    movu           xm0, [base+resize_shuf+8+r10*2]
    movu           xm2, [base+resize_shuf+8+r11*2]
    movq           r9, xm1
    pextrq         r11, xm1, 1
    movsxd         r8, r9d
    sar            r9, 32
    movsxd         r10, r11d
    sar            r11, 32
    vinserti128    m14, [base+resize_shuf+8+r8*2], 1
    vinserti128    m15, [base+resize_shuf+8+r9*2], 1
    vinserti128    m0, [base+resize_shuf+8+r10*2], 1
    vinserti128    m2, [base+resize_shuf+8+r11*2], 1
    pshufb         m10, m14
    pshufb         m11, m15
    pshufb         m12, m0
    pshufb         m13, m2
.filter:
    movd           r8d, xm9
    pextrd         r9d, xm9, 1
    pextrd         r10d, xm9, 2
    pextrd         r11d, xm9, 3
    vextracti128   xm9, m9, 1
    movq           xm14, [base+resize_filter+r8*8]
    movq           xm15, [base+resize_filter+r9*8]
    movq           xm0, [base+resize_filter+r10*8]
    movq           xm2, [base+resize_filter+r11*8]
    movd           r8d, xm9
    pextrd         r9d, xm9, 1
    pextrd         r10d, xm9, 2
    pextrd         r11d, xm9, 3
    movhps         xm14, [base+resize_filter+r8*8]
    movhps         xm15, [base+resize_filter+r9*8]
    movhps         xm0, [base+resize_filter+r10*8]
    movhps         xm2, [base+resize_filter+r11*8]
    pmovsxbw       m14, xm14
    pmovsxbw       m15, xm15
    pmovsxbw       m0, xm0
    pmovsxbw       m2, xm2
    pmaddwd        m10, m14
    pmaddwd        m11, m15
    pmaddwd        m12, m0
    pmaddwd        m13, m2
    phaddd         m10, m11
    phaddd         m12, m13
    phaddd         m10, m12
    psubd          m10, m3, m10
    psrad          m10, 7
    vextracti128   xm0, m10, 1
    packusdw       xm10, xm0
    pminsw         xm10, xm7
    mova           [dstq+xq*2], xm10
    paddd          m4, m5
    add            xd, 8
    cmp            xd, dst_wd
    jl             .loop_x
    add            dstq, dst_strideq
    add            srcq, src_strideq
    dec            hd
    jg             .loop_y
    RET

%endif ; ARCH_X86_64