; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; dav1d_obmc_masks[] << 9
obmc_masks: dw     0,     0,  9728,     0, 12800,  7168,  2560,     0
            dw 14336, 11264,  8192,  5632,  3584,  1536,     0,     0
            dw 15360, 13824, 12288, 10752,  9216,  7680,  6144,  5120
            dw  4096,  3072,  2048,  1536,     0,     0,     0,     0
            dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
            dw  9728,  8704,  8192,  7168,  6656,  6144,  5632,  4608
            dw  4096,  3584,  3072,  2560,  2048,  2048,  1536,  1024

blend_shuf:   db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
unpckw:       db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul:  dd 0, 1, 2, 3
resize_shuf:  db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
              db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

pw_2:          times 8 dw 2
pw_16:         times 4 dw 16
prep_mul:      times 4 dw 16
               times 8 dw 4
pw_64:         times 8 dw 64
pw_256:        times 8 dw 256
pw_2048:       times 4 dw 2048
bidir_mul:     times 4 dw 2048
pw_8192:       times 8 dw 8192
pw_27615:      times 8 dw 27615
pw_32766:      times 8 dw 32766
pw_m512:       times 8 dw -512
pd_63:         times 4 dd 63
pd_64:         times 4 dd 64
pd_512:        times 4 dd 512
pd_2560:       times 2 dd 2560
pd_8704:       times 2 dd 8704
pd_m524256:    times 4 dd -524256 ; -8192 << 6 + 32
pd_0x3ff:      times 4 dd 0x3ff
pd_0x4000:     times 4 dd 0x4000
pq_0x400000:   times 2 dq 0x400000
pq_0x40000000: times 2 dq 0x40000000
pd_65538:      times 2 dd 65538

put_bilin_h_rnd:  times 4 dw 8
                  times 4 dw 10
s_8tap_h_rnd:     times 2 dd 2
                  times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
                  times 2 dd 128
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_sh:  dd 10, 8
bidir_rnd:        times 4 dw -16400
                  times 4 dw -16388
put_8tap_h_rnd:   dd 34, 34, 40, 40
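; Note: the prep_8tap rounding constants below fold an intermediate bias of
; -(8192 << shift) into the rounding term, so after the final right shift the
; packed intermediate output is offset by -8192 and stays within signed 16-bit
; range; the bidir constants above (bidir_rnd/bidir_mul, pd_m524256) appear to
; be the matching terms that undo this bias in the compound averaging paths.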
prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)

warp8x8_shift: dd 11, 13
warp8x8_rnd1:  dd 1024, 1024, 4096, 4096
warp8x8_rnd2:  times 4 dw 4096
               times 4 dw 16384
warp8x8t_rnd:  times 2 dd 16384 - (8192 << 15)

%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 32, 64, 128

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,    4, 8, 16, 32, 64, 128

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

INIT_XMM ssse3
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3
    mov mxyd, r6m ; mx
    LEA t0, put_ssse3
    movifnidn wd, wm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r7m ; my
    test mxyd, mxyd
    jnz .v
.put:
    tzcnt wd, wd
    movzx wd, word [base+put_ssse3_table+wq*2]
    add wq, t0
    movifnidn hd, hm
    jmp wq
.put_w2:
    mov r4d, [srcq+ssq*0]
    mov r6d, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r4d
    mov [dstq+dsq*1], r6d
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w2
    RET
.put_w4:
    movq m0, [srcq+ssq*0]
    movq m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movq [dstq+dsq*0], m0
    movq [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w4
    RET
.put_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w8
    RET
.put_w16:
    movu m0, [srcq+ssq*0+16*0]
    movu m1, [srcq+ssq*0+16*1]
    movu m2, [srcq+ssq*1+16*0]
    movu m3, [srcq+ssq*1+16*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+16*0], m0
    mova [dstq+dsq*0+16*1], m1
    mova [dstq+dsq*1+16*0], m2
    mova [dstq+dsq*1+16*1], m3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w16
    RET
.put_w32:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    add srcq, ssq
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m3
    add dstq, dsq
    dec hd
    jg .put_w32
    RET
.put_w64:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m3
    movu m0, [srcq+16*4]
    movu m1, [srcq+16*5]
    movu m2, [srcq+16*6]
    movu m3, [srcq+16*7]
    add srcq, ssq
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    mova [dstq+16*6], m2
    mova [dstq+16*7], m3
    add dstq, dsq
    dec hd
    jg .put_w64
    RET
.put_w128:
    add srcq, 16*8
    add dstq, 16*8
.put_w128_loop:
    movu m0, [srcq-16*8]
    movu m1, [srcq-16*7]
    movu m2, [srcq-16*6]
    movu m3, [srcq-16*5]
    mova [dstq-16*8], m0
    mova [dstq-16*7], m1
    mova [dstq-16*6], m2
    mova [dstq-16*5], m3
    movu m0, [srcq-16*4]
    movu m1, [srcq-16*3]
    movu m2, [srcq-16*2]
    movu m3, [srcq-16*1]
    mova [dstq-16*4], m0
    mova [dstq-16*3], m1
    mova [dstq-16*2], m2
    mova [dstq-16*1], m3
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m3
    movu m0, [srcq+16*4]
    movu m1, [srcq+16*5]
    movu m2, [srcq+16*6]
    movu m3, [srcq+16*7]
    add srcq, ssq
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    mova [dstq+16*6], m2
    mova [dstq+16*7], m3
    add dstq, dsq
    dec hd
    jg .put_w128_loop
    RET
.h:
    movd m5, mxyd
    mov mxyd, r7m ; my
    mova m4, [base+pw_16]
    pshufb m5, [base+pw_256]
    psubw m4, m5
    test mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    mov r6d, r8m ; bitdepth_max
    shr r6d, 11
    movddup m3, [base+put_bilin_h_rnd+r6*8]
    movifnidn hd, hm
    sub wd, 8
    jg .h_w16
    je .h_w8
    cmp wd, -4
    je .h_w4
.h_w2:
    movq m1, [srcq+ssq*0]
    movhps m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmullw m0, m4, m1
    psrlq m1, 16
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    psrlw m0, 4
    movd [dstq+dsq*0], m0
    punpckhqdq m0, m0
    movd [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2
    RET
.h_w4:
    movq m0, [srcq+ssq*0]
    movhps m0, [srcq+ssq*1]
    movq m1, [srcq+ssq*0+2]
    movhps m1, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    psrlw m0, 4
    movq [dstq+dsq*0], m0
    movhps [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4
    RET
.h_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*0+2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    movu m1, [srcq+ssq*1]
    movu m2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    pmullw m1, m4
    pmullw m2, m5
    paddw m1, m3
    paddw m1, m2
    psrlw m0, 4
    psrlw m1, 4
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    neg wq
.h_w16_loop0:
    mov r6, wq
.h_w16_loop:
    movu m0, [srcq+r6*2+ 0]
    movu m1, [srcq+r6*2+ 2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    movu m1, [srcq+r6*2+16]
    movu m2, [srcq+r6*2+18]
    pmullw m1, m4
    pmullw m2, m5
    paddw m1, m3
    paddw m1, m2
    psrlw m0, 4
    psrlw m1, 4
    mova [dstq+r6*2+16*0], m0
    mova [dstq+r6*2+16*1], m1
    add r6, 16
    jl .h_w16_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w16_loop0
    RET
.v:
    shl mxyd, 11
    movd m5, mxyd
    pshufb m5, [base+pw_256]
    movifnidn hd, hm
    cmp wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    movd m0, [srcq+ssq*0]
.v_w2_loop:
    movd m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklqdq m2, m0, m1
    movd m0, [srcq+ssq*0]
    punpcklqdq m1, m0
    psubw m1, m2
    pmulhrsw m1, m5
    paddw m1, m2
    movd [dstq+dsq*0], m1
    punpckhqdq m1, m1
    movd [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq m0, [srcq+ssq*0]
.v_w4_loop:
    movq m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklqdq m2, m0, m1
    movq m0, [srcq+ssq*0]
    punpcklqdq m1, m0
    psubw m1, m2
    pmulhrsw m1, m5
    paddw m1, m2
    movq [dstq+dsq*0], m1
    movhps [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
%if ARCH_X86_64
%if WIN64
    push r7
%endif
    shl wd, 5
    mov r7, srcq
    lea r6d, [wq+hq-256]
    mov r4, dstq
%else
    mov r6, srcq
%endif
.v_w8_loop0:
    movu m0, [srcq+ssq*0]
.v_w8_loop:
    movu m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    psubw m1, m3, m0
    pmulhrsw m1, m5
    paddw m1, m0
    movu m0, [srcq+ssq*0]
    psubw m2, m0, m3
    pmulhrsw m2, m5
    paddw m2, m3
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
%if ARCH_X86_64
    add r7, 16
    add r4, 16
    movzx hd, r6b
    mov srcq, r7
    mov dstq, r4
    sub r6d, 1<<8
%else
    mov dstq, dstmp
    add r6, 16
    mov hd, hm
    add dstq, 16
    mov srcq, r6
    mov dstmp, dstq
    sub wd, 8
%endif
    jg .v_w8_loop0
%if WIN64
    pop r7
%endif
    RET
.hv:
    WIN64_SPILL_XMM 8
    shl mxyd, 11
    mova m3, [base+pw_2]
    movd m6, mxyd
    mova m7, [base+pw_8192]
    pshufb m6, [base+pw_256]
    test dword r8m, 0x800
    jnz .hv_12bpc
    psllw m4, 2
    psllw m5, 2
    mova m7, [base+pw_2048]
.hv_12bpc:
    movifnidn hd, hm
    cmp wd, 4
    jg .hv_w8
    je .hv_w4
.hv_w2:
    movddup m0, [srcq+ssq*0]
    pshufhw m1, m0, q0321
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    psrlw m0, 2
.hv_w2_loop:
    movq m2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps m2, [srcq+ssq*0]
    pmullw m1, m4, m2
    psrlq m2, 16
    pmullw m2, m5
    paddw m1, m3
    paddw m1, m2
    psrlw m1, 2             ; 1 _ 2 _
    shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
    mova m0, m1
    psubw m1, m2
    paddw m1, m1
    pmulhw m1, m6
    paddw m1, m2
    pmulhrsw m1, m7
    movd [dstq+dsq*0], m1
    punpckhqdq m1, m1
    movd [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    movddup m0, [srcq+ssq*0]
    movddup m1, [srcq+ssq*0+2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    psrlw m0, 2
.hv_w4_loop:
    movq m1, [srcq+ssq*1]
    movq m2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    movhps m1, [srcq+ssq*0]
    movhps m2, [srcq+ssq*0+2]
    pmullw m1, m4
    pmullw m2, m5
    paddw m1, m3
    paddw m1, m2
    psrlw m1, 2             ; 1 2
    shufpd m2, m0, m1, 0x01 ; 0 1
    mova m0, m1
    psubw m1, m2
    paddw m1, m1
    pmulhw m1, m6
    paddw m1, m2
    pmulhrsw m1, m7
    movq [dstq+dsq*0], m1
    movhps [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
%if ARCH_X86_64
%if WIN64
    push r7
%endif
    shl wd, 5
    lea r6d, [wq+hq-256]
    mov r4, srcq
    mov r7, dstq
%else
    mov r6, srcq
%endif
.hv_w8_loop0:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*0+2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m3
    paddw m0, m1
    psrlw m0, 2
.hv_w8_loop:
    movu m1, [srcq+ssq*1]
    movu m2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    pmullw m1, m4
    pmullw m2, m5
    paddw m1, m3
    paddw m1, m2
    psrlw m1, 2
    psubw m2, m1, m0
    paddw m2, m2
    pmulhw m2, m6
    paddw m2, m0
    pmulhrsw m2, m7
    mova [dstq+dsq*0], m2
    movu m0, [srcq+ssq*0]
    movu m2, [srcq+ssq*0+2]
    pmullw m0, m4
    pmullw m2, m5
    paddw m0, m3
    paddw m0, m2
    psrlw m0, 2
    psubw m2, m0, m1
    paddw m2, m2
    pmulhw m2, m6
    paddw m2, m1
    pmulhrsw m2, m7
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
%if ARCH_X86_64
    add r4, 16
    add r7, 16
    movzx hd, r6b
    mov srcq, r4
    mov dstq, r7
    sub r6d, 1<<8
%else
    mov dstq, dstmp
    add r6, 16
    mov hd, hm
    add dstq, 16
    mov srcq, r6
    mov dstmp, dstq
    sub wd, 8
%endif
    jg .hv_w8_loop0
%if WIN64
    pop r7
%endif
    RET

cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
%define base r6-prep_ssse3
    movifnidn mxyd, r5m ; mx
    LEA r6, prep_ssse3
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r6m ; my
    test mxyd, mxyd
    jnz .v
.prep:
    tzcnt wd, wd
    movzx wd, word [base+prep_ssse3_table+wq*2]
    mov r5d, r7m ; bitdepth_max
    mova m5, [base+pw_8192]
    add wq, r6
    shr r5d, 11
    movddup m4, [base+prep_mul+r5*8]
    lea stride3q, [strideq*3]
    jmp wq
.prep_w4:
    movq m0, [srcq+strideq*0]
    movhps m0, [srcq+strideq*1]
    movq m1, [srcq+strideq*2]
    movhps m1, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    pmullw m0, m4
    pmullw m1, m4
    psubw m0, m5
    psubw m1, m5
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    add tmpq, 16*2
    sub hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movu m0, [srcq+strideq*0]
    movu m1, [srcq+strideq*1]
    movu m2, [srcq+strideq*2]
    movu m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu m0, [srcq+strideq*0+16*0]
    movu m1, [srcq+strideq*0+16*1]
    movu m2, [srcq+strideq*1+16*0]
    movu m3, [srcq+strideq*1+16*1]
    lea srcq, [srcq+strideq*2]
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    sub hd, 2
    jg .prep_w16
    RET
.prep_w32:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    add srcq, strideq
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    add tmpq, 16*4
    dec hd
    jg .prep_w32
    RET
.prep_w64:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    movu m0, [srcq+16*4]
    movu m1, [srcq+16*5]
    movu m2, [srcq+16*6]
    movu m3, [srcq+16*7]
    add srcq, strideq
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*4], m0
    mova [tmpq+16*5], m1
    mova [tmpq+16*6], m2
    mova [tmpq+16*7], m3
    add tmpq, 16*8
    dec hd
    jg .prep_w64
    RET
.prep_w128:
    movu m0, [srcq+16* 0]
    movu m1, [srcq+16* 1]
    movu m2, [srcq+16* 2]
    movu m3, [srcq+16* 3]
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    mova [tmpq+16*2], m2
    mova [tmpq+16*3], m3
    movu m0, [srcq+16* 4]
    movu m1, [srcq+16* 5]
    movu m2, [srcq+16* 6]
    movu m3, [srcq+16* 7]
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq+16*4], m0
    mova [tmpq+16*5], m1
    mova [tmpq+16*6], m2
    mova [tmpq+16*7], m3
    movu m0, [srcq+16* 8]
    movu m1, [srcq+16* 9]
    movu m2, [srcq+16*10]
    movu m3, [srcq+16*11]
    add tmpq, 16*16
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq-16*8], m0
    mova [tmpq-16*7], m1
    mova [tmpq-16*6], m2
    mova [tmpq-16*5], m3
    movu m0, [srcq+16*12]
    movu m1, [srcq+16*13]
    movu m2, [srcq+16*14]
    movu m3, [srcq+16*15]
    add srcq, strideq
    REPX {pmullw x, m4}, m0, m1, m2, m3
    REPX {psubw  x, m5}, m0, m1, m2, m3
    mova [tmpq-16*4], m0
    mova [tmpq-16*3], m1
    mova [tmpq-16*2], m2
    mova [tmpq-16*1], m3
    dec hd
    jg .prep_w128
    RET
.h:
    movd m4, mxyd
    mov mxyd, r6m ; my
    mova m3, [base+pw_16]
    pshufb m4, [base+pw_256]
    mova m5, [base+pw_32766]
    psubw m3, m4
    test dword r7m, 0x800
    jnz .h_12bpc
    psllw m3, 2
    psllw m4, 2
.h_12bpc:
    test mxyd, mxyd
    jnz .hv
    sub wd, 8
    je .h_w8
    jg .h_w16
.h_w4:
    movq m0, [srcq+strideq*0]
    movhps m0, [srcq+strideq*1]
    movq m1, [srcq+strideq*0+2]
    movhps m1, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    pmullw m0, m3
    pmullw m1, m4
    psubw m0, m5
    paddw m0, m1
    psraw m0, 2
    mova [tmpq], m0
    add tmpq, 16
    sub hd, 2
    jg .h_w4
    RET
.h_w8:
    movu m0, [srcq+strideq*0]
    movu m1, [srcq+strideq*0+2]
    pmullw m0, m3
    pmullw m1, m4
    psubw m0, m5
    paddw m0, m1
    movu m1, [srcq+strideq*1]
    movu m2, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    pmullw m1, m3
    pmullw m2, m4
    psubw m1, m5
    paddw m1, m2
    psraw m0, 2
    psraw m1, 2
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    add tmpq, 16*2
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    lea srcq, [srcq+wq*2]
    neg wq
.h_w16_loop0:
    mov r6, wq
.h_w16_loop:
    movu m0, [srcq+r6*2+ 0]
    movu m1, [srcq+r6*2+ 2]
    pmullw m0, m3
    pmullw m1, m4
    psubw m0, m5
    paddw m0, m1
    movu m1, [srcq+r6*2+16]
    movu m2, [srcq+r6*2+18]
    pmullw m1, m3
    pmullw m2, m4
    psubw m1, m5
    paddw m1, m2
    psraw m0, 2
    psraw m1, 2
    mova [tmpq+16*0], m0
    mova [tmpq+16*1], m1
    add tmpq, 16*2
    add r6, 16
    jl .h_w16_loop
    add srcq, strideq
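    ; tmpq is written contiguously and was already advanced by the inner
    ; loop, so only the source pointer needs a per-row stride step here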
    dec hd
    jg .h_w16_loop0
    RET
.v:
    movd m4, mxyd
    mova m3, [base+pw_16]
    pshufb m4, [base+pw_256]
    mova m5, [base+pw_32766]
    psubw m3, m4
    test dword r7m, 0x800
    jnz .v_12bpc
    psllw m3, 2
    psllw m4, 2
.v_12bpc:
    cmp wd, 8
    je .v_w8
    jg .v_w16
.v_w4:
    movq m0, [srcq+strideq*0]
.v_w4_loop:
    movq m2, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    punpcklqdq m1, m0, m2 ; 0 1
    movq m0, [srcq+strideq*0]
    punpcklqdq m2, m0     ; 1 2
    pmullw m1, m3
    pmullw m2, m4
    psubw m1, m5
    paddw m1, m2
    psraw m1, 2
    mova [tmpq], m1
    add tmpq, 16
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu m0, [srcq+strideq*0]
.v_w8_loop:
    movu m2, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    pmullw m0, m3
    pmullw m1, m4, m2
    psubw m0, m5
    paddw m1, m0
    movu m0, [srcq+strideq*0]
    psraw m1, 2
    pmullw m2, m3
    mova [tmpq+16*0], m1
    pmullw m1, m4, m0
    psubw m2, m5
    paddw m1, m2
    psraw m1, 2
    mova [tmpq+16*1], m1
    add tmpq, 16*2
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
%if WIN64
    push r7
%endif
    mov r5, srcq
%if ARCH_X86_64
    lea r6d, [wq*4-32]
    mov wd, wd
    lea r6d, [hq+r6*8]
    mov r7, tmpq
%else
    mov r6d, wd
%endif
.v_w16_loop0:
    movu m0, [srcq+strideq*0]
.v_w16_loop:
    movu m2, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    pmullw m0, m3
    pmullw m1, m4, m2
    psubw m0, m5
    paddw m1, m0
    movu m0, [srcq+strideq*0]
    psraw m1, 2
    pmullw m2, m3
    mova [tmpq+wq*0], m1
    pmullw m1, m4, m0
    psubw m2, m5
    paddw m1, m2
    psraw m1, 2
    mova [tmpq+wq*2], m1
    lea tmpq, [tmpq+wq*4]
    sub hd, 2
    jg .v_w16_loop
%if ARCH_X86_64
    add r5, 16
    add r7, 16
    movzx hd, r6b
    mov srcq, r5
    mov tmpq, r7
    sub r6d, 1<<8
%else
    mov tmpq, tmpmp
    add r5, 16
    mov hd, hm
    add tmpq, 16
    mov srcq, r5
    mov tmpmp, tmpq
    sub r6d, 8
%endif
    jg .v_w16_loop0
%if WIN64
    pop r7
%endif
    RET
.hv:
    WIN64_SPILL_XMM 7
    shl mxyd, 11
    movd m6, mxyd
    pshufb m6, [base+pw_256]
    cmp wd, 8
    je .hv_w8
    jg .hv_w16
.hv_w4:
    movddup m0, [srcq+strideq*0]
    movddup m1, [srcq+strideq*0+2]
    pmullw m0, m3
    pmullw m1, m4
    psubw m0, m5
    paddw m0, m1
    psraw m0, 2
.hv_w4_loop:
    movq m1, [srcq+strideq*1]
    movq m2, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    movhps m1, [srcq+strideq*0]
    movhps m2, [srcq+strideq*0+2]
    pmullw m1, m3
    pmullw m2, m4
    psubw m1, m5
    paddw m1, m2
    psraw m1, 2             ; 1 2
    shufpd m2, m0, m1, 0x01 ; 0 1
    mova m0, m1
    psubw m1, m2
    pmulhrsw m1, m6
    paddw m1, m2
    mova [tmpq], m1
    add tmpq, 16
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    movu m0, [srcq+strideq*0]
    movu m1, [srcq+strideq*0+2]
    pmullw m0, m3
    pmullw m1, m4
    psubw m0, m5
    paddw m0, m1
    psraw m0, 2
.hv_w8_loop:
    movu m1, [srcq+strideq*1]
    movu m2, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    pmullw m1, m3
    pmullw m2, m4
    psubw m1, m5
    paddw m1, m2
    psraw m1, 2
    psubw m2, m1, m0
    pmulhrsw m2, m6
    paddw m2, m0
    mova [tmpq+16*0], m2
    movu m0, [srcq+strideq*0]
    movu m2, [srcq+strideq*0+2]
    pmullw m0, m3
    pmullw m2, m4
    psubw m0, m5
    paddw m0, m2
    psraw m0, 2
    psubw m2, m0, m1
    pmulhrsw m2, m6
    paddw m2, m1
    mova [tmpq+16*1], m2
    add tmpq, 16*2
    sub hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
%if WIN64
    push r7
%endif
    mov r5, srcq
%if ARCH_X86_64
    lea r6d, [wq*4-32]
    mov wd, wd
    lea r6d, [hq+r6*8]
    mov r7, tmpq
%else
    mov r6d, wd
%endif
.hv_w16_loop0:
    movu m0, [srcq+strideq*0]
    movu m1, [srcq+strideq*0+2]
    pmullw m0, m3
    pmullw m1, m4
    psubw m0, m5
    paddw m0, m1
    psraw m0, 2
.hv_w16_loop:
    movu m1, [srcq+strideq*1]
    movu m2, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    pmullw m1, m3
    pmullw m2, m4
    psubw m1, m5
    paddw m1, m2
    psraw m1, 2
    psubw m2, m1, m0
    pmulhrsw m2, m6
    paddw m2, m0
    mova [tmpq+wq*0], m2
    movu m0, [srcq+strideq*0]
    movu m2, [srcq+strideq*0+2]
    pmullw m0, m3
    pmullw m2, m4
    psubw m0, m5
    paddw m0, m2
    psraw m0, 2
    psubw m2, m0, m1
    pmulhrsw m2, m6
    paddw m2, m1
    mova [tmpq+wq*2], m2
    lea tmpq, [tmpq+wq*4]
    sub hd, 2
    jg .hv_w16_loop
%if ARCH_X86_64
    add r5, 16
    add r7, 16
    movzx hd, r6b
    mov srcq, r5
    mov tmpq, r7
    sub r6d, 1<<8
%else
    mov tmpq, tmpmp
    add r5, 16
    mov hd, hm
    add tmpq, 16
    mov srcq, r5
    mov tmpmp, tmpq
    sub r6d, 8
%endif
    jg .hv_w16_loop0
%if WIN64
    pop r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

cglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base t2-put_ssse3
%if ARCH_X86_32
    %define mxb r0b
    %define mxd r0
    %define mxq r0
    %define myb r1b
    %define myd r1
    %define myq r1
%endif
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    LEA t2, put_ssse3
    movifnidn wd, wm
    movifnidn srcq, srcmp
    movifnidn ssq, ssmp
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.put:
    tzcnt wd, wd
    movzx wd, word [base+put_ssse3_table+wq*2]
    movifnidn dstq, dstmp
    movifnidn dsq, dsmp
    add wq, t2
%if WIN64
    pop r8
    pop r7
%endif
    jmp wq
.h_w2:
    mova m2, [base+spel_h_shuf2]
    pshufd m3, m3, q2121
.h_w2_loop:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m0, m2
    pshufb m1, m2
    pmaddwd m0, m3
    pmaddwd m1, m3
    phaddd m0, m1
    paddd m0, m4
    psrad m0, 6
    packssdw m0, m0
    pxor m1, m1
    pminsw m0, m5
    pmaxsw m0, m1
    movd [dstq+dsq*0], m0
    pshuflw m0, m0, q3232
    movd [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movzx mxd, mxb
    lea srcq, [srcq-2]
    movq m3, [base+subpel_filters+mxq*8]
    movifnidn dstq, dstmp
    punpcklbw m3, m3
    psraw m3, 8 ; sign-extend
    jl .h_w2
    WIN64_SPILL_XMM 9
    mova m7, [base+spel_h_shufA]
%if ARCH_X86_32
    %define m8 [base+spel_h_shufB]
%else
    mova m8, [base+spel_h_shufB]
%endif
    pshufd m2, m3, q1111
    pshufd m3, m3, q2222
.h_w4_loop:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m6, m0, m7 ; 0 1 1 2 2 3 3 4
    pmaddwd m6, m2
    pshufb m0, m8     ; 2 3 3 4 4 5 5 6
    pmaddwd m0, m3
    paddd m0, m6
    pshufb m6, m1, m7
    pmaddwd m6, m2
    pshufb m1, m8
    pmaddwd m1, m3
    paddd m0, m4
    paddd m6, m4
    paddd m1, m6
    psrad m0, 6
    psrad m1, 6
    packssdw m0, m1
    pxor m1, m1
    pminsw m0, m5
    pmaxsw m0, m1
    movq [dstq+dsq*0], m0
    movhps [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET
.h:
    RESET_STACK_STATE
    test myd, 0xf00
    jnz .hv
    mov myd, r8m
    movd m5, r8m
    shr myd, 11
    movddup m4, [base+put_8tap_h_rnd+myq*8]
    movifnidn dsq, dsmp
    pshufb m5, [base+pw_256]
    sub wd, 4
    jle .h_w4
    WIN64_SPILL_XMM 11
    shr mxd, 16
    movq m2, [base+subpel_filters+1+mxq*8]
    movifnidn dstq, dstmp
    mova m6, [base+spel_h_shufA]
    mova m7, [base+spel_h_shufB]
    lea srcq, [srcq+wq*2]
    punpcklbw m2, m2
    lea dstq, [dstq+wq*2]
    psraw m2, 8
    neg wq
%if ARCH_X86_32
    ALLOC_STACK -16*3
    %define m8  [rsp+16*0]
    %define m9  [rsp+16*1]
    %define m10 [rsp+16*2]
    pshufd m0, m2, q0000
    pshufd m1, m2, q1111
    pshufd m2, m2, q2222
    mova m8, m0
    mova m9, m1
    mova m10, m2
%else
    pshufd m8, m2, q0000
    pshufd m9, m2, q1111
    pshufd m10, m2, q2222
%endif
.h_w8_loop0:
    mov r6, wq
.h_w8_loop:
    movu m3, [srcq+r6*2-4]
    movu m2, [srcq+r6*2+8]
    pshufb m0, m3, m6   ; 01 12 23 34
    pmaddwd m0, m8      ; abcd0
    pshufb m3, m7       ; 23 34 45 56
    pmaddwd m1, m9, m3  ; abcd1
    paddd m0, m1
    pshufb m1, m2, m6   ; 67 78 89 9a
    shufpd m3, m1, 0x01 ; 45 56 67 78
    pmaddwd m1, m9      ; efgh1
    pshufb m2, m7       ; 89 9a ab bc
    pmaddwd m2, m10     ; efgh2
    paddd m1, m2
    pmaddwd m2, m10, m3 ; abcd2
    pmaddwd m3, m8      ; efgh0
    paddd m0, m4
    paddd m1, m4
    paddd m0, m2
    paddd m1, m3
    psrad m0, 6
    psrad m1, 6
    packssdw m0, m1
    pxor m1, m1
    pminsw m0, m5
    pmaxsw m0, m1
    mova [dstq+r6*2], m0
    add r6, 8
    jl .h_w8_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w8_loop0
    RET
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovb myd, mxd
    movq m2, [base+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 11, 16
    movd m5, r8m
    movifnidn dstq, dstmp
    movifnidn dsq, dsmp
    punpcklbw m2, m2
    pshufb m5, [base+pw_256]
    psraw m2, 8 ; sign-extend
%if ARCH_X86_32
    ALLOC_STACK -16*4
    pshufd m0, m2, q0000
    mov r6, ssq
    pshufd m1, m2, q1111
    neg r6
    pshufd m2, m2, q2222
    mova m8, m0
    mova m9, m1
    mova m10, m2
    cmp wd, 2
    jne .v_w4
%else
    mov r6, ssq
    pshufd m8, m2, q0000
    neg r6
    cmp wd, 4
    jg .v_w8
    pshufd m9, m2, q1111
    pshufd m10, m2, q2222
    je .v_w4
%endif
.v_w2:
    movd m1, [srcq+r6 *2]
    movd m3, [srcq+r6 *1]
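    ; r6 = -ssq at this point, so the two loads above fetch the rows at
    ; srcq-ssq*2 and srcq-ssq; together with the three rows loaded below
    ; this provides the five initial rows the 6-tap vertical filter needs
    ; before .v_w2_loop starts streaming in one new row pair per iteration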
    movd m2, [srcq+ssq*0]
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m0, [srcq+ssq*0]
    punpckldq m1, m3 ; 0 1
    punpckldq m3, m2 ; 1 2
    punpckldq m2, m4 ; 2 3
    punpckldq m4, m0 ; 3 4
    punpcklwd m1, m3 ; 01 12
    punpcklwd m2, m4 ; 23 34
    pxor m6, m6
.v_w2_loop:
    movd m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m4, m8, m1   ; a0 b0
    mova m1, m2
    pmaddwd m2, m9       ; a1 b1
    paddd m4, m2
    punpckldq m2, m0, m3 ; 4 5
    movd m0, [srcq+ssq*0]
    punpckldq m3, m0     ; 5 6
    punpcklwd m2, m3     ; 45 56
    pmaddwd m3, m10, m2  ; a2 b2
    paddd m4, m3
    psrad m4, 5
    packssdw m4, m4
    pmaxsw m4, m6
    pavgw m4, m6
    pminsw m4, m5
    movd [dstq+dsq*0], m4
    pshuflw m4, m4, q3232
    movd [dstq+dsq*1], m4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
    shl wd, 14
    lea srcq, [srcq+r6*2]
    lea wd, [wq+hq-(1<<16)]
%if STACK_ALIGNMENT < 16
    %define dstmp [esp+16*3]
%endif
.v_w4_loop0:
    mov dstmp, dstq
    movq m1, [srcq+ssq*0]
    movq m2, [srcq+ssq*1]
    lea r6, [srcq+ssq*2]
    movq m3, [r6  +ssq*0]
    movq m4, [r6  +ssq*1]
    lea r6, [r6  +ssq*2]
%else
    movq m1, [srcq+r6 *2]
    movq m2, [srcq+r6 *1]
    lea r6, [srcq+ssq*2]
    movq m3, [srcq+ssq*0]
    movq m4, [srcq+ssq*1]
%endif
    movq m0, [r6  +ssq*0]
    punpcklwd m1, m2 ; 01
    punpcklwd m2, m3 ; 12
    punpcklwd m3, m4 ; 23
    punpcklwd m4, m0 ; 34
.v_w4_loop:
    pmaddwd m6, m8, m1 ; a0
    pmaddwd m7, m8, m2 ; b0
    mova m1, m3
    pmaddwd m3, m9     ; a1
    mova m2, m4
    pmaddwd m4, m9     ; b1
    paddd m6, m3
    movq m3, [r6+ssq*0]
    paddd m7, m4
    movq m4, [r6+ssq*1]
    lea r6, [r6+ssq*2]
    movq m0, [r6+ssq*0]
    punpcklwd m3, m4    ; 45
    punpcklwd m4, m0    ; 56
    pmaddwd m0, m10, m3 ; a2
    paddd m6, m0
    pmaddwd m0, m10, m4 ; b2
    paddd m7, m0
    psrad m6, 5
    psrad m7, 5
    packssdw m6, m7
    pxor m7, m7
    pmaxsw m6, m7
    pavgw m6, m7
    pminsw m6, m5
    movq [dstq+dsq*0], m6
    movhps [dstq+dsq*1], m6
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov dstq, dstmp
    add srcq, 8
    movzx hd, ww
    add dstq, 8
    sub wd, 1<<16
    jg .v_w4_loop0
    RET
%else
    RET
.v_w8:
    mova r6m, m8
    shl wd, 5
    pshufd m6, m2, q1111
    lea wd, [wq+hq-(1<<8)]
    pshufd m7, m2, q2222
    WIN64_PUSH_XMM 16
.v_w8_loop0:
    movu m9,  [srcq+ r6*2]
    movu m11, [srcq+ r6*1]
    lea r7, [srcq+ssq*2]
    movu m13, [srcq+ssq*0]
    movu m15, [srcq+ssq*1]
    mov r8, dstq
    movu m4, [r7  +ssq*0]
    punpcklwd m8, m9, m11   ; 01
    punpckhwd m9, m11
    punpcklwd m10, m11, m13 ; 12
    punpckhwd m11, m13
    punpcklwd m12, m13, m15 ; 23
    punpckhwd m13, m15
    punpcklwd m14, m15, m4  ; 34
    punpckhwd m15, m4
.v_w8_loop:
    mova m3, r6m
    pmaddwd m0, m8, m3  ; a0
    pmaddwd m2, m9, m3  ; a0'
    pmaddwd m1, m10, m3 ; b0
    pmaddwd m3, m11     ; b0'
    mova m8, m12
    pmaddwd m12, m6     ; a1
    mova m9, m13
    pmaddwd m13, m6     ; a1'
    mova m10, m14
    pmaddwd m14, m6     ; b1
    mova m11, m15
    pmaddwd m15, m6     ; b1'
    paddd m0, m12
    paddd m2, m13
    movu m13, [r7+ssq*0]
    paddd m1, m14
    paddd m3, m15
    movu m15, [r7+ssq*1]
    lea r7, [r7+ssq*2]
    movu m4, [r7+ssq*0]
    punpcklwd m12, m13, m15 ; 45
    punpckhwd m13, m15
    punpcklwd m14, m15, m4  ; 56
    punpckhwd m15, m4
    pmaddwd m4, m7, m12 ; a2
    paddd m0, m4
    pmaddwd m4, m7, m13 ; a2'
    paddd m2, m4
    pmaddwd m4, m7, m14 ; b2
    paddd m1, m4
    pmaddwd m4, m7, m15 ; b2'
    paddd m3, m4
    REPX {psrad x, 5}, m0, m2, m1, m3
    packssdw m0, m2
    packssdw m1, m3
    pxor m2, m2
    pmaxsw m0, m2
    pmaxsw m1, m2
    pavgw m0, m2
    pavgw m1, m2
    pminsw m0, m5
    pminsw m1, m5
    mova [r8+dsq*0], m0
    mova [r8+dsq*1], m1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 16
    add dstq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w8_loop0
    RET
%endif
.hv:
    cmp wd, 4
    jg .hv_w8
    WIN64_SPILL_XMM 12, 16
%if ARCH_X86_32
    movd m3, r8m
    pshufb m3, [base+pw_256]
%else
    movd m11, r8m
    pshufb m11, [base+pw_256]
%endif
    movzx mxd, mxb
    movq m0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovb myd, mxd
    movq m2, [base+subpel_filters+1+myq*8]
    movddup m7, [base+pd_8704]
    sub srcq, 2
    pshuflw m0, m0, q2121
    pxor m6, m6
    punpcklbw m6, m0
    punpcklbw m2, m2
    psraw m2, 8 ; sign-extend
    test dword r8m, 0x800
    jz .hv_w2_10bpc
    movddup m7, [base+pd_2560]
    psraw m6, 2
    psllw m2, 2
.hv_w2_10bpc:
%if ARCH_X86_32
%assign regs_used 2
    ALLOC_STACK -16*7
%assign regs_used 7
    mov dstq, r0mp
    mov dsq, r1mp
    %define m11 [esp+16*4]
    pshufd m0, m2, q0000
    pshufd m1, m2, q1111
    pshufd m2, m2, q2222
    mova m8, m0
    mova m9, m1
    mova m10, m2
    mova m11, m3
    neg ssq
    movu m3, [srcq+ssq*2]
    movu m4, [srcq+ssq*1]
    neg ssq
%else
    pshufd m8, m2, q0000
    mov r6, ssq
    pshufd m9, m2, q1111
    neg r6
    pshufd m10, m2, q2222
    movu m3, [srcq+r6 *2]
    movu m4, [srcq+r6 *1]
%endif
    movu m1, [srcq+ssq*0]
    movu m0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movu m2, [srcq+ssq*0]
    cmp wd, 4
    je .hv_w4
    mova m5, [base+spel_h_shuf2]
    REPX {pshufb  x, m5}, m3, m4, m0, m1, m2
    REPX {pmaddwd x, m6}, m3, m0, m4, m1, m2
    phaddd m3, m0 ; 0 3
    phaddd m4, m1 ; 1 2
    phaddd m0, m2 ; 3 4
    REPX {paddd x, m7}, m3, m4, m0
    REPX {psrad x, 10}, m3, m4, m0
    packssdw m3, m4      ; 0 3 1 2
    packssdw m4, m0      ; 1 2 3 4
    pshufd m2, m3, q1320 ; 0 1 2 3
    punpcklwd m1, m2, m4 ; 01 12
    punpckhwd m2, m4     ; 23 34
.hv_w2_loop:
    movu m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movu m4, [srcq+ssq*0]
    pshufb m3, m5
    pshufb m4, m5
    pmaddwd m3, m6
    pmaddwd m4, m6
    phaddd m3, m4
    pmaddwd m4, m8, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m9     ; a1 b1
    paddd m4, m2
    paddd m3, m7
    psrad m3, 10       ; 5 6
    packssdw m0, m3
    pshufd m2, m0, q2103
    punpckhwd m2, m0   ; 45 56
    mova m0, m3
    pmaddwd m3, m10, m2 ; a2 b2
    paddd m4, m3
    psrad m4, 10
    packssdw m4, m4
    pxor m3, m3
    pminsw m4, m11
    pmaxsw m4, m3
    movd [dstq+dsq*0], m4
    pshuflw m4, m4, q1032
    movd [dstq+dsq*1], m4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
%if ARCH_X86_32
    %define m12 [esp+16*5]
    %define m13 [esp+16*6]
    %define m14 [base+spel_h_shufA]
    %define m15 [base+spel_h_shufB]
    pshufd m5, m6, q0000
    pshufd m6, m6, q1111
    mova m12, m5
    mova m13, m6
%else
    WIN64_PUSH_XMM 16
    mova m14, [base+spel_h_shufA]
    mova m15, [base+spel_h_shufB]
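    ; spel_h_shufA/B expand each row into overlapping word pairs so that two
    ; pmaddwd per row evaluate the four middle filter taps for four pixels
    ; at once (used by the HV_H_W4_6TAP helper defined below)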
    pshufd m12, m6, q0000
    pshufd m13, m6, q1111
%endif
%macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB
    pshufb %3, %2, m14
    pmaddwd %3, m12
    pshufb %2, %4
    pmaddwd %2, m13
    paddd %3, m7
    paddd %1, %2, %3
%endmacro
    HV_H_W4_6TAP m3, m3, m5
    HV_H_W4_6TAP m4, m4, m5
    HV_H_W4_6TAP m5, m1, m5
    HV_H_W4_6TAP m0, m0, m1
    HV_H_W4_6TAP m2, m2, m1
    REPX {psrad x, 10}, m3, m5, m4, m0, m2
    packssdw m3, m5      ; 0 2
    packssdw m4, m0      ; 1 3
    packssdw m5, m2      ; 2 4
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4     ; 23
    punpcklwd m2, m4, m5 ; 12
    punpckhwd m4, m5     ; 34
.hv_w4_loop:
    movu m0, [srcq+ssq*1]
    pmaddwd m5, m8, m1 ; a0
    lea srcq, [srcq+ssq*2]
    pmaddwd m6, m8, m2 ; b0
    mova m1, m3
    pmaddwd m3, m9     ; a1
    mova m2, m4
    pmaddwd m4, m9     ; b1
    paddd m5, m3
    movu m3, [srcq+ssq*0]
    paddd m6, m4
    HV_H_W4_6TAP m0, m0, m4
    HV_H_W4_6TAP m3, m3, m4
    psrad m4, m2, 16
    psrad m0, 10
    psrad m3, 10
    packssdw m4, m0      ; 4 5
    packssdw m0, m3      ; 5 6
    punpcklwd m3, m4, m0 ; 45
    punpckhwd m4, m0     ; 56
    pmaddwd m0, m10, m3  ; a2
    paddd m5, m0
    pmaddwd m0, m10, m4  ; b2
    paddd m6, m0
    psrad m5, 10
    psrad m6, 10
    packssdw m5, m6
    pxor m6, m6
    pminsw m5, m11
    pmaxsw m5, m6
    movq [dstq+dsq*0], m5
    movhps [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    RESET_STACK_STATE
    shr mxd, 16
    movq m2, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovb myd, mxd
    movq m1, [base+subpel_filters+1+myq*8]
    movd m3, r8m
    movddup m4, [base+pd_8704]
    pshufb m3, [base+pw_256]
    pxor m0, m0
    punpcklbw m0, m2
    punpcklbw m1, m1
    sub srcq, 4
    psraw m1, 8 ; sign-extend
    test dword r8m, 0x800
    jz .hv_w8_10bpc
    movddup m4, [base+pd_2560]
    psraw m0, 2
    psllw m1, 2
.hv_w8_10bpc:
%if ARCH_X86_32
%assign regs_used 2
    ALLOC_STACK -16*9
%assign regs_used 7
    mov dstq, r0mp
    mov dsq, r1mp
    mova [rsp+16*7], m4
%else
    ALLOC_STACK 16*7, 16
%endif
    mova [rsp+16*6], m3
    pshufd m2, m0, q0000
    mova [rsp+16*0], m2
    pshufd m2, m0, q1111
    mova [rsp+16*1], m2
    pshufd m0, m0, q2222
    mova [rsp+16*2], m0
    pshufd m2, m1, q0000
    mova [rsp+16*3], m2
    pshufd m2, m1, q1111
    mova [rsp+16*4], m2
    pshufd m1, m1, q2222
    mova [rsp+16*5], m1
    mov r6, ssq
    neg r6
%if ARCH_X86_32
    shl wd, 14
    lea r4d, [wq+hq-(1<<16)]
%if STACK_ALIGNMENT < 16
    %define srcmp [esp+16*8+4*0]
    %define dstmp [esp+16*8+4*1]
%endif
%macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3]
    punpcklwd %1, %2, %3 ; 01 12 23 34
    punpckhwd %2, %3     ; 45 56 67 78
    pmaddwd %3, %4, %1   ; a0
    shufpd %1, %2, 0x01  ; 23 34 45 56
    pmaddwd %2, %6       ; a2
    pmaddwd %1, %5       ; a1
    paddd %2, %3
    paddd %1, %2
%endmacro
.hv_w8_loop0:
    mov srcmp, srcq
    mov dstmp, dstq
    movu m5, [srcq+r6*2+0]
    movu m6, [srcq+r6*2+2]
    mova m7, [rsp+16*0]
    mova m1, [rsp+16*1]
    mova m0, [rsp+16*2]
    HV_H_6TAP m2, m5, m6, m7, m1, m0
    movu m5, [srcq+r6*1+0]
    movu m6, [srcq+r6*1+2]
    HV_H_6TAP m3, m5, m6, m7, m1, m0
    movu m5, [srcq+ssq*0+0]
    movu m6, [srcq+ssq*0+2]
    HV_H_6TAP m4, m5, m6, m7, m1, m0
    movu m5, [srcq+ssq*1+0]
    movu m6, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    HV_H_6TAP m0, m5, m6, m7, m1
    movu m5, [srcq+ssq*0+0]
    movu m6, [srcq+ssq*0+2]
    HV_H_6TAP m1, m5, m6, m7
    mova m5, [rsp+16*7]
    REPX {paddd x, m5}, m2, m3, m4, m0, m1
    REPX {psrad x, 10}, m2, m4, m3, m0, m1
    packssdw m2, m4      ; 0 2
    packssdw m3, m0      ; 1 3
    packssdw m4, m1      ; 2 4
    punpcklwd m0, m2, m3 ; 01
    punpckhwd m2, m3     ; 23
    punpcklwd m1, m3, m4 ; 12
    punpckhwd m3, m4     ; 34
.hv_w8_loop:
    mova m5, [rsp+16*3]
    mova m6, [rsp+16*4]
    pmaddwd m4, m0, m5 ; a0
    pmaddwd m5, m1     ; b0
    mova m0, m2
    pmaddwd m2, m6     ; a1
    mova m1, m3
    pmaddwd m3, m6     ; b1
    paddd m4, m2
    movu m2, [srcq+ssq*1+0]
    paddd m5, m3
    movu m3, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    HV_H_6TAP m6, m2, m3
    movu m2, [srcq+ssq*0+0]
    movu m3, [srcq+ssq*0+2]
    HV_H_6TAP m7, m2, m3
    mova m2, [rsp+16*7]
    psrad m3, m1, 16
    paddd m6, m2
    paddd m7, m2
    psrad m6, 10
    psrad m7, 10
    packssdw m3, m6 ; 4 5
    packssdw m6, m7 ; 5 6
    mova m7, [rsp+16*5]
    punpcklwd m2, m3, m6 ; 45
    punpckhwd m3, m6     ; 56
    pmaddwd m6, m2, m7   ; a2
    pmaddwd m7, m3       ; b2
    paddd m4, m6
    paddd m5, m7
    psrad m4, 10
    psrad m5, 10
    packssdw m4, m5
    pxor m5, m5
    pminsw m4, [rsp+16*6]
    pmaxsw m4, m5
    movq [dstq+dsq*0], m4
    movhps [dstq+dsq*1], m4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    mov srcq, srcmp
    mov dstq, dstmp
    movzx hd, r4w
    add srcq, 8
    add dstq, 8
    sub r4d, 1<<16
%else
    shl wd, 5
    lea r8d, [wq+hq-256]
%macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3]
%ifid %6
    REPX {pshufb x, %6}, %2, %3, %4
%else
    mova %1, %6
    pshufb %2, %1 ; 01 12 23 34
    pshufb %3, %1 ; 45 56 67 78
    pshufb %4, %1 ; 89 9a ab bc
%endif
    pmaddwd %1, %7, %2
    shufpd %2, %3, 0x01 ; 23 34 45 56
    pmaddwd %2, %8
    paddd %1, %2
    pmaddwd %2, %9, %3
    paddd %1, %2
    pmaddwd %2, %7, %3
    shufpd %3, %4, 0x01 ; 67 78 89 9a
    pmaddwd %4, %9
    pmaddwd %3, %8
    paddd %1, m4
    paddd %2, m4
    paddd %3, %4
    paddd %2, %3
    psrad %1, %5
    psrad %2, %5
    packssdw %1, %2
%endmacro
.hv_w8_loop0:
    mova m5, [spel_h_shufA]
    movu m0, [srcq+r6*2+ 0]
    mova m6, [rsp+16*0]
    movu m1, [srcq+r6*2+ 8]
    mova m7, [rsp+16*1]
    movu m2, [srcq+r6*2+16]
    mova m8, [rsp+16*2]
    HV_H_6TAP m9, m0, m1, m2, 10, m5, m6, m7, m8
    movu m0, [srcq+r6*1+ 0]
    movu m1, [srcq+r6*1+ 8]
    movu m2, [srcq+r6*1+16]
    lea r4, [srcq+ssq*2]
    HV_H_6TAP m11, m0, m1, m2, 10, m5, m6, m7, m8
    movu m0, [srcq+ssq*0+ 0]
    movu m1, [srcq+ssq*0+ 8]
    movu m2, [srcq+ssq*0+16]
    mov r7, dstq
    HV_H_6TAP m13, m0, m1, m2, 10, m5, m6, m7, m8
    movu m0, [srcq+ssq*1+ 0]
    movu m1, [srcq+ssq*1+ 8]
    movu m2, [srcq+ssq*1+16]
    HV_H_6TAP m15, m0, m1, m2, 10, m5, m6, m7, m8
    movu m0, [r4+ssq*0+ 0]
    movu m1, [r4+ssq*0+ 8]
    movu m2, [r4+ssq*0+16]
    HV_H_6TAP m5, m0, m1, m2, 10, m5, m6, m7, m8
    punpcklwd m8, m9, m11   ; 01
    punpckhwd m9, m11
    punpcklwd m10, m11, m13 ; 12
    punpckhwd m11, m13
    punpcklwd m12, m13, m15 ; 23
    punpckhwd m13, m15
    punpcklwd m14, m15, m5  ; 34
    punpckhwd m15, m5
.hv_w8_loop:
    mova m3, [rsp+16*3]
    mova m7, [rsp+16*4]
    pmaddwd m0, m8, m3 ; a0
    mova m8, m12
1993 pmaddwd m2, m9, m3 ; a0' 1994 mova m9, m13 1995 pmaddwd m1, m10, m3 ; b0 1996 mova m10, m14 1997 pmaddwd m3, m11 ; b0' 1998 mova m11, m15 1999 REPX {pmaddwd x, m7}, m12, m13, m14, m15 2000 movu m6, [r4+ssq*1+ 0] 2001 paddd m0, m12 2002 movu m7, [r4+ssq*1+ 8] 2003 paddd m2, m13 2004 movu m12, [r4+ssq*1+16] 2005 paddd m1, m14 2006 lea r4, [r4+ssq*2] 2007 paddd m3, m15 2008 HV_H_6TAP m15, m6, m7, m12, 10 2009 movu m6, [r4+ssq*0+ 0] 2010 movu m7, [r4+ssq*0+ 8] 2011 movu m14, [r4+ssq*0+16] 2012 punpcklwd m12, m5, m15 ; 45 2013 punpckhwd m13, m5, m15 2014 HV_H_6TAP m5, m6, m7, m14, 10 2015 mova m7, [rsp+16*5] 2016 punpcklwd m14, m15, m5 ; 56 2017 punpckhwd m15, m5 2018 pmaddwd m6, m12, m7 ; a2 2019 paddd m0, m6 2020 pmaddwd m6, m13, m7 ; a2' 2021 paddd m2, m6 2022 pmaddwd m6, m14, m7 ; b2 2023 pmaddwd m7, m15 ; b2' 2024 paddd m1, m6 2025 mova m6, [rsp+16*6] 2026 paddd m3, m7 2027 REPX {psrad x, 10}, m0, m2, m1, m3 2028 packssdw m0, m2 2029 packssdw m1, m3 2030 pxor m2, m2 2031 pminsw m0, m6 2032 pminsw m1, m6 2033 pmaxsw m0, m2 2034 pmaxsw m1, m2 2035 mova [r7+dsq*0], m0 2036 mova [r7+dsq*1], m1 2037 lea r7, [r7+dsq*2] 2038 sub hd, 2 2039 jg .hv_w8_loop 2040 add srcq, 16 2041 add dstq, 16 2042 movzx hd, r8b 2043 sub r8d, 1<<8 2044%endif 2045 jg .hv_w8_loop0 2046 RET 2047 2048PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc 2049PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc 2050PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc 2051PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc 2052PUT_8TAP_FN sharp, SHARP, SHARP 2053 2054cglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my 2055%if ARCH_X86_32 2056 %define mxb r0b 2057 %define mxd r0 2058 %define mxq r0 2059 %define myb r1b 2060 %define myd r1 2061 %define myq r1 2062 %define m8 [esp+16*0] 2063 %define m9 [esp+16*1] 2064 %define m10 [esp+16*2] 2065 %define m11 [esp+16*3] 2066 %define m12 [esp+16*4] 2067 %define m13 [esp+16*5] 2068 %define m14 [esp+16*6] 2069 %define m15 [esp+16*7] 2070%endif 2071 imul mxd, mxm, 0x010101 2072 add mxd, t0d ; 8tap_h, mx, 4tap_h 2073 imul myd, mym, 0x010101 2074 add myd, t1d ; 8tap_v, my, 4tap_v 2075 LEA t2, put_ssse3 2076 movifnidn wd, wm 2077 movifnidn srcq, srcmp 2078 movifnidn ssq, ssmp 2079 movifnidn hd, hm 2080 test mxd, 0xf00 2081 jnz .h 2082 test myd, 0xf00 2083 jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put 2084.v: 2085 movzx mxd, myb 2086 shr myd, 16 2087 cmp hd, 6 2088 cmovb myd, mxd 2089 movq m3, [base+subpel_filters+myq*8] 2090 WIN64_SPILL_XMM 15 2091 movd m7, r8m 2092 movifnidn dstq, dstmp 2093 movifnidn dsq, dsmp 2094 punpcklbw m3, m3 2095 pshufb m7, [base+pw_256] 2096 psraw m3, 8 ; sign-extend 2097%if ARCH_X86_32 2098 ALLOC_STACK -16*7 2099 pshufd m0, m3, q0000 2100 pshufd m1, m3, q1111 2101 pshufd m2, m3, q2222 2102 pshufd m3, m3, q3333 2103 mova m8, m0 2104 mova m9, m1 2105 mova m10, m2 2106 mova m11, m3 2107%else 2108 pshufd m8, m3, q0000 2109 pshufd m9, m3, q1111 2110 pshufd m10, m3, q2222 2111 pshufd m11, m3, q3333 2112%endif 2113 lea r6, [ssq*3] 2114 sub srcq, r6 2115 cmp wd, 2 2116 jne .v_w4 2117.v_w2: 2118 movd m1, [srcq+ssq*0] 2119 movd m4, [srcq+ssq*1] 2120 movd m2, [srcq+ssq*2] 2121 add srcq, r6 2122 movd m5, [srcq+ssq*0] 2123 movd m3, [srcq+ssq*1] 2124 movd m6, [srcq+ssq*2] 2125 add srcq, r6 2126 movd m0, [srcq+ssq*0] 2127 punpckldq m1, m4 ; 0 1 2128 punpckldq m4, m2 ; 1 2 2129 punpckldq m2, m5 ; 2 3 2130 punpckldq m5, m3 ; 3 4 2131 punpckldq m3, m6 ; 4 5 2132 punpckldq m6, m0 ; 5 6 2133 punpcklwd m1, m4 ; 01 12 2134 punpcklwd m2, 
m5 ; 23 34 2135 punpcklwd m3, m6 ; 45 56 2136 pxor m6, m6 2137.v_w2_loop: 2138 movd m4, [srcq+ssq*1] 2139 lea srcq, [srcq+ssq*2] 2140 pmaddwd m5, m8, m1 ; a0 b0 2141 mova m1, m2 2142 pmaddwd m2, m9 ; a1 b1 2143 paddd m5, m2 2144 mova m2, m3 2145 pmaddwd m3, m10 ; a2 b2 2146 paddd m5, m3 2147 punpckldq m3, m0, m4 ; 6 7 2148 movd m0, [srcq+ssq*0] 2149 punpckldq m4, m0 ; 7 8 2150 punpcklwd m3, m4 ; 67 78 2151 pmaddwd m4, m11, m3 ; a3 b3 2152 paddd m5, m4 2153 psrad m5, 5 2154 packssdw m5, m5 2155 pmaxsw m5, m6 2156 pavgw m5, m6 2157 pminsw m5, m7 2158 movd [dstq+dsq*0], m5 2159 pshuflw m5, m5, q3232 2160 movd [dstq+dsq*1], m5 2161 lea dstq, [dstq+dsq*2] 2162 sub hd, 2 2163 jg .v_w2_loop 2164 RET 2165.v_w4: 2166%if ARCH_X86_32 2167 shl wd, 14 2168%if STACK_ALIGNMENT < 16 2169 mov [esp+4*29], srcq 2170 mov [esp+4*30], dstq 2171%else 2172 mov srcmp, srcq 2173%endif 2174 lea wd, [wq+hq-(1<<16)] 2175%else 2176 shl wd, 6 2177 mov r7, srcq 2178 mov r8, dstq 2179 lea wd, [wq+hq-(1<<8)] 2180%endif 2181.v_w4_loop0: 2182 movq m1, [srcq+ssq*0] 2183 movq m2, [srcq+ssq*1] 2184 movq m3, [srcq+ssq*2] 2185 add srcq, r6 2186 movq m4, [srcq+ssq*0] 2187 movq m5, [srcq+ssq*1] 2188 movq m6, [srcq+ssq*2] 2189 add srcq, r6 2190 movq m0, [srcq+ssq*0] 2191 punpcklwd m1, m2 ; 01 2192 punpcklwd m2, m3 ; 12 2193 punpcklwd m3, m4 ; 23 2194 punpcklwd m4, m5 ; 34 2195 punpcklwd m5, m6 ; 45 2196 punpcklwd m6, m0 ; 56 2197%if ARCH_X86_32 2198 jmp .v_w4_loop_start 2199.v_w4_loop: 2200 mova m1, m12 2201 mova m2, m13 2202 mova m3, m14 2203.v_w4_loop_start: 2204 pmaddwd m1, m8 ; a0 2205 pmaddwd m2, m8 ; b0 2206 mova m12, m3 2207 mova m13, m4 2208 pmaddwd m3, m9 ; a1 2209 pmaddwd m4, m9 ; b1 2210 paddd m1, m3 2211 paddd m2, m4 2212 mova m14, m5 2213 mova m4, m6 2214 pmaddwd m5, m10 ; a2 2215 pmaddwd m6, m10 ; b2 2216 paddd m1, m5 2217 paddd m2, m6 2218 movq m6, [srcq+ssq*1] 2219 lea srcq, [srcq+ssq*2] 2220 punpcklwd m5, m0, m6 ; 67 2221 movq m0, [srcq+ssq*0] 2222 pmaddwd m3, m11, m5 ; a3 2223 punpcklwd m6, m0 ; 78 2224 paddd m1, m3 2225 pmaddwd m3, m11, m6 ; b3 2226 paddd m2, m3 2227 psrad m1, 5 2228 psrad m2, 5 2229 packssdw m1, m2 2230 pxor m2, m2 2231 pmaxsw m1, m2 2232 pavgw m1, m2 2233 pminsw m1, m7 2234 movq [dstq+dsq*0], m1 2235 movhps [dstq+dsq*1], m1 2236 lea dstq, [dstq+dsq*2] 2237 sub hd, 2 2238 jg .v_w4_loop 2239%if STACK_ALIGNMENT < 16 2240 mov srcq, [esp+4*29] 2241 mov dstq, [esp+4*30] 2242 movzx hd, ww 2243 add srcq, 8 2244 add dstq, 8 2245 mov [esp+4*29], srcq 2246 mov [esp+4*30], dstq 2247%else 2248 mov srcq, srcmp 2249 mov dstq, dstmp 2250 movzx hd, ww 2251 add srcq, 8 2252 add dstq, 8 2253 mov srcmp, srcq 2254 mov dstmp, dstq 2255%endif 2256 sub wd, 1<<16 2257%else 2258.v_w4_loop: 2259 pmaddwd m12, m8, m1 ; a0 2260 pmaddwd m13, m8, m2 ; b0 2261 mova m1, m3 2262 mova m2, m4 2263 pmaddwd m3, m9 ; a1 2264 pmaddwd m4, m9 ; b1 2265 paddd m12, m3 2266 paddd m13, m4 2267 mova m3, m5 2268 mova m4, m6 2269 pmaddwd m5, m10 ; a2 2270 pmaddwd m6, m10 ; b2 2271 paddd m12, m5 2272 paddd m13, m6 2273 movq m6, [srcq+ssq*1] 2274 lea srcq, [srcq+ssq*2] 2275 punpcklwd m5, m0, m6 ; 67 2276 movq m0, [srcq+ssq*0] 2277 pmaddwd m14, m11, m5 ; a3 2278 punpcklwd m6, m0 ; 78 2279 paddd m12, m14 2280 pmaddwd m14, m11, m6 ; b3 2281 paddd m13, m14 2282 psrad m12, 5 2283 psrad m13, 5 2284 packssdw m12, m13 2285 pxor m13, m13 2286 pmaxsw m12, m13 2287 pavgw m12, m13 2288 pminsw m12, m7 2289 movq [dstq+dsq*0], m12 2290 movhps [dstq+dsq*1], m12 2291 lea dstq, [dstq+dsq*2] 2292 sub hd, 2 2293 jg .v_w4_loop 2294 add r7, 8 2295 add r8, 8 2296 movzx 
hd, wb 2297 mov srcq, r7 2298 mov dstq, r8 2299 sub wd, 1<<8 2300%endif 2301 jg .v_w4_loop0 2302 RET 2303.h: 2304 RESET_STACK_STATE 2305 test myd, 0xf00 2306 jnz .hv 2307 mov myd, r8m 2308 movd m5, r8m 2309 shr myd, 11 2310 movddup m4, [base+put_8tap_h_rnd+myq*8] 2311 movifnidn dsq, dsmp 2312 pshufb m5, [base+pw_256] 2313 cmp wd, 4 2314 jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4 2315 WIN64_SPILL_XMM 12 2316 shr mxd, 16 2317 movq m3, [base+subpel_filters+mxq*8] 2318 movifnidn dstq, dstmp 2319 mova m6, [base+spel_h_shufA] 2320 mova m7, [base+spel_h_shufB] 2321%if UNIX64 2322 mov wd, wd 2323%endif 2324 lea srcq, [srcq+wq*2] 2325 punpcklbw m3, m3 2326 lea dstq, [dstq+wq*2] 2327 psraw m3, 8 2328 neg wq 2329%if ARCH_X86_32 2330 ALLOC_STACK -16*4 2331 pshufd m0, m3, q0000 2332 pshufd m1, m3, q1111 2333 pshufd m2, m3, q2222 2334 pshufd m3, m3, q3333 2335 mova m8, m0 2336 mova m9, m1 2337 mova m10, m2 2338 mova m11, m3 2339%else 2340 pshufd m8, m3, q0000 2341 pshufd m9, m3, q1111 2342 pshufd m10, m3, q2222 2343 pshufd m11, m3, q3333 2344%endif 2345.h_w8_loop0: 2346 mov r6, wq 2347.h_w8_loop: 2348 movu m0, [srcq+r6*2- 6] 2349 movu m1, [srcq+r6*2+ 2] 2350 pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 2351 pshufb m0, m7 ; 2 3 3 4 4 5 5 6 2352 pmaddwd m2, m8 ; abcd0 2353 pmaddwd m0, m9 ; abcd1 2354 pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 2355 pshufb m1, m7 ; 6 7 7 8 8 9 9 a 2356 paddd m2, m4 2357 paddd m0, m2 2358 pmaddwd m2, m10, m3 ; abcd2 2359 pmaddwd m3, m8 ; efgh0 2360 paddd m0, m2 2361 pmaddwd m2, m11, m1 ; abcd3 2362 pmaddwd m1, m9 ; efgh1 2363 paddd m0, m2 2364 movu m2, [srcq+r6*2+10] 2365 paddd m3, m4 2366 paddd m1, m3 2367 pshufb m3, m2, m6 ; 8 9 9 a a b b c 2368 pshufb m2, m7 ; a b b c c d d e 2369 pmaddwd m3, m10 ; efgh2 2370 pmaddwd m2, m11 ; efgh3 2371 paddd m1, m3 2372 paddd m1, m2 2373 psrad m0, 6 2374 psrad m1, 6 2375 packssdw m0, m1 2376 pxor m1, m1 2377 pminsw m0, m5 2378 pmaxsw m0, m1 2379 mova [dstq+r6*2], m0 2380 add r6, 8 2381 jl .h_w8_loop 2382 add srcq, ssq 2383 add dstq, dsq 2384 dec hd 2385 jg .h_w8_loop0 2386 RET 2387.hv: 2388 RESET_STACK_STATE 2389%if ARCH_X86_32 2390 movd m4, r8m 2391 pshufb m4, [base+pw_256] 2392%else 2393%if WIN64 2394 ALLOC_STACK 16*6, 16 2395%endif 2396 movd m15, r8m 2397 pshufb m15, [base+pw_256] 2398%endif 2399 cmp wd, 4 2400 jg .hv_w8 2401 movzx mxd, mxb 2402 je .hv_w4 2403 movq m0, [base+subpel_filters+mxq*8] 2404 movzx mxd, myb 2405 shr myd, 16 2406 cmp hd, 6 2407 cmovb myd, mxd 2408 movq m3, [base+subpel_filters+myq*8] 2409 movddup m6, [base+pd_8704] 2410 pshuflw m0, m0, q2121 2411 pxor m7, m7 2412 punpcklbw m7, m0 2413 punpcklbw m3, m3 2414 psraw m3, 8 ; sign-extend 2415 test dword r8m, 0x800 2416 jz .hv_w2_10bpc 2417 movddup m6, [base+pd_2560] 2418 psraw m7, 2 2419 psllw m3, 2 2420.hv_w2_10bpc: 2421%if ARCH_X86_32 2422 mov dstq, dstmp 2423 mov dsq, dsmp 2424 mova m5, [base+spel_h_shuf2] 2425 ALLOC_STACK -16*8 2426 pshufd m0, m3, q0000 2427 pshufd m1, m3, q1111 2428 pshufd m2, m3, q2222 2429 pshufd m3, m3, q3333 2430 mova m9, m5 2431 mova m11, m0 2432 mova m12, m1 2433 mova m13, m2 2434 mova m14, m3 2435 mova m15, m4 2436%else 2437 mova m9, [base+spel_h_shuf2] 2438 pshufd m11, m3, q0000 2439 pshufd m12, m3, q1111 2440 pshufd m13, m3, q2222 2441 pshufd m14, m3, q3333 2442%endif 2443 lea r6, [ssq*3] 2444 sub srcq, 2 2445 sub srcq, r6 2446 movu m2, [srcq+ssq*0] 2447 movu m3, [srcq+ssq*1] 2448 movu m1, [srcq+ssq*2] 2449 add srcq, r6 2450 movu m4, [srcq+ssq*0] 2451%if ARCH_X86_32 2452 REPX {pshufb x, m5}, m2, m3, m1, m4 2453%else 2454 REPX {pshufb x, 
m9}, m2, m3, m1, m4
%endif
    REPX      {pmaddwd x, m7}, m2, m3, m1, m4
    phaddd    m2, m3 ; 0 1
    phaddd    m1, m4 ; 2 3
    movu      m3, [srcq+ssq*1]
    movu      m4, [srcq+ssq*2]
    add       srcq, r6
    movu      m0, [srcq+ssq*0]
%if ARCH_X86_32
    REPX      {pshufb x, m5}, m3, m4, m0
%else
    REPX      {pshufb x, m9}, m3, m4, m0
%endif
    REPX      {pmaddwd x, m7}, m3, m4, m0
    phaddd    m3, m4 ; 4 5
    phaddd    m0, m0 ; 6 6
    REPX      {paddd x, m6}, m2, m1, m3, m0
    REPX      {psrad x, 10}, m2, m1, m3, m0
    packssdw  m2, m1 ; 0 1 2 3
    packssdw  m3, m0 ; 4 5 6 _
    palignr   m4, m3, m2, 4 ; 1 2 3 4
    pshufd    m5, m3, q0321 ; 5 6 _ _
    punpcklwd m1, m2, m4 ; 01 12
    punpckhwd m2, m4     ; 23 34
    punpcklwd m3, m5     ; 45 56
.hv_w2_loop:
    movu      m4, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movu      m5, [srcq+ssq*0]
    pshufb    m4, m9
    pshufb    m5, m9
    pmaddwd   m4, m7
    pmaddwd   m5, m7
    phaddd    m4, m5
    pmaddwd   m5, m11, m1 ; a0 b0
    mova      m1, m2
    pmaddwd   m2, m12     ; a1 b1
    paddd     m5, m2
    mova      m2, m3
    pmaddwd   m3, m13     ; a2 b2
    paddd     m5, m3
    paddd     m4, m6
    psrad     m4, 10      ; 7 8
    packssdw  m0, m4
    pshufd    m3, m0, q2103
    punpckhwd m3, m0      ; 67 78
    mova      m0, m4
    pmaddwd   m4, m14, m3 ; a3 b3
    paddd     m5, m4
    psrad     m5, 10
    packssdw  m5, m5
    pxor      m4, m4
    pminsw    m5, m15
    pmaxsw    m5, m4
    movd      [dstq+dsq*0], m5
    pshuflw   m5, m5, q3232
    movd      [dstq+dsq*1], m5
    lea       dstq, [dstq+dsq*2]
    sub       hd, 2
    jg .hv_w2_loop
    RET
.hv_w8:
    shr       mxd, 16
.hv_w4:
    movq      m2, [base+subpel_filters+mxq*8]
    movzx     mxd, myb
    shr       myd, 16
    cmp       hd, 6
    cmovb     myd, mxd
    movq      m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    RESET_STACK_STATE
    mov       dstq, dstmp
    mov       dsq, dsmp
    mova      m0, [base+spel_h_shufA]
    mova      m1, [base+spel_h_shufB]
    mova      m6, [base+pd_512]
    ALLOC_STACK -16*15
    mova      m8, m0
    mova      m9, m1
    mova      m14, m6
%else
    mova      m8, [base+spel_h_shufA]
    mova      m9, [base+spel_h_shufB]
%endif
    pxor      m0, m0
    punpcklbw m0, m2
    punpcklbw m3, m3
    psraw     m3, 8
    test      dword r8m, 0x800
    jz .hv_w4_10bpc
    psraw     m0, 2
    psllw     m3, 2
.hv_w4_10bpc:
    lea       r6, [ssq*3]
    sub       srcq, 6
    sub       srcq, r6
%if ARCH_X86_32
 %define tmp esp+16*8
    shl       wd, 14
%if STACK_ALIGNMENT < 16
    mov       [esp+4*61], srcq
    mov       [esp+4*62], dstq
%else
    mov       srcmp, srcq
%endif
    mova      [tmp+16*5], m4
    lea       wd, [wq+hq-(1<<16)]
    pshufd    m1, m0, q0000
    pshufd    m2, m0, q1111
    pshufd    m5, m0, q2222
    pshufd    m0, m0, q3333
    mova      m10, m1
    mova      m11, m2
    mova      m12, m5
    mova      m13, m0
%else
%if WIN64
 %define tmp rsp
%else
 %define tmp rsp-104 ; red zone
%endif
    shl       wd, 6
    mov       r7, srcq
    mov       r8, dstq
    lea       wd, [wq+hq-(1<<8)]
    pshufd    m10, m0, q0000
    pshufd    m11, m0, q1111
    pshufd    m12, m0, q2222
    pshufd    m13, m0, q3333
    mova      [tmp+16*5], m15
%endif
    pshufd    m0, m3, q0000
    pshufd    m1, m3, q1111
    pshufd    m2, m3, q2222
    pshufd    m3, m3, q3333
    mova      [tmp+16*1], m0
    mova      [tmp+16*2], m1
    mova      [tmp+16*3], m2
    mova      [tmp+16*4], m3
%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
    pshufb    m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
    pshufb    m%1, m9      ; 2 3 3 4 4 5 5 6
    pmaddwd   m%3, m10
    pmaddwd   m%1, m11
    paddd     m%3, %5
    paddd     m%1, m%3
    pshufb    m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
    pshufb    m%2, m9      ; 6 7 7 8 8 9 9 a
    pmaddwd   m%3, m12
    pmaddwd   m%2, m13
    paddd     m%1, m%3
    paddd     m%1, m%2
    psrad     m%1, %4
%endmacro
.hv_w4_loop0:
%if ARCH_X86_64
    mova      m14, [pd_512]
%endif
    movu      m4, [srcq+ssq*0+0]
    movu      m1, [srcq+ssq*0+8]
    movu      m5, [srcq+ssq*1+0]
    movu      m2, [srcq+ssq*1+8]
    movu      m6, [srcq+ssq*2+0]
    movu      m3, [srcq+ssq*2+8]
    add       srcq, r6
    PUT_8TAP_HV_H 4, 1, 0, 10
    PUT_8TAP_HV_H 5, 2, 0, 10
    PUT_8TAP_HV_H 6, 3, 0, 10
    movu      m7, [srcq+ssq*0+0]
    movu      m2, [srcq+ssq*0+8]
    movu      m1, [srcq+ssq*1+0]
    movu      m3, [srcq+ssq*1+8]
    PUT_8TAP_HV_H 7, 2, 0, 10
    PUT_8TAP_HV_H 1, 3, 0, 10
    movu      m2, [srcq+ssq*2+0]
    movu      m3, [srcq+ssq*2+8]
    add       srcq, r6
    PUT_8TAP_HV_H 2, 3, 0, 10
    packssdw  m4, m7 ; 0 3
    packssdw  m5, m1 ; 1 4
    movu      m0, [srcq+ssq*0+0]
    movu      m1, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 1, 3, 10
    packssdw  m6, m2 ; 2 5
    packssdw  m7, m0 ; 3 6
    punpcklwd m1, m4, m5 ; 01
    punpckhwd m4, m5     ; 34
    punpcklwd m2, m5, m6 ; 12
    punpckhwd m5, m6     ; 45
    punpcklwd m3, m6, m7 ; 23
    punpckhwd m6, m7     ; 56
%if ARCH_X86_32
    jmp .hv_w4_loop_start
.hv_w4_loop:
    mova      m1, [tmp+16*6]
    mova      m2, m15
.hv_w4_loop_start:
    mova      m7, [tmp+16*1]
    pmaddwd   m1, m7 ; a0
    pmaddwd   m2, m7 ; b0
    mova      m7, [tmp+16*2]
    mova      [tmp+16*6], m3
    pmaddwd   m3, m7 ; a1
    mova      m15, m4
    pmaddwd   m4, m7 ; b1
    mova      m7, [tmp+16*3]
    paddd     m1, m3
    paddd     m2, m4
    mova      m3, m5
    pmaddwd   m5, m7 ; a2
    mova      m4, m6
    pmaddwd   m6, m7 ; b2
    paddd     m1, m5
    paddd     m2, m6
    movu      m7, [srcq+ssq*1+0]
    movu      m5, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H 7, 5, 6, 10
    packssdw  m0, m7 ; 6 7
    mova      [tmp+16*0], m0
    movu      m0, [srcq+ssq*0+0]
    movu      m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 5, 6, 10
    mova      m6, [tmp+16*0]
    packssdw  m7, m0 ; 7 8
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7     ; 78
    pmaddwd   m7, m5, [tmp+16*4]
    paddd     m1, m7 ; a3
    pmaddwd   m7, m6, [tmp+16*4]
    paddd     m2, m7 ; b3
    psrad     m1, 9
    psrad     m2, 9
    packssdw  m1, m2
    pxor      m7, m7
    pmaxsw    m1, m7
    pavgw     m7, m1
    pminsw    m7, [tmp+16*5]
    movq      [dstq+dsq*0], m7
    movhps    [dstq+dsq*1], m7
    lea       dstq, [dstq+dsq*2]
    sub       hd, 2
    jg .hv_w4_loop
%if STACK_ALIGNMENT < 16
    mov       srcq, [esp+4*61]
    mov       dstq, [esp+4*62]
    add       srcq, 8
    add       dstq, 8
    mov       [esp+4*61], srcq
    mov       [esp+4*62], dstq
%else
    mov       srcq, srcmp
    mov       dstq, dstmp
    add       srcq, 8
    add       dstq, 8
    mov       srcmp, srcq
    mov       dstmp, dstq
%endif
    movzx     hd, ww
    sub       wd, 1<<16
%else
.hv_w4_loop:
    mova      m15, [tmp+16*1]
    pmaddwd   m14, m15, m1 ; a0
    pmaddwd   m15, m2      ; b0
    mova      m7, [tmp+16*2]
    mova      m1, m3
    pmaddwd   m3, m7 ; a1
    mova      m2, m4
    pmaddwd   m4, m7 ; b1
    mova      m7, [tmp+16*3]
    paddd     m14, m3
    paddd     m15, m4
    mova      m3, m5
    pmaddwd   m5, m7 ; a2
    mova      m4, m6
    pmaddwd   m6, m7 ; b2
    paddd     m14, m5
    paddd     m15, m6
    movu      m7, [srcq+ssq*1+0]
    movu      m5, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512]
    packssdw  m0, m7 ; 6 7
    mova      [tmp+16*0], m0
    movu      m0, [srcq+ssq*0+0]
    movu      m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512]
    mova      m6, [tmp+16*0]
    packssdw  m7, m0 ; 7 8
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7     ; 78
    pmaddwd   m7, m5, [tmp+16*4]
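    ; Final vertical tap pair (a3/b3). The total vertical shift is 10 bits:
    ; psrad by 9 below plus pavgw against zero, which adds the last rounding
    ; bit after negatives have been clamped to zero by pmaxsw.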
    paddd     m14, m7 ; a3
    pmaddwd   m7, m6, [tmp+16*4]
    paddd     m15, m7 ; b3
    psrad     m14, 9
    psrad     m15, 9
    packssdw  m14, m15
    pxor      m7, m7
    pmaxsw    m14, m7
    pavgw     m7, m14
    pminsw    m7, [tmp+16*5]
    movq      [dstq+dsq*0], m7
    movhps    [dstq+dsq*1], m7
    lea       dstq, [dstq+dsq*2]
    sub       hd, 2
    jg .hv_w4_loop
    add       r7, 8
    add       r8, 8
    movzx     hd, wb
    mov       srcq, r7
    mov       dstq, r8
    sub       wd, 1<<8
%endif
    jg .hv_w4_loop0
    RET
%undef tmp

%if ARCH_X86_32
DECLARE_REG_TMP 2, 1, 6, 4
%elif WIN64
DECLARE_REG_TMP 6, 4, 7, 4
%else
DECLARE_REG_TMP 6, 7, 7, 8
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR

cglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
 %define base t2-prep_ssse3
%if ARCH_X86_32
 %define mxb r0b
 %define mxd r0
 %define mxq r0
 %define myb r2b
 %define myd r2
 %define myq r2
%endif
    imul      mxd, mxm, 0x010101
    add       mxd, t0d ; 6tap_h, mx, 4tap_h
    imul      myd, mym, 0x010101
    add       myd, t1d ; 6tap_v, my, 4tap_v
    LEA       t2, prep_ssse3
    movifnidn wd, wm
    movifnidn hd, hm
    movifnidn srcq, srcmp
    test      mxd, 0xf00
    jnz .h
    test      myd, 0xf00
    jnz .v
.prep:
    tzcnt     wd, wd
    mov       myd, r7m ; bitdepth_max
    movzx     wd, word [base+prep_ssse3_table+wq*2]
    mova      m5, [base+pw_8192]
    shr       myd, 11
    add       wq, t2
    movddup   m4, [base+prep_mul+myq*8]
    movifnidn ssq, ssmp
    movifnidn tmpq, tmpmp
    lea       r6, [ssq*3]
%if WIN64
    pop       r7
%endif
    jmp wq
.h:
    RESET_STACK_STATE
    test      myd, 0xf00
    jnz .hv
    movifnidn ssq, r2mp
    movddup   m5, [base+prep_8tap_1d_rnd]
    cmp       wd, 4
    je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4
    WIN64_SPILL_XMM 10
    shr       mxd, 16
    movq      m2, [base+subpel_filters+1+mxq*8]
    movifnidn tmpq, r0mp
    mova      m4, [base+spel_h_shufA]
    add       wd, wd
    mova      m6, [base+spel_h_shufB]
    add       srcq, wq
    punpcklbw m2, m2
    add       tmpq, wq
    psraw     m2, 8
    neg       wq
    test      dword r7m, 0x800
    jnz .h_w8_12bpc
    psllw     m2, 2
.h_w8_12bpc:
    pshufd    m7, m2, q0000
%if ARCH_X86_32
    ALLOC_STACK -16*2
 %define m8 [rsp+16*0]
 %define m9 [rsp+16*1]
    pshufd    m0, m2, q1111
    pshufd    m1, m2, q2222
    mova      m8, m0
    mova      m9, m1
%else
    pshufd    m8, m2, q1111
    pshufd    m9, m2, q2222
%endif
.h_w8_loop0:
    mov       r6, wq
.h_w8_loop:
    movu      m3, [srcq+r6-4]
    movu      m2, [srcq+r6+8]
    pshufb    m0, m3, m4 ; 01 12 23 34
    pmaddwd   m0, m7     ; abcd0
    pshufb    m3, m6     ; 23 34 45 56
    pmaddwd   m1, m8, m3 ; abcd1
    paddd     m0, m1
    pshufb    m1, m2, m4 ; 67 78 89 9a
    shufpd    m3, m1, 0x01 ; 45 56 67 78
    pmaddwd   m1, m8     ; efgh1
    pshufb    m2, m6     ; 89 9a ab bc
    pmaddwd   m2, m9     ; efgh2
    paddd     m1, m2
    pmaddwd   m2, m9, m3 ; abcd2
    pmaddwd   m3, m7     ; efgh0
    paddd     m0, m5
    paddd     m1, m5
    paddd     m0, m2
    paddd     m1, m3
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    mova      [tmpq+r6], m0
    add       r6, 16
    jl .h_w8_loop
    add       srcq, ssq
    sub       tmpq, wq
    dec       hd
    jg .h_w8_loop0
    RET
.v:
    movzx     mxd, myb
    shr       myd, 16
    cmp       hd, 6
    cmovb     myd, mxd
    movddup   m5, [base+prep_8tap_1d_rnd]
    movq      m2, [base+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 11, 16
    movifnidn ssq, r2mp
    movifnidn tmpq, r0mp
    punpcklbw m2, m2
    sub       srcq, ssq
    psraw     m2, 8 ; sign-extend
    test      dword r7m, 0x800
    jnz .v_12bpc
    psllw     m2, 2
.v_12bpc:
    sub       srcq, ssq
%if ARCH_X86_32
    ALLOC_STACK -16*4
    pshufd    m0, m2, q0000
    mov       r6d, wd
    pshufd    m1, m2, q1111
    shl       r6d, 14
    pshufd    m2, m2, q2222
    lea       r6d, [r6+hq-(1<<16)]
    mova      m8, m0
    mova      m9, m1
    mova      m10, m2
%if STACK_ALIGNMENT < 16
 %define srcmp [esp+16*3+4*0]
 %define tmpmp [esp+16*3+4*1]
%endif
.v_w4_loop0:
    mov       srcmp, srcq
    mov       tmpmp, tmpq
%else
    pshufd    m8, m2, q0000
    and       wd, -8
    jnz .v_w8
    pshufd    m9, m2, q1111
    pshufd    m10, m2, q2222
%endif
    movq      m1, [srcq+ssq*0]
    movq      m2, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movq      m3, [srcq+ssq*0]
    movq      m4, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movq      m0, [srcq+ssq*0]
    punpcklwd m1, m2 ; 01
    punpcklwd m2, m3 ; 12
    punpcklwd m3, m4 ; 23
    punpcklwd m4, m0 ; 34
.v_w4_loop:
    pmaddwd   m6, m8, m1 ; a0
    pmaddwd   m7, m8, m2 ; b0
    mova      m1, m3
    pmaddwd   m3, m9 ; a1
    mova      m2, m4
    pmaddwd   m4, m9 ; b1
    paddd     m6, m3
    movq      m3, [srcq+ssq*0]
    paddd     m7, m4
    movq      m4, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movq      m0, [srcq+ssq*0]
    punpcklwd m3, m4 ; 45
    punpcklwd m4, m0 ; 56
    pmaddwd   m0, m10, m3 ; a2
    paddd     m6, m5
    paddd     m6, m0
    pmaddwd   m0, m10, m4 ; b2
    paddd     m7, m5
    paddd     m7, m0
    psrad     m6, 4
    psrad     m7, 4
    packssdw  m6, m7
%if ARCH_X86_32
    movq      [tmpq+wq*0], m6
    movhps    [tmpq+wq*2], m6
    lea       tmpq, [tmpq+wq*4]
    sub       hd, 2
    jg .v_w4_loop
    mov       srcq, srcmp
    mov       tmpq, tmpmp
    movzx     hd, r6w
    add       srcq, 8
    add       tmpq, 8
    sub       r6d, 1<<16
    jg .v_w4_loop0
    RET
%else
    mova      [tmpq], m6
    add       tmpq, 16
    sub       hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    mova      r6m, m8
    lea       r6d, [wq*4-(1<<5)]
    pshufd    m6, m2, q1111
    lea       r6d, [hq+r6*8]
    pshufd    m7, m2, q2222
    WIN64_PUSH_XMM 16
.v_w8_loop0:
    movu      m9, [srcq+ssq*0]
    lea       r5, [srcq+ssq*2]
    movu      m11, [srcq+ssq*1]
    mov       r7, tmpq
    movu      m13, [r5+ssq*0]
    movu      m15, [r5+ssq*1]
    lea       r5, [r5+ssq*2]
    movu      m4, [r5+ssq*0]
    punpcklwd m8, m9, m11   ; 01
    punpckhwd m9, m11
    punpcklwd m10, m11, m13 ; 12
    punpckhwd m11, m13
    punpcklwd m12, m13, m15 ; 23
    punpckhwd m13, m15
    punpcklwd m14, m15, m4  ; 34
    punpckhwd m15, m4
.v_w8_loop:
    mova      m3, r6m
    pmaddwd   m0, m8, m3  ; a0
    pmaddwd   m2, m9, m3  ; a0'
    pmaddwd   m1, m10, m3 ; b0
    pmaddwd   m3, m11     ; b0'
    mova      m8, m12
    pmaddwd   m12, m6 ; a1
    mova      m9, m13
    pmaddwd   m13, m6 ; a1'
    mova      m10, m14
    pmaddwd   m14, m6 ; b1
    mova      m11, m15
    pmaddwd   m15, m6 ; b1'
    paddd     m0, m12
    paddd     m2, m13
    movu      m13, [r5+ssq*0]
    paddd     m1, m14
    paddd     m3, m15
    movu      m15, [r5+ssq*1]
    lea       r5, [r5+ssq*2]
    movu      m4, [r5+ssq*0]
    REPX      {paddd x, m5}, m0, m2, m1, m3
    punpcklwd m12, m13, m15 ; 45
    punpckhwd m13, m15
    punpcklwd m14, m15, m4  ; 56
    punpckhwd m15, m4
    pmaddwd   m4, m7, m12 ; a2
    paddd     m0, m4
    pmaddwd   m4, m7, m13 ; a2'
    paddd     m2, m4
    pmaddwd   m4, m7, m14 ; b2
    paddd     m1, m4
    pmaddwd   m4, m7, m15 ; b2'
    paddd     m3, m4
    REPX      {psrad x, 4}, m0, m2, m1, m3
    packssdw  m0, m2
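    ; Pack the four 32-bit accumulators back to words; the -8192
    ; intermediate bias is already folded into prep_8tap_1d_rnd, so the
    ; prep buffer is stored without any further clamping.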
    packssdw  m1, m3
    mova      [r7+wq*0], m0
    mova      [r7+wq*2], m1
    lea       r7, [r7+wq*4]
    sub       hd, 2
    jg .v_w8_loop
    add       srcq, 16
    add       tmpq, 16
    movzx     hd, r6b
    sub       r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif
.hv:
    and       wd, -8
    jnz .hv_w8
    movzx     mxd, mxb
    movq      m0, [base+subpel_filters+mxq*8]
    movzx     mxd, myb
    shr       myd, 16
    cmp       hd, 6
    cmovb     myd, mxd
    movq      m2, [base+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 15
    movifnidn ssq, r2mp
    movifnidn tmpq, r0mp
    mova      m7, [base+prep_8tap_2d_rnd]
    sub       srcq, 2
    pshuflw   m0, m0, q2121
    pxor      m6, m6
    punpcklbw m6, m0
    punpcklbw m2, m2
    psraw     m6, 4
    psraw     m2, 8
    test      dword r7m, 0x800
    jz .hv_w4_10bpc
    psraw     m6, 2
.hv_w4_10bpc:
%if ARCH_X86_32
 %assign regs_used 4
    ALLOC_STACK -16*7
 %assign regs_used 7
 %define m10 [esp+16*3]
 %define m12 [esp+16*5]
 %define m13 [esp+16*6]
 %define m14 [base+spel_h_shufA]
 %define m11 [base+spel_h_shufB]
    pshufd    m0, m2, q0000
    pshufd    m1, m2, q1111
    pshufd    m2, m2, q2222
    pshufd    m5, m6, q0000
    pshufd    m6, m6, q1111
    mova      m8, m0
    mova      m9, m1
    mova      m10, m2
    mova      m12, m5
    mova      m13, m6
    neg       ssq
    movu      m3, [srcq+ssq*2]
    movu      m4, [srcq+ssq*1]
    neg       ssq
%else
    mov       r6, ssq
    pshufd    m8, m2, q0000
    neg       r6
    pshufd    m9, m2, q1111
    movu      m3, [srcq+r6*2]
    pshufd    m10, m2, q2222
    movu      m4, [srcq+r6*1]
    pshufd    m12, m6, q0000
    mova      m14, [base+spel_h_shufA]
    pshufd    m13, m6, q1111
    mova      m11, [base+spel_h_shufB]
%endif
    movu      m1, [srcq+ssq*0]
    movu      m0, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movu      m2, [srcq+ssq*0]
    HV_H_W4_6TAP m3, m3, m5, m11
    HV_H_W4_6TAP m4, m4, m5, m11
    HV_H_W4_6TAP m5, m1, m5, m11
    HV_H_W4_6TAP m0, m0, m1, m11
    HV_H_W4_6TAP m2, m2, m1, m11
    REPX      {psrad x, 6}, m3, m5, m4, m0, m2
    packssdw  m3, m5 ; 0 2
    packssdw  m4, m0 ; 1 3
    packssdw  m5, m2 ; 2 4
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4     ; 23
    punpcklwd m2, m4, m5 ; 12
    punpckhwd m4, m5     ; 34
.hv_w4_loop:
    movu      m0, [srcq+ssq*1]
    pmaddwd   m5, m8, m1 ; a0
    lea       srcq, [srcq+ssq*2]
    pmaddwd   m6, m8, m2 ; b0
    mova      m1, m3
    pmaddwd   m3, m9 ; a1
    mova      m2, m4
    pmaddwd   m4, m9 ; b1
    paddd     m5, m3
    movu      m3, [srcq+ssq*0]
    paddd     m6, m4
    HV_H_W4_6TAP m0, m0, m4, m11
    HV_H_W4_6TAP m3, m3, m4, m11
    psrad     m4, m2, 16
    psrad     m0, 6
    psrad     m3, 6
    packssdw  m4, m0 ; 4 5
    packssdw  m0, m3 ; 5 6
    punpcklwd m3, m4, m0 ; 45
    punpckhwd m4, m0     ; 56
    pmaddwd   m0, m10, m3 ; a2
    paddd     m5, m7
    paddd     m5, m0
    pmaddwd   m0, m10, m4 ; b2
    paddd     m6, m7
    paddd     m6, m0
    psrad     m5, 6
    psrad     m6, 6
    packssdw  m5, m6
    mova      [tmpq], m5
    add       tmpq, 16
    sub       hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    RESET_STACK_STATE
    shr       mxd, 16
    movq      m2, [base+subpel_filters+1+mxq*8]
    movzx     mxd, myb
    shr       myd, 16
    cmp       hd, 6
    cmovb     myd, mxd
    movq      m1, [base+subpel_filters+1+myq*8]
    movifnidn ssq, r2mp
    mova      m4, [base+prep_8tap_2d_rnd]
    pxor      m0, m0
    punpcklbw m0, m2
    punpcklbw m1, m1
    sub       srcq, 4
    psraw     m0, 4
    psraw     m1, 8
    test      dword r7m, 0x800
    jz .hv_w8_10bpc
    psraw     m0, 2
.hv_w8_10bpc:
%if ARCH_X86_32
 %assign regs_used 1
    ALLOC_STACK -16*9
 %assign regs_used 7
    mov       tmpq, r0mp
    mova      [rsp+16*7], m4
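    ; x86-32 only has m0-m7, so prep_8tap_2d_rnd is kept spilled at
    ; [rsp+16*7] and reloaded inside the loops below.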
%else
%if WIN64
    PUSH      r8
 %assign regs_used 9
%endif
    ALLOC_STACK 16*6, 16
%endif
    pshufd    m2, m0, q0000
    mova      [rsp+16*0], m2
    pshufd    m2, m0, q1111
    mova      [rsp+16*1], m2
    pshufd    m0, m0, q2222
    mova      [rsp+16*2], m0
    pshufd    m2, m1, q0000
    mova      [rsp+16*3], m2
    pshufd    m2, m1, q1111
    mova      [rsp+16*4], m2
    pshufd    m1, m1, q2222
    mova      [rsp+16*5], m1
    mov       r6, ssq
    neg       r6
%if ARCH_X86_32
    mov       r5d, wd
    shl       r5d, 14
    lea       r5d, [r5+hq-(1<<16)]
%if STACK_ALIGNMENT < 16
 %define srcmp [esp+16*8+4*0]
 %define tmpmp [esp+16*8+4*1]
%endif
.hv_w8_loop0:
    mov       srcmp, srcq
    mov       tmpmp, tmpq
    movu      m5, [srcq+r6*2+0]
    movu      m6, [srcq+r6*2+2]
    mova      m7, [rsp+16*0]
    mova      m1, [rsp+16*1]
    mova      m0, [rsp+16*2]
    HV_H_6TAP m2, m5, m6, m7, m1, m0
    movu      m5, [srcq+r6*1+0]
    movu      m6, [srcq+r6*1+2]
    HV_H_6TAP m3, m5, m6, m7, m1, m0
    movu      m5, [srcq+ssq*0+0]
    movu      m6, [srcq+ssq*0+2]
    HV_H_6TAP m4, m5, m6, m7, m1, m0
    movu      m5, [srcq+ssq*1+0]
    movu      m6, [srcq+ssq*1+2]
    lea       srcq, [srcq+ssq*2]
    HV_H_6TAP m0, m5, m6, m7, m1
    movu      m5, [srcq+ssq*0+0]
    movu      m6, [srcq+ssq*0+2]
    HV_H_6TAP m1, m5, m6, m7
    mova      m5, [rsp+16*7]
    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
    REPX      {psrad x, 6 }, m2, m4, m3, m0, m1
    packssdw  m2, m4 ; 0 2
    packssdw  m3, m0 ; 1 3
    packssdw  m4, m1 ; 2 4
    punpcklwd m0, m2, m3 ; 01
    punpckhwd m2, m3     ; 23
    punpcklwd m1, m3, m4 ; 12
    punpckhwd m3, m4     ; 34
.hv_w8_loop:
    mova      m5, [rsp+16*3]
    mova      m6, [rsp+16*4]
    pmaddwd   m4, m0, m5 ; a0
    pmaddwd   m5, m1     ; b0
    mova      m0, m2
    pmaddwd   m2, m6 ; a1
    mova      m1, m3
    pmaddwd   m3, m6 ; b1
    paddd     m4, m2
    movu      m2, [srcq+ssq*1+0]
    paddd     m5, m3
    movu      m3, [srcq+ssq*1+2]
    lea       srcq, [srcq+ssq*2]
    HV_H_6TAP m6, m2, m3
    movu      m2, [srcq+ssq*0+0]
    movu      m3, [srcq+ssq*0+2]
    HV_H_6TAP m7, m2, m3
    mova      m2, [rsp+16*7]
    psrad     m3, m1, 16
    REPX      {paddd x, m2}, m6, m7, m4, m5
    psrad     m6, 6
    psrad     m7, 6
    packssdw  m3, m6 ; 4 5
    packssdw  m6, m7 ; 5 6
    mova      m7, [rsp+16*5]
    punpcklwd m2, m3, m6 ; 45
    punpckhwd m3, m6     ; 56
    pmaddwd   m6, m2, m7 ; a2
    pmaddwd   m7, m3     ; b2
    paddd     m4, m6
    paddd     m5, m7
    psrad     m4, 6
    psrad     m5, 6
    packssdw  m4, m5
    movq      [tmpq+wq*0], m4
    movhps    [tmpq+wq*2], m4
    lea       tmpq, [tmpq+wq*4]
    sub       hd, 2
    jg .hv_w8_loop
    mov       srcq, srcmp
    mov       tmpq, tmpmp
    movzx     hd, r5w
    add       srcq, 8
    add       tmpq, 8
    sub       r5d, 1<<16
%else
    lea       r8d, [wq*4-(1<<5)]
    lea       r8d, [hq+r8*8]
.hv_w8_loop0:
    mova      m5, [spel_h_shufA]
    movu      m0, [srcq+r6*2+ 0]
    mova      m6, [rsp+16*0]
    movu      m1, [srcq+r6*2+ 8]
    mova      m7, [rsp+16*1]
    movu      m2, [srcq+r6*2+16]
    mova      m8, [rsp+16*2]
    HV_H_6TAP m9, m0, m1, m2, 6, m5, m6, m7, m8
    movu      m0, [srcq+r6*1+ 0]
    movu      m1, [srcq+r6*1+ 8]
    movu      m2, [srcq+r6*1+16]
    lea       r5, [srcq+ssq*2]
    HV_H_6TAP m11, m0, m1, m2, 6, m5, m6, m7, m8
    movu      m0, [srcq+ssq*0+ 0]
    movu      m1, [srcq+ssq*0+ 8]
    movu      m2, [srcq+ssq*0+16]
    mov       r7, tmpq
    HV_H_6TAP m13, m0, m1, m2, 6, m5, m6, m7, m8
    movu      m0, [srcq+ssq*1+ 0]
    movu      m1, [srcq+ssq*1+ 8]
    movu      m2, [srcq+ssq*1+16]
    HV_H_6TAP m15, m0, m1, m2, 6, m5, m6, m7, m8
    movu      m0, [r5+ssq*0+ 0]
    movu      m1, [r5+ssq*0+ 8]
    movu      m2, [r5+ssq*0+16]
    HV_H_6TAP m5, m0, m1, m2, 6, m5, m6, m7, m8
    punpcklwd m8, m9, m11   ; 01
    punpckhwd m9, m11
    punpcklwd m10, m11, m13 ; 12
    punpckhwd m11, m13
    punpcklwd m12, m13, m15 ; 23
    punpckhwd m13, m15
    punpcklwd m14, m15, m5  ; 34
    punpckhwd m15, m5
.hv_w8_loop:
    mova      m3, [rsp+16*3]
    mova      m7, [rsp+16*4]
    pmaddwd   m0, m8, m3 ; a0
    mova      m8, m12
    pmaddwd   m2, m9, m3 ; a0'
    mova      m9, m13
    pmaddwd   m1, m10, m3 ; b0
    mova      m10, m14
    pmaddwd   m3, m11 ; b0'
    mova      m11, m15
    REPX      {pmaddwd x, m7}, m12, m13, m14, m15
    movu      m6, [r5+ssq*1+ 0]
    paddd     m0, m12
    movu      m7, [r5+ssq*1+ 8]
    paddd     m2, m13
    movu      m12, [r5+ssq*1+16]
    paddd     m1, m14
    lea       r5, [r5+ssq*2]
    paddd     m3, m15
    HV_H_6TAP m15, m6, m7, m12, 6
    movu      m6, [r5+ssq*0+ 0]
    movu      m7, [r5+ssq*0+ 8]
    movu      m14, [r5+ssq*0+16]
    punpcklwd m12, m5, m15 ; 45
    punpckhwd m13, m5, m15
    HV_H_6TAP m5, m6, m7, m14, 6
    mova      m7, [rsp+16*5]
    REPX      {paddd x, m4}, m0, m2, m1, m3
    punpcklwd m14, m15, m5 ; 56
    punpckhwd m15, m5
    pmaddwd   m6, m12, m7 ; a2
    paddd     m0, m6
    pmaddwd   m6, m13, m7 ; a2'
    paddd     m2, m6
    pmaddwd   m6, m14, m7 ; b2
    pmaddwd   m7, m15     ; b2'
    paddd     m1, m6
    paddd     m3, m7
    REPX      {psrad x, 6}, m0, m2, m1, m3
    packssdw  m0, m2
    packssdw  m1, m3
    mova      [r7+wq*0], m0
    mova      [r7+wq*2], m1
    lea       r7, [r7+wq*4]
    sub       hd, 2
    jg .hv_w8_loop
    add       srcq, 16
    add       tmpq, 16
    movzx     hd, r8b
    sub       r8d, 1<<8
%endif
    jg .hv_w8_loop0
    RET

PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP

cglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
%if ARCH_X86_32
 %define mxb r0b
 %define mxd r0
 %define mxq r0
 %define myb r2b
 %define myd r2
 %define myq r2
 %define m8  [esp+16*0]
 %define m9  [esp+16*1]
 %define m10 [esp+16*2]
 %define m11 [esp+16*3]
 %define m12 [esp+16*4]
 %define m13 [esp+16*5]
 %define m14 [esp+16*6]
 %define m15 [esp+16*7]
%endif
    imul      mxd, mxm, 0x010101
    add       mxd, t0d ; 8tap_h, mx, 4tap_h
    imul      myd, mym, 0x010101
    add       myd, t1d ; 8tap_v, my, 4tap_v
    LEA       t2, prep_ssse3
    movifnidn wd, wm
    movifnidn srcq, srcmp
    test      mxd, 0xf00
    jnz .h
    movifnidn hd, hm
    test      myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep
.v:
    movzx     mxd, myb
    shr       myd, 16
    cmp       hd, 4
    cmove     myd, mxd
    movq      m3, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM 15
    movddup   m7, [base+prep_8tap_1d_rnd]
    movifnidn ssq, r2mp
    movifnidn tmpq, r0mp
    punpcklbw m3, m3
    psraw     m3, 8 ; sign-extend
    test      dword r7m, 0x800
    jnz .v_12bpc
    psllw     m3, 2
.v_12bpc:
%if ARCH_X86_32
    ALLOC_STACK -16*7
    pshufd    m0, m3, q0000
    pshufd    m1, m3, q1111
    pshufd    m2, m3, q2222
    pshufd    m3, m3, q3333
    mova      m8, m0
    mova      m9, m1
    mova      m10, m2
    mova      m11, m3
%else
    pshufd    m8, m3, q0000
    pshufd    m9, m3, q1111
    pshufd    m10, m3, q2222
    pshufd    m11, m3, q3333
%endif
    lea       r6, [ssq*3]
    sub       srcq, r6
    mov       r6d, wd
    shl       wd, 6
    mov       r5, srcq
%if ARCH_X86_64
    mov       r7, tmpq
%elif STACK_ALIGNMENT < 16
    mov       [esp+4*29], tmpq
%endif
    lea       wd, [wq+hq-(1<<8)]
.v_loop0:
    movq      m1, [srcq+ssq*0]
    movq      m2, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movq      m3, [srcq+ssq*0]
    movq      m4, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movq      m5, [srcq+ssq*0]
    movq      m6, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    movq      m0, [srcq+ssq*0]
    punpcklwd m1, m2 ; 01
    punpcklwd m2, m3 ; 12
    punpcklwd m3, m4 ; 23
    punpcklwd m4, m5 ; 34
    punpcklwd m5, m6 ; 45
    punpcklwd m6, m0 ; 56
%if ARCH_X86_32
    jmp .v_loop_start
.v_loop:
    mova      m1, m12
    mova      m2, m13
    mova      m3, m14
.v_loop_start:
    pmaddwd   m1, m8 ; a0
    pmaddwd   m2, m8 ; b0
    mova      m12, m3
    mova      m13, m4
    pmaddwd   m3, m9 ; a1
    pmaddwd   m4, m9 ; b1
    paddd     m1, m3
    paddd     m2, m4
    mova      m14, m5
    mova      m4, m6
    pmaddwd   m5, m10 ; a2
    pmaddwd   m6, m10 ; b2
    paddd     m1, m5
    paddd     m2, m6
    movq      m6, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    punpcklwd m5, m0, m6 ; 67
    movq      m0, [srcq+ssq*0]
    pmaddwd   m3, m11, m5 ; a3
    punpcklwd m6, m0 ; 78
    paddd     m1, m7
    paddd     m1, m3
    pmaddwd   m3, m11, m6 ; b3
    paddd     m2, m7
    paddd     m2, m3
    psrad     m1, 4
    psrad     m2, 4
    packssdw  m1, m2
    movq      [tmpq+r6*0], m1
    movhps    [tmpq+r6*2], m1
    lea       tmpq, [tmpq+r6*4]
    sub       hd, 2
    jg .v_loop
%if STACK_ALIGNMENT < 16
    mov       tmpq, [esp+4*29]
    add       r5, 8
    add       tmpq, 8
    mov       srcq, r5
    mov       [esp+4*29], tmpq
%else
    mov       tmpq, tmpmp
    add       r5, 8
    add       tmpq, 8
    mov       srcq, r5
    mov       tmpmp, tmpq
%endif
%else
.v_loop:
    pmaddwd   m12, m8, m1 ; a0
    pmaddwd   m13, m8, m2 ; b0
    mova      m1, m3
    mova      m2, m4
    pmaddwd   m3, m9 ; a1
    pmaddwd   m4, m9 ; b1
    paddd     m12, m3
    paddd     m13, m4
    mova      m3, m5
    mova      m4, m6
    pmaddwd   m5, m10 ; a2
    pmaddwd   m6, m10 ; b2
    paddd     m12, m5
    paddd     m13, m6
    movq      m6, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    punpcklwd m5, m0, m6 ; 67
    movq      m0, [srcq+ssq*0]
    pmaddwd   m14, m11, m5 ; a3
    punpcklwd m6, m0 ; 78
    paddd     m12, m7
    paddd     m12, m14
    pmaddwd   m14, m11, m6 ; b3
    paddd     m13, m7
    paddd     m13, m14
    psrad     m12, 4
    psrad     m13, 4
    packssdw  m12, m13
    movq      [tmpq+r6*0], m12
    movhps    [tmpq+r6*2], m12
    lea       tmpq, [tmpq+r6*4]
    sub       hd, 2
    jg .v_loop
    add       r5, 8
    add       r7, 8
    mov       srcq, r5
    mov       tmpq, r7
%endif
    movzx     hd, wb
    sub       wd, 1<<8
    jg .v_loop0
    RET
.h:
    RESET_STACK_STATE
    test      myd, 0xf00
    jnz .hv
    movifnidn ssq, r2mp
    movifnidn hd, r4m
    movddup   m5, [base+prep_8tap_1d_rnd]
    cmp       wd, 4
    jne .h_w8
.h_w4:
    movzx     mxd, mxb
    movq      m0, [base+subpel_filters+mxq*8]
    mova      m3, [base+spel_h_shufA]
    mova      m4, [base+spel_h_shufB]
    movifnidn tmpq, tmpmp
    sub       srcq, 2
    WIN64_SPILL_XMM 8
    punpcklbw m0, m0
    psraw     m0, 8
    test      dword r7m, 0x800
    jnz .h_w4_12bpc
    psllw     m0, 2
.h_w4_12bpc:
    pshufd    m6, m0, q1111
    pshufd    m7, m0, q2222
.h_w4_loop:
    movu      m1, [srcq+ssq*0]
    movu      m2, [srcq+ssq*1]
    lea       srcq, [srcq+ssq*2]
    pshufb    m0, m1, m3 ; 0 1 1 2 2 3 3 4
    pshufb    m1, m4     ; 2 3 3 4 4 5 5 6
    pmaddwd   m0, m6
    pmaddwd   m1, m7
    paddd     m0, m5
    paddd     m0, m1
    pshufb    m1, m2, m3
    pshufb    m2, m4
    pmaddwd   m1, m6
    pmaddwd   m2, m7
    paddd     m1, m5
    paddd     m1, m2
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    mova      [tmpq], m0
    add       tmpq, 16
    sub       hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    WIN64_SPILL_XMM 11
    shr       mxd, 16
    movq      m2, [base+subpel_filters+mxq*8]
    mova      m4, [base+spel_h_shufA]
    mova      m6, [base+spel_h_shufB]
    movifnidn tmpq, r0mp
    add       wd, wd
    punpcklbw m2, m2
    add       srcq, wq
    psraw     m2, 8
    add       tmpq, wq
    neg       wq
    test      dword r7m, 0x800
    jnz .h_w8_12bpc
    psllw     m2, 2
.h_w8_12bpc:
    pshufd    m7, m2, q0000
%if ARCH_X86_32
    ALLOC_STACK -16*3
    pshufd    m0, m2, q1111
    pshufd    m1, m2, q2222
    pshufd    m2, m2, q3333
    mova      m8, m0
    mova      m9, m1
    mova      m10, m2
%else
    pshufd    m8, m2, q1111
    pshufd    m9, m2, q2222
    pshufd    m10, m2, q3333
%endif
.h_w8_loop0:
    mov       r6, wq
.h_w8_loop:
    movu      m0, [srcq+r6- 6]
    movu      m1, [srcq+r6+ 2]
    pshufb    m2, m0, m4 ; 0 1 1 2 2 3 3 4
    pshufb    m0, m6     ; 2 3 3 4 4 5 5 6
    pmaddwd   m2, m7     ; abcd0
    pmaddwd   m0, m8     ; abcd1
    pshufb    m3, m1, m4 ; 4 5 5 6 6 7 7 8
    pshufb    m1, m6     ; 6 7 7 8 8 9 9 a
    paddd     m2, m5
    paddd     m0, m2
    pmaddwd   m2, m9, m3 ; abcd2
    pmaddwd   m3, m7     ; efgh0
    paddd     m0, m2
    pmaddwd   m2, m10, m1 ; abcd3
    pmaddwd   m1, m8      ; efgh1
    paddd     m0, m2
    movu      m2, [srcq+r6+10]
    paddd     m3, m5
    paddd     m1, m3
    pshufb    m3, m2, m4 ; 8 9 9 a a b b c
    pshufb    m2, m6     ; a b b c c d d e
    pmaddwd   m3, m9     ; efgh2
    pmaddwd   m2, m10    ; efgh3
    paddd     m1, m3
    paddd     m1, m2
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    mova      [tmpq+r6], m0
    add       r6, 16
    jl .h_w8_loop
    add       srcq, ssq
    sub       tmpq, wq
    dec       hd
    jg .h_w8_loop0
    RET
.hv:
    RESET_STACK_STATE
    movzx     t3d, mxb
    shr       mxd, 16
    cmp       wd, 4
    cmove     mxd, t3d
    movifnidn hd, r4m
    movq      m2, [base+subpel_filters+mxq*8]
    movzx     mxd, myb
    shr       myd, 16
    cmp       hd, 4
    cmove     myd, mxd
    movq      m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    mov       ssq, r2mp
    mov       tmpq, r0mp
    mova      m0, [base+spel_h_shufA]
    mova      m1, [base+spel_h_shufB]
    mova      m4, [base+prep_8tap_2d_rnd]
    ALLOC_STACK -16*14
    mova      m8, m0
    mova      m9, m1
    mova      m14, m4
%else
%if WIN64
    ALLOC_STACK 16*6, 16
%endif
    mova      m8, [base+spel_h_shufA]
    mova      m9, [base+spel_h_shufB]
%endif
    pxor      m0, m0
    punpcklbw m0, m2
    punpcklbw m3, m3
    psraw     m0, 4
    psraw     m3, 8
    test      dword r7m, 0x800
    jz .hv_10bpc
    psraw     m0, 2
.hv_10bpc:
    lea       r6, [ssq*3]
    sub       srcq, 6
    sub       srcq, r6
    mov       r6d, wd
    shl       wd, 6
    mov       r5, srcq
%if ARCH_X86_32
 %define tmp esp+16*8
%if STACK_ALIGNMENT < 16
    mov       [esp+4*61], tmpq
%endif
    pshufd    m1, m0, q0000
    pshufd    m2, m0, q1111
    pshufd    m5, m0, q2222
    pshufd    m0, m0, q3333
    mova      m10, m1
    mova      m11, m2
    mova      m12, m5
    mova      m13, m0
%else
%if WIN64
 %define tmp rsp
%else
 %define tmp rsp-88 ; red zone
%endif
    mov       r7, tmpq
    pshufd    m10, m0, q0000
    pshufd    m11, m0, q1111
    pshufd    m12, m0, q2222
    pshufd    m13, m0, q3333
%endif
    lea       wd, [wq+hq-(1<<8)]
    pshufd    m0, m3, q0000
    pshufd    m1, m3, q1111
    pshufd    m2, m3, q2222
    pshufd    m3, m3, q3333
    mova      [tmp+16*1], m0
    mova      [tmp+16*2], m1
    mova      [tmp+16*3], m2
    mova      [tmp+16*4], m3
.hv_loop0:
%if ARCH_X86_64
    mova      m14, [prep_8tap_2d_rnd]
%endif
    movu      m4, [srcq+ssq*0+0]
    movu      m1, [srcq+ssq*0+8]
    movu      m5, [srcq+ssq*1+0]
    movu      m2, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    movu      m6, [srcq+ssq*0+0]
    movu      m3, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 4, 1, 0, 6
    PUT_8TAP_HV_H 5, 2, 0, 6
    PUT_8TAP_HV_H 6, 3, 0, 6
    movu      m7, [srcq+ssq*1+0]
    movu      m2, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    movu      m1, [srcq+ssq*0+0]
    movu      m3, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 7, 2, 0, 6
    PUT_8TAP_HV_H 1, 3, 0, 6
    movu      m2, [srcq+ssq*1+0]
    movu      m3, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H 2, 3, 0, 6
    packssdw  m4, m7 ; 0 3
    packssdw  m5, m1 ; 1 4
    movu      m0, [srcq+ssq*0+0]
    movu      m1, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 1, 3, 6
    packssdw  m6, m2 ; 2 5
    packssdw  m7, m0 ; 3 6
    punpcklwd m1, m4, m5 ; 01
    punpckhwd m4, m5     ; 34
    punpcklwd m2, m5, m6 ; 12
    punpckhwd m5, m6     ; 45
    punpcklwd m3, m6, m7 ; 23
    punpckhwd m6, m7     ; 56
%if ARCH_X86_32
    jmp .hv_loop_start
.hv_loop:
    mova      m1, [tmp+16*5]
    mova      m2, m15
.hv_loop_start:
    mova      m7, [tmp+16*1]
    pmaddwd   m1, m7 ; a0
    pmaddwd   m2, m7 ; b0
    mova      m7, [tmp+16*2]
    mova      [tmp+16*5], m3
    pmaddwd   m3, m7 ; a1
    mova      m15, m4
    pmaddwd   m4, m7 ; b1
    mova      m7, [tmp+16*3]
    paddd     m1, m14
    paddd     m2, m14
    paddd     m1, m3
    paddd     m2, m4
    mova      m3, m5
    pmaddwd   m5, m7 ; a2
    mova      m4, m6
    pmaddwd   m6, m7 ; b2
    paddd     m1, m5
    paddd     m2, m6
    movu      m7, [srcq+ssq*1+0]
    movu      m5, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H 7, 5, 6, 6
    packssdw  m0, m7 ; 6 7
    mova      [tmp+16*0], m0
    movu      m0, [srcq+ssq*0+0]
    movu      m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 5, 6, 6
    mova      m6, [tmp+16*0]
    packssdw  m7, m0 ; 7 8
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7     ; 78
    pmaddwd   m7, m5, [tmp+16*4]
    paddd     m1, m7 ; a3
    pmaddwd   m7, m6, [tmp+16*4]
    paddd     m2, m7 ; b3
    psrad     m1, 6
    psrad     m2, 6
    packssdw  m1, m2
    movq      [tmpq+r6*0], m1
    movhps    [tmpq+r6*2], m1
    lea       tmpq, [tmpq+r6*4]
    sub       hd, 2
    jg .hv_loop
%if STACK_ALIGNMENT < 16
    mov       tmpq, [esp+4*61]
    add       r5, 8
    add       tmpq, 8
    mov       srcq, r5
    mov       [esp+4*61], tmpq
%else
    mov       tmpq, tmpmp
    add       r5, 8
    add       tmpq, 8
    mov       srcq, r5
    mov       tmpmp, tmpq
%endif
%else
.hv_loop:
    mova      m15, [tmp+16*1]
    mova      m7, [prep_8tap_2d_rnd]
    pmaddwd   m14, m15, m1 ; a0
    pmaddwd   m15, m2      ; b0
    paddd     m14, m7
    paddd     m15, m7
    mova      m7, [tmp+16*2]
    mova      m1, m3
    pmaddwd   m3, m7 ; a1
    mova      m2, m4
    pmaddwd   m4, m7 ; b1
    mova      m7, [tmp+16*3]
    paddd     m14, m3
    paddd     m15, m4
    mova      m3, m5
    pmaddwd   m5, m7 ; a2
    mova      m4, m6
    pmaddwd   m6, m7 ; b2
    paddd     m14, m5
    paddd     m15, m6
    movu      m7, [srcq+ssq*1+0]
    movu      m5, [srcq+ssq*1+8]
    lea       srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
    packssdw  m0, m7 ; 6 7
    mova      [tmp+16*0], m0
    movu      m0, [srcq+ssq*0+0]
    movu      m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
    mova      m6, [tmp+16*0]
    packssdw  m7, m0 ; 7 8
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7     ; 78
    pmaddwd   m7, m5, [tmp+16*4]
    paddd     m14, m7 ; a3
    pmaddwd   m7, m6, [tmp+16*4]
    paddd     m15, m7 ; b3
    psrad     m14, 6
    psrad     m15, 6
    packssdw  m14, m15
    movq      [tmpq+r6*0], m14
    movhps    [tmpq+r6*2], m14
    lea       tmpq, [tmpq+r6*4]
    sub       hd, 2
    jg .hv_loop
    add       r5, 8
    add       r7, 8
    mov       srcq, r5
    mov       tmpq, r7
%endif
    movzx     hd, wb
    sub       wd, 1<<8
    jg .hv_loop0
    RET
%undef tmp

%macro movifprep 2
 %if isprep
    mov       %1, %2
 %endif
%endmacro

%macro SAVE_REG 1
 %xdefine r%1_save  r%1
 %xdefine r%1q_save r%1q
 %xdefine r%1d_save r%1d
 %if ARCH_X86_32
  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
 %endif
%endmacro

%macro LOAD_REG 1
 %xdefine r%1  r%1_save
 %xdefine r%1q r%1q_save
 %xdefine r%1d r%1d_save
 %if ARCH_X86_32
  %define r%1m r%1m_save
 %endif
 %undef r%1d_save
 %undef r%1q_save
 %undef r%1_save
%endmacro

%macro REMAP_REG 2-3
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
 %if ARCH_X86_32
  %if %3 == 0
   %xdefine r%1m r%2m
  %else
   %define r%1m [rstk+stack_offset+(%1+1)*4]
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %if ARCH_X86_64
   SAVE_REG 14
   %assign %%i 14
   %rep 14
    %assign %%j %%i-1
    REMAP_REG %%i, %%j
    %assign %%i %%i-1
   %endrep
  %else
   SAVE_REG 5
   %assign %%i 5
   %rep 5
    %assign %%j %%i-1
    REMAP_REG %%i, %%j, 0
    %assign %%i %%i-1
   %endrep
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %if ARCH_X86_64
   %rep 13
    %assign %%j %%i+1
    REMAP_REG %%i, %%j
    %assign %%i %%i+1
   %endrep
   LOAD_REG 14
  %else
   %rep 4
    %assign %%j %%i+1
    REMAP_REG %%i, %%j, 1
    %assign %%i %%i+1
   %endrep
   LOAD_REG 5
  %endif
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
 RET
 %if %1
  MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

%if ARCH_X86_32
 %macro MC_4TAP_SCALED_H 1 ; dst_mem
    movu      m7, [srcq+ssq*0]
    movu      m2, [srcq+ssq*1]
    movu      m5, [r4 +ssq*0]
    movu      m6, [r4 +ssq*1]
    lea       srcq, [srcq+ssq*2]
    lea       r4, [r4 +ssq*2]
    REPX      {pshufb x, m12}, m7, m2
    REPX      {pmaddwd x, m13}, m7, m2
    REPX      {pshufb x, m14}, m5, m6
    REPX      {pmaddwd x, m15}, m5, m6
    phaddd    m7, m5
    phaddd    m2, m6
    mova      m5, [esp+0x00]
    movd      m6, [esp+0x10]
    paddd     m7, m5
    paddd     m2, m5
    psrad     m7, m6
    psrad     m2, m6
    packssdw  m7, m2
    mova      [stk+%1], m7
 %endmacro
%endif

%if ARCH_X86_64
 %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
    movu      m%1, [srcq+ r4*2]
    movu      m%2, [srcq+ r6*2]
    movu      m%3, [srcq+ r7*2]
    movu      m%4, [srcq+ r9*2]
    movu      m%5, [srcq+r10*2]
    movu      m%6, [srcq+r11*2]
    movu      m%7, [srcq+r13*2]
    movu      m%8, [srcq+ rX*2]
    add       srcq, ssq
    pmaddwd   m%1, [stk+0x10]
    pmaddwd   m%2, [stk+0x20]
    pmaddwd   m%3, [stk+0x30]
    pmaddwd   m%4, [stk+0x40]
    pmaddwd   m%5, [stk+0x50]
    pmaddwd   m%6, [stk+0x60]
    pmaddwd   m%7, [stk+0x70]
    pmaddwd   m%8, [stk+0x80]
    phaddd    m%1, m%2
    phaddd    m%3, m%4
    phaddd    m%5, m%6
    phaddd    m%7, m%8
    phaddd    m%1, m%3
    phaddd    m%5, m%7
    paddd     m%1, hround
    paddd     m%5, hround
    psrad     m%1, m12
    psrad     m%5, m12
    packssdw  m%1, m%5
 %endmacro
%else
 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
  %if %3 == 1
    mov       r0, [stk+ 0]
    mov       rX, [stk+ 4]
    mov       r4, [stk+ 8]
    mov       r5, [stk+12]
  %endif
    movu      m0, [srcq+r0*2]
    movu      m1, [srcq+rX*2]
    movu      m2, [srcq+r4*2]
    movu      m3, [srcq+r5*2]
    mov       r0, [stk+16]
    mov       rX, [stk+20]
    mov       r4, [stk+24]
    mov       r5, [stk+28]
    pmaddwd   m0, [stk+%1+0x00]
    pmaddwd   m1, [stk+%1+0x10]
    pmaddwd   m2, [stk+%1+0x20]
    pmaddwd   m3, [stk+%1+0x30]
    phaddd    m0, m1
    phaddd    m2, m3
    movu      m4, [srcq+r0*2]
    movu      m5, [srcq+rX*2]
    movu      m6, [srcq+r4*2]
    movu      m7, [srcq+r5*2]
    add       srcq, ssq
    pmaddwd   m4, [stk+%1+0xa0]
    pmaddwd   m5, [stk+%1+0xb0]
    pmaddwd   m6, [stk+%1+0xc0]
    pmaddwd   m7, [stk+%1+0xd0]
    phaddd    m4, m5
    phaddd    m6, m7
    phaddd    m0, m2
    phaddd    m4, m6
    paddd     m0, hround
    paddd     m4, hround
    psrad     m0, m12
    psrad     m4, m12
    packssdw  m0, m4
  %if %2 != 0
    mova      [stk+%2], m0
  %endif
 %endmacro
%endif

%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isput 1
 %assign isprep 0
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %else
cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %endif
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %else
cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %endif
 %endif
 %xdefine base_reg r12
%else ; prep
 %assign isput 0
 %assign isprep 1
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   %xdefine tmp_stridem r14q
  %else
cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   %define tmp_stridem qword [stk+0x138]
  %endif
  %xdefine base_reg r11
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
  %else
cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
  %endif
  %define tmp_stridem dword [stk+0x138]
 %endif
%endif
%if ARCH_X86_32
    mov       [esp+0x1f0], t0d
    mov       [esp+0x1f4], t1d
 %if isput && required_stack_alignment > STACK_ALIGNMENT
    mov       dstd, dstm
    mov       dsd, dsm
    mov       srcd, srcm
    mov       ssd, ssm
    mov       hd, hm
    mov       r4, mxm
  %define r0m [esp+0x200]
  %define dsm [esp+0x204]
  %define dsmp dsm
  %define r1m dsm
  %define r2m [esp+0x208]
  %define ssm [esp+0x20c]
  %define r3m ssm
  %define hm  [esp+0x210]
  %define mxm [esp+0x214]
    mov       r0m, dstd
    mov       dsm, dsd
    mov       r2m, srcd
    mov       ssm, ssd
    mov       hm, hd
    mov       r0, mym
    mov       r1, dxm
    mov       r2, dym
  %define mym [esp+0x218]
  %define dxm [esp+0x21c]
  %define dym [esp+0x220]
    mov       mxm, r4
    mov       mym, r0
    mov       dxm, r1
    mov       dym, r2
    tzcnt     wd, wm
 %endif
 %if isput
    mov       r3, pxmaxm
  %define pxmaxm r3
 %else
    mov       r2, pxmaxm
 %endif
 %if isprep && required_stack_alignment > STACK_ALIGNMENT
  %xdefine base_reg r5
 %else
  %xdefine base_reg r6
 %endif
%endif
    LEA       base_reg, %1_8tap_scaled_16bpc_ssse3
%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
    tzcnt     wd, wm
%endif
%if ARCH_X86_64
 %if isput
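    ; pxmax (0x3ff for 10-bit, 0xfff for 12-bit) doubles as the bitdepth
    ; selector: r7d >> 11 below indexes the per-bitdepth rounding tables.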
    mov       r7d, pxmaxm
 %endif
%else
 %define m8  m0
 %define m9  m1
 %define m14 m4
 %define m15 m3
%endif
    movd      m8, dxm
    movd      m14, mxm
%if isput
    movd      m15, pxmaxm
%endif
    pshufd    m8, m8, q0000
    pshufd    m14, m14, q0000
%if isput
    pshuflw   m15, m15, q0000
    punpcklqdq m15, m15
%endif
%if isprep
 %if UNIX64
    mov       r5d, t0d
    DECLARE_REG_TMP 5, 7
 %endif
 %if ARCH_X86_64
    mov       r6d, pxmaxm
 %endif
%endif
%if ARCH_X86_64
    mov       dyd, dym
%endif
%if isput
 %if WIN64
    mov       r8d, hm
    DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm  r5m
  %define dxm r8m
 %elif ARCH_X86_64
    DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm r6m
 %else
 %endif
 %if ARCH_X86_64
  %if required_stack_alignment > STACK_ALIGNMENT
   %define dsm [rsp+0x138]
   %define rX  r1
   %define rXd r1d
  %else
   %define dsm dsq
   %define rX  r14
   %define rXd r14d
  %endif
 %else
  %define rX r1
 %endif
%else ; prep
 %if WIN64
    mov       r7d, hm
    DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm  r4m
  %define dxm r7m
 %elif ARCH_X86_64
    DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %xdefine hm r7m
 %endif
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %if ARCH_X86_64
  %define rX  r14
  %define rXd r14d
 %else
  %define rX r3
 %endif
%endif
%if ARCH_X86_64
    shr       r7d, 11
    mova      m10, [base+pd_0x3ff]
    movddup   m11, [base+s_8tap_h_rnd+r7*8]
    movd      m12, [base+s_8tap_h_sh+r7*4]
 %if isput
    movddup   m13, [base+put_s_8tap_v_rnd+r7*8]
    movd      m7, [base+put_s_8tap_v_sh+r7*4]
  %define pxmaxm [rsp]
    mova      pxmaxm, m15
    punpcklqdq m12, m7
 %endif
    lea       ss3q, [ssq*3]
    movzx     r7d, t1b
    shr       t1d, 16
    cmp       hd, 6
    cmovs     t1d, r7d
    sub       srcq, ss3q
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [esp+0x00]
 %define m12 [esp+0x10]
    shr       r3, 11
    movddup   m1, [base+s_8tap_h_rnd+r3*8]
    movd      m2, [base+s_8tap_h_sh+r3*4]
 %if isput
  %define m13    [esp+0x20]
  %define pxmaxm [esp+0x30]
  %define stk    esp+0x40
    movddup   m5, [base+put_s_8tap_v_rnd+r3*8]
    movd      m6, [base+put_s_8tap_v_sh+r3*4]
    mova      pxmaxm, m15
    punpcklqdq m2, m6
    mova      m13, m5
 %else
  %define m13 [base+pd_m524256]
 %endif
    mov       ssd, ssm
    mova      m11, m1
    mova      m12, m2
 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    mov       r1, [esp+0x1f4]
    lea       r0, [ssd*3]
    movzx     r2, r1b
    shr       r1, 16
    cmp       dword hm, 6
    cmovs     r1, r2
    mov       [esp+0x1f4], r1
 %if isprep
    mov       r1, r1m
 %endif
    mov       r2, r2m
    sub       srcq, r0
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define ss3q r0
 %define myd  r4
 %define dyd  dword dym
 %define hd   dword hm
%endif
    cmp       dyd, 1024
    je .dy1
    cmp       dyd, 2048
    je .dy2
    movzx     wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
    add       wq, base_reg
    jmp wq
%if isput
.w2:
 %if ARCH_X86_64
    mov       myd, mym
    movzx     t0d, t0b
    sub       srcq, 2
    movd      m15, t0d
 %else
    movzx     r4, byte [esp+0x1f0]
    sub       srcq, 2
    movd      m15, r4
 %endif
    pxor      m9, m9
    punpckldq m9, m8
    paddd     m14, m9 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova      m9, [base+pd_0x4000]
 %endif
    pshufd    m15, m15, q0000
    pand      m8, m14, m10
    psrld     m8, 6
    paddd     m15, m8
    movd      r4d, m15
    pshufd    m15, m15, q0321
 %if ARCH_X86_64
    movd      r6d, m15
 %else
    movd      r3d, m15
 %endif
    mova      m5, [base+bdct_lb_q]
    mova      m6, [base+spel_s_shuf2]
    movd      m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd      m7, [base+subpel_filters+r6*8+2]
 %else
    movd      m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor      m2, m2
    pcmpeqd   m8, m2
    psrld     m14, 10
    paddd     m14, m14
 %if ARCH_X86_32
    mov       r3, r3m
    pshufb    m14, m5
    paddb     m14, m6
    mova      [stk], m14
    SWAP      m5, m0
    SWAP      m6, m3
  %define m15 m6
 %endif
    movu      m0, [srcq+ssq*0]
    movu      m1, [srcq+ssq*1]
    movu      m2, [srcq+ssq*2]
    movu      m3, [srcq+ss3q ]
    lea       srcq, [srcq+ssq*4]
    punpckldq m15, m7
 %if ARCH_X86_64
    pshufb    m14, m5
    paddb     m14, m6
    pand      m9, m8
    pandn     m8, m15
    SWAP      m15, m8
    por       m15, m9
    movu      m4, [srcq+ssq*0]
    movu      m5, [srcq+ssq*1]
    movu      m6, [srcq+ssq*2]
    movu      m7, [srcq+ss3q ]
    lea       srcq, [srcq+ssq*4]
 %else
    pand      m7, m5, [base+pd_0x4000]
    pandn     m5, m15
    por       m5, m7
  %define m15 m5
 %endif
    punpcklbw m15, m15
    psraw     m15, 8
    REPX      {pshufb x, m14}, m0, m1, m2, m3
    REPX      {pmaddwd x, m15}, m0, m1, m2, m3
 %if ARCH_X86_64
    REPX      {pshufb x, m14}, m4, m5, m6, m7
    REPX      {pmaddwd x, m15}, m4, m5, m6, m7
    phaddd    m0, m1
    phaddd    m2, m3
    phaddd    m4, m5
    phaddd    m6, m7
    REPX      {paddd x, m11}, m0, m2, m4, m6
    REPX      {psrad x, m12}, m0, m2, m4, m6
    packssdw  m0, m2 ; 0 1 2 3
    packssdw  m4, m6 ; 4 5 6 7
    SWAP      m1, m4
 %else
    mova      [stk+0x10], m15
    phaddd    m0, m1
    phaddd    m2, m3
    movu      m1, [srcq+ssq*0]
    movu      m7, [srcq+ssq*1]
    movu      m6, [srcq+ssq*2]
    movu      m3, [srcq+ss3q ]
    lea       srcq, [srcq+ssq*4]
    REPX      {pshufb x, m14}, m1, m7, m6, m3
    REPX      {pmaddwd x, m15}, m1, m7, m6, m3
    phaddd    m1, m7
    phaddd    m6, m3
    REPX      {paddd x, m11}, m0, m2, m1, m6
    REPX      {psrad x, m12}, m0, m2, m1, m6
    packssdw  m0, m2
    packssdw  m1, m6
  %define m14 [stk+0x00]
  %define m15 [stk+0x10]
 %endif
    palignr   m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd m3, m0, m2    ; 01 12
    punpckhwd m0, m2        ; 23 34
    pshufd    m5, m1, q0321 ; 5 6 7 _
    punpcklwd m2, m1, m5    ; 45 56
    punpckhwd m4, m1, m5    ; 67 __
 %if ARCH_X86_32
    mov       myd, mym
    mov       r0, r0m
    mova      [stk+0x20], m3
    mova      [stk+0x30], m0
    mova      [stk+0x40], m2
    mova      [stk+0x50], m4
 %endif
.w2_loop:
    and       myd, 0x3ff
 %if ARCH_X86_64
    mov       r6d, 64 << 24
    mov       r4d, myd
    shr       r4d, 6
    lea       r4d, [t1+r4]
    cmovnz    r6q, [base+subpel_filters+r4*8]
    movq      m10, r6q
    punpcklbw m10, m10
    psraw     m10, 8
    pshufd    m7, m10, q0000
    pshufd    m8, m10, q1111
    pmaddwd   m5, m3, m7
    pmaddwd   m6, m0, m8
    pshufd    m9, m10, q2222
    pshufd    m10, m10, q3333
    pmaddwd   m7, m2, m9
    pmaddwd   m8, m4, m10
    paddd     m5, m6
    paddd     m7, m8
 %else
    mov       r1, [esp+0x1f4]
    xor       r3, r3
    mov       r5, myd
    shr       r5, 6
    lea       r1, [r1+r5]
    mov       r5, 64 << 24
    cmovnz    r3, [base+subpel_filters+r1*8+4]
    cmovnz    r5, [base+subpel_filters+r1*8+0]
    movd      m6, r3
    movd      m7, r5
    punpckldq m7, m6
    punpcklbw m7, m7
    psraw     m7, 8
    pshufd    m5, m7, q0000
    pshufd    m6, m7, q1111
    pmaddwd   m3, m5
    pmaddwd   m0, m6
    pshufd    m5, m7, q2222
    pshufd    m7, m7, q3333
    pmaddwd   m2, m5
    pmaddwd   m4, m7
    paddd     m3, m0
    paddd     m2, m4
    SWAP      m5, m3
    SWAP      m7, m2
  %define m8 m3
 %endif
%endif
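    ; Vertical rounding (put_s_8tap_v_rnd in m13) plus the bitdepth-dependent
    ; shift (the high half of m12 holds put_s_8tap_v_sh), then clamp the
    ; result to [0, pxmax].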
4556 paddd m5, m13 4557 pshufd m6, m12, q1032 4558 pxor m8, m8 4559 paddd m5, m7 4560 psrad m5, m6 4561 packssdw m5, m5 4562 pmaxsw m5, m8 4563 pminsw m5, pxmaxm 4564 movd [dstq], m5 4565 add dstq, dsmp 4566 dec hd 4567 jz .ret 4568 %if ARCH_X86_64 4569 add myd, dyd 4570 %else 4571 add myd, dym 4572 %endif 4573 test myd, ~0x3ff 4574 %if ARCH_X86_32 4575 SWAP m3, m5 4576 SWAP m2, m7 4577 mova m3, [stk+0x20] 4578 mova m0, [stk+0x30] 4579 mova m2, [stk+0x40] 4580 mova m4, [stk+0x50] 4581 %endif 4582 jz .w2_loop 4583 %if ARCH_X86_32 4584 mov r3, r3m 4585 %endif 4586 movu m5, [srcq] 4587 test myd, 0x400 4588 jz .w2_skip_line 4589 add srcq, ssq 4590 shufps m3, m0, q1032 ; 01 12 4591 shufps m0, m2, q1032 ; 23 34 4592 shufps m2, m4, q1032 ; 45 56 4593 pshufb m5, m14 4594 pmaddwd m5, m15 4595 phaddd m5, m5 4596 paddd m5, m11 4597 psrad m5, m12 4598 packssdw m5, m5 4599 palignr m4, m5, m1, 12 4600 punpcklqdq m1, m4, m4 ; 6 7 6 7 4601 punpcklwd m4, m1, m5 ; 67 __ 4602 %if ARCH_X86_32 4603 mova [stk+0x20], m3 4604 mova [stk+0x30], m0 4605 mova [stk+0x40], m2 4606 mova [stk+0x50], m4 4607 %endif 4608 jmp .w2_loop 4609.w2_skip_line: 4610 movu m6, [srcq+ssq*1] 4611 lea srcq, [srcq+ssq*2] 4612 mova m3, m0 ; 01 12 4613 mova m0, m2 ; 23 34 4614 pshufb m5, m14 4615 pshufb m6, m14 4616 pmaddwd m5, m15 4617 pmaddwd m6, m15 4618 phaddd m5, m6 4619 paddd m5, m11 4620 psrad m5, m12 4621 packssdw m5, m5 ; 6 7 6 7 4622 punpckhqdq m1, m5 ; 4 5 6 7 4623 pshufd m5, m1, q0321 ; 5 6 7 _ 4624 punpcklwd m2, m1, m5 ; 45 56 4625 punpckhwd m4, m1, m5 ; 67 __ 4626 %if ARCH_X86_32 4627 mova [stk+0x20], m3 4628 mova [stk+0x30], m0 4629 mova [stk+0x40], m2 4630 mova [stk+0x50], m4 4631 %endif 4632 jmp .w2_loop 4633%endif 4634INIT_XMM ssse3 4635.w4: 4636%if ARCH_X86_64 4637 mov myd, mym 4638 mova [rsp+0x10], m11 4639 mova [rsp+0x20], m12 4640 %if isput 4641 mova [rsp+0x30], m13 4642 %endif 4643 movzx t0d, t0b 4644 sub srcq, 2 4645 movd m15, t0d 4646%else 4647 %define m8 m0 4648 %xdefine m14 m4 4649 %define m15 m3 4650 movzx r4, byte [esp+0x1f0] 4651 sub srcq, 2 4652 movd m15, r4 4653%endif 4654 pmaddwd m8, [base+rescale_mul] 4655%if ARCH_X86_64 4656 mova m9, [base+pd_0x4000] 4657%else 4658 %define m9 [base+pd_0x4000] 4659%endif 4660 pshufd m15, m15, q0000 4661 paddd m14, m8 ; mx+dx*[0-3] 4662 pand m0, m14, m10 4663 psrld m0, 6 4664 paddd m15, m0 4665 pshufd m7, m15, q1032 4666%if ARCH_X86_64 4667 movd r4d, m15 4668 movd r11d, m7 4669 pshufd m15, m15, q0321 4670 pshufd m7, m7, q0321 4671 movd r6d, m15 4672 movd r13d, m7 4673 mova m10, [base+bdct_lb_q+ 0] 4674 mova m11, [base+bdct_lb_q+16] 4675 movd m13, [base+subpel_filters+ r4*8+2] 4676 movd m2, [base+subpel_filters+ r6*8+2] 4677 movd m15, [base+subpel_filters+r11*8+2] 4678 movd m4, [base+subpel_filters+r13*8+2] 4679%else 4680 movd r0, m15 4681 movd r4, m7 4682 pshufd m15, m15, q0321 4683 pshufd m7, m7, q0321 4684 movd rX, m15 4685 movd r5, m7 4686 mova m5, [base+bdct_lb_q+ 0] 4687 mova m6, [base+bdct_lb_q+16] 4688 movd m1, [base+subpel_filters+r0*8+2] 4689 movd m2, [base+subpel_filters+rX*8+2] 4690 movd m3, [base+subpel_filters+r4*8+2] 4691 movd m7, [base+subpel_filters+r5*8+2] 4692 movifprep r3, r3m 4693 SWAP m4, m7 4694 %define m10 m5 4695 %define m11 m6 4696 %define m12 m1 4697 %define m13 m1 4698%endif 4699 psrld m14, 10 4700 paddd m14, m14 4701 punpckldq m13, m2 4702 punpckldq m15, m4 4703 punpcklqdq m13, m15 4704 pxor m2, m2 4705 pcmpeqd m0, m2 4706%if ARCH_X86_64 4707 pand m9, m0 4708%else 4709 pand m2, m9, m0 4710 %define m9 m2 4711 SWAP m7, m4 4712%endif 4713 pandn m0, 
m13 4714%if ARCH_X86_64 4715 SWAP m13, m0 4716%else 4717 %define m13 m0 4718%endif 4719 por m13, m9 4720 punpckhbw m15, m13, m13 4721 punpcklbw m13, m13 4722 psraw m15, 8 4723 psraw m13, 8 4724 pshufb m12, m14, m10 4725 pshufb m14, m11 4726 mova m10, [base+spel_s_shuf2] 4727 movd r4d, m14 4728 shr r4d, 24 4729%if ARCH_X86_32 4730 mova [stk+0x20], m13 4731 mova [stk+0x30], m15 4732 pxor m2, m2 4733%endif 4734 pshufb m7, m14, m2 4735 psubb m14, m7 4736 paddb m12, m10 4737 paddb m14, m10 4738%if ARCH_X86_64 4739 lea r6, [r4+ssq*1] 4740 lea r11, [r4+ssq*2] 4741 lea r13, [r4+ss3q ] 4742 movu m7, [srcq+ssq*0] 4743 movu m9, [srcq+ssq*1] 4744 movu m8, [srcq+ssq*2] 4745 movu m10, [srcq+ss3q ] 4746 movu m1, [srcq+r4 ] 4747 movu m3, [srcq+r6 ] 4748 movu m2, [srcq+r11 ] 4749 movu m4, [srcq+r13 ] 4750 lea srcq, [srcq+ssq*4] 4751 REPX {pshufb x, m12}, m7, m9, m8, m10 4752 REPX {pmaddwd x, m13}, m7, m9, m8, m10 4753 REPX {pshufb x, m14}, m1, m2, m3, m4 4754 REPX {pmaddwd x, m15}, m1, m2, m3, m4 4755 mova m5, [rsp+0x10] 4756 movd xm6, [rsp+0x20] 4757 phaddd m7, m1 4758 phaddd m9, m3 4759 phaddd m8, m2 4760 phaddd m10, m4 4761 movu m1, [srcq+ssq*0] 4762 movu m2, [srcq+ssq*1] 4763 movu m3, [srcq+ssq*2] 4764 movu m4, [srcq+ss3q ] 4765 REPX {paddd x, m5}, m7, m9, m8, m10 4766 REPX {psrad x, xm6}, m7, m9, m8, m10 4767 packssdw m7, m9 ; 0 1 4768 packssdw m8, m10 ; 2 3 4769 movu m0, [srcq+r4 ] 4770 movu m9, [srcq+r6 ] 4771 movu m10, [srcq+r11 ] 4772 movu m11, [srcq+r13 ] 4773 lea srcq, [srcq+ssq*4] 4774 REPX {pshufb x, m12}, m1, m2, m3, m4 4775 REPX {pmaddwd x, m13}, m1, m2, m3, m4 4776 REPX {pshufb x, m14}, m0, m9, m10, m11 4777 REPX {pmaddwd x, m15}, m0, m9, m10, m11 4778 phaddd m1, m0 4779 phaddd m2, m9 4780 phaddd m3, m10 4781 phaddd m4, m11 4782 REPX {paddd x, m5}, m1, m2, m3, m4 4783 REPX {psrad x, xm6}, m1, m2, m3, m4 4784 packssdw m1, m2 ; 4 5 4785 packssdw m3, m4 ; 6 7 4786 SWAP m9, m1 4787 shufps m4, m7, m8, q1032 ; 1 2 4788 shufps m5, m8, m9, q1032 ; 3 4 4789 shufps m6, m9, m3, q1032 ; 5 6 4790 pshufd m10, m3, q1032 ; 7 _ 4791 punpcklwd m0, m7, m4 ; 01 4792 punpckhwd m7, m4 ; 12 4793 punpcklwd m1, m8, m5 ; 23 4794 punpckhwd m8, m5 ; 34 4795 punpcklwd m2, m9, m6 ; 45 4796 punpckhwd m9, m6 ; 56 4797 punpcklwd m3, m10 ; 67 4798 mova [rsp+0x40], m7 4799 mova [rsp+0x50], m8 4800 mova [rsp+0x60], m9 4801%else 4802 mova [stk+0x00], m12 4803 mova [stk+0x10], m14 4804 add r4, srcq 4805 MC_4TAP_SCALED_H 0x40 ; 0 1 4806 MC_4TAP_SCALED_H 0x50 ; 2 3 4807 MC_4TAP_SCALED_H 0x60 ; 4 5 4808 MC_4TAP_SCALED_H 0x70 ; 6 7 4809 mova m4, [stk+0x40] 4810 mova m5, [stk+0x50] 4811 mova m6, [stk+0x60] 4812 mova m7, [stk+0x70] 4813 mov [stk+0xc0], r4 4814 shufps m1, m4, m5, q1032 ; 1 2 4815 shufps m2, m5, m6, q1032 ; 3 4 4816 shufps m3, m6, m7, q1032 ; 5 6 4817 pshufd m0, m7, q1032 ; 7 _ 4818 mova [stk+0xb0], m0 4819 punpcklwd m0, m4, m1 ; 01 4820 punpckhwd m4, m1 ; 12 4821 punpcklwd m1, m5, m2 ; 23 4822 punpckhwd m5, m2 ; 34 4823 punpcklwd m2, m6, m3 ; 45 4824 punpckhwd m6, m3 ; 56 4825 punpcklwd m3, m7, [stk+0xb0] ; 67 4826 mov myd, mym 4827 mov r0, r0m 4828 mova [stk+0x40], m0 ; 01 4829 mova [stk+0x50], m1 ; 23 4830 mova [stk+0x60], m2 ; 45 4831 mova [stk+0x70], m3 ; 67 4832 mova [stk+0x80], m4 ; 12 4833 mova [stk+0x90], m5 ; 34 4834 mova [stk+0xa0], m6 ; 56 4835 %define m12 [stk+0x00] 4836 %define m14 [stk+0x10] 4837 %define m13 [stk+0x20] 4838 %define m15 [stk+0x30] 4839 %define hrnd_mem [esp+0x00] 4840 %define hsh_mem [esp+0x10] 4841 %if isput 4842 %define vrnd_mem [esp+0x20] 4843 %else 4844 %define vrnd_mem 
[base+pd_m524256] 4845 %endif 4846%endif 4847.w4_loop: 4848 and myd, 0x3ff 4849%if ARCH_X86_64 4850 mov r11d, 64 << 24 4851 mov r13d, myd 4852 shr r13d, 6 4853 lea r13d, [t1+r13] 4854 cmovnz r11q, [base+subpel_filters+r13*8] 4855 movq m9, r11q 4856 punpcklbw m9, m9 4857 psraw m9, 8 4858 pshufd m7, m9, q0000 4859 pshufd m8, m9, q1111 4860 pmaddwd m4, m0, m7 4861 pmaddwd m5, m1, m8 4862 pshufd m7, m9, q2222 4863 pshufd m9, m9, q3333 4864 pmaddwd m6, m2, m7 4865 pmaddwd m8, m3, m9 4866 %if isput 4867 movd m9, [rsp+0x28] 4868 %define vrnd_mem [rsp+0x30] 4869 %else 4870 %define vrnd_mem [base+pd_m524256] 4871 %endif 4872 paddd m4, m5 4873 paddd m6, m8 4874 paddd m4, m6 4875 paddd m4, vrnd_mem 4876%else 4877 mov mym, myd 4878 mov r5, [esp+0x1f4] 4879 xor r3, r3 4880 shr r4, 6 4881 lea r5, [r5+r4] 4882 mov r4, 64 << 24 4883 cmovnz r4, [base+subpel_filters+r5*8+0] 4884 cmovnz r3, [base+subpel_filters+r5*8+4] 4885 movd m7, r4 4886 movd m6, r3 4887 punpckldq m7, m6 4888 punpcklbw m7, m7 4889 psraw m7, 8 4890 pshufd m4, m7, q0000 4891 pshufd m5, m7, q1111 4892 pshufd m6, m7, q2222 4893 pshufd m7, m7, q3333 4894 pmaddwd m0, m4 4895 pmaddwd m1, m5 4896 pmaddwd m2, m6 4897 pmaddwd m3, m7 4898 %if isput 4899 movd m4, [esp+0x18] 4900 %endif 4901 paddd m0, m1 4902 paddd m2, m3 4903 paddd m0, vrnd_mem 4904 paddd m0, m2 4905 SWAP m4, m0 4906 %define m9 m0 4907%endif 4908%if isput 4909 pxor m5, m5 4910 psrad m4, m9 4911 packssdw m4, m4 4912 pmaxsw m4, m5 4913 pminsw m4, pxmaxm 4914 movq [dstq], m4 4915 add dstq, dsmp 4916%else 4917 psrad m4, 6 4918 packssdw m4, m4 4919 movq [tmpq], m4 4920 add tmpq, 8 4921%endif 4922 dec hd 4923 jz .ret 4924%if ARCH_X86_64 4925 add myd, dyd 4926 test myd, ~0x3ff 4927 jz .w4_loop 4928 mova m8, [rsp+0x10] 4929 movd m9, [rsp+0x20] 4930 movu m4, [srcq] 4931 movu m5, [srcq+r4] 4932 test myd, 0x400 4933 jz .w4_skip_line 4934 mova m0, [rsp+0x40] 4935 mova [rsp+0x40], m1 4936 mova m1, [rsp+0x50] 4937 mova [rsp+0x50], m2 4938 mova m2, [rsp+0x60] 4939 mova [rsp+0x60], m3 4940 pshufb m4, m12 4941 pshufb m5, m14 4942 pmaddwd m4, m13 4943 pmaddwd m5, m15 4944 phaddd m4, m5 4945 paddd m4, m8 4946 psrad m4, m9 4947 packssdw m4, m4 4948 punpcklwd m3, m10, m4 4949 mova m10, m4 4950 add srcq, ssq 4951 jmp .w4_loop 4952.w4_skip_line: 4953 movu m6, [srcq+ssq*1] 4954 movu m7, [srcq+r6] 4955 mova m0, [rsp+0x50] 4956 mova m11, [rsp+0x60] 4957 pshufb m4, m12 4958 pshufb m6, m12 4959 pshufb m5, m14 4960 pshufb m7, m14 4961 pmaddwd m4, m13 4962 pmaddwd m6, m13 4963 pmaddwd m5, m15 4964 pmaddwd m7, m15 4965 mova [rsp+0x40], m0 4966 mova [rsp+0x50], m11 4967 phaddd m4, m5 4968 phaddd m6, m7 4969 paddd m4, m8 4970 paddd m6, m8 4971 psrad m4, m9 4972 psrad m6, m9 4973 packssdw m4, m6 4974 punpcklwd m9, m10, m4 4975 mova [rsp+0x60], m9 4976 pshufd m10, m4, q1032 4977 mova m0, m1 4978 mova m1, m2 4979 mova m2, m3 4980 punpcklwd m3, m4, m10 4981 lea srcq, [srcq+ssq*2] 4982 jmp .w4_loop 4983%else 4984 SWAP m0, m4 4985 mov myd, mym 4986 mov r3, r3m 4987 add myd, dym 4988 test myd, ~0x3ff 4989 jnz .w4_next_line 4990 mova m0, [stk+0x40] 4991 mova m1, [stk+0x50] 4992 mova m2, [stk+0x60] 4993 mova m3, [stk+0x70] 4994 jmp .w4_loop 4995.w4_next_line: 4996 mov r5, [stk+0xc0] 4997 movu m4, [srcq] 4998 movu m5, [r5] 4999 test myd, 0x400 5000 jz .w4_skip_line 5001 add [stk+0xc0], ssq 5002 mova m0, [stk+0x80] 5003 mova m3, [stk+0x50] 5004 mova [stk+0x40], m0 5005 mova [stk+0x80], m3 5006 mova m1, [stk+0x90] 5007 mova m6, [stk+0x60] 5008 mova [stk+0x50], m1 5009 mova [stk+0x90], m6 5010 mova m2, [stk+0xa0] 5011 mova m7, 
[stk+0x70] 5012 mova [stk+0x60], m2 5013 mova [stk+0xa0], m7 5014 pshufb m4, m12 5015 pshufb m5, m14 5016 pmaddwd m4, m13 5017 pmaddwd m5, m15 5018 phaddd m4, m5 5019 paddd m4, hrnd_mem 5020 psrad m4, hsh_mem 5021 packssdw m4, m4 5022 punpcklwd m3, [stk+0xb0], m4 5023 mova [stk+0xb0], m4 5024 mova [stk+0x70], m3 5025 add srcq, ssq 5026 jmp .w4_loop 5027.w4_skip_line: 5028 movu m6, [srcq+ssq*1] 5029 movu m7, [r5 +ssq*1] 5030 lea r5, [r5 +ssq*2] 5031 mov [stk+0xc0], r5 5032 mova m0, [stk+0x50] 5033 mova m1, [stk+0x60] 5034 mova m2, [stk+0x70] 5035 mova m3, [stk+0x90] 5036 pshufb m4, m12 5037 pshufb m6, m12 5038 pshufb m5, m14 5039 pshufb m7, m14 5040 pmaddwd m4, m13 5041 pmaddwd m6, m13 5042 pmaddwd m5, m15 5043 pmaddwd m7, m15 5044 mova [stk+0x40], m0 5045 mova [stk+0x50], m1 5046 mova [stk+0x60], m2 5047 mova [stk+0x80], m3 5048 phaddd m4, m5 5049 phaddd m6, m7 5050 mova m5, [stk+0xa0] 5051 mova m7, [stk+0xb0] 5052 paddd m4, hrnd_mem 5053 paddd m6, hrnd_mem 5054 psrad m4, hsh_mem 5055 psrad m6, hsh_mem 5056 packssdw m4, m6 5057 punpcklwd m7, m4 5058 pshufd m6, m4, q1032 5059 mova [stk+0x90], m5 5060 mova [stk+0xa0], m7 5061 mova [stk+0xb0], m6 5062 punpcklwd m3, m4, m6 5063 mova [stk+0x70], m3 5064 lea srcq, [srcq+ssq*2] 5065 jmp .w4_loop 5066%endif 5067INIT_XMM ssse3 5068%if ARCH_X86_64 5069 %define stk rsp+0x20 5070%endif 5071.w8: 5072 mov dword [stk+0xf0], 1 5073 movifprep tmp_stridem, 16 5074 jmp .w_start 5075.w16: 5076 mov dword [stk+0xf0], 2 5077 movifprep tmp_stridem, 32 5078 jmp .w_start 5079.w32: 5080 mov dword [stk+0xf0], 4 5081 movifprep tmp_stridem, 64 5082 jmp .w_start 5083.w64: 5084 mov dword [stk+0xf0], 8 5085 movifprep tmp_stridem, 128 5086 jmp .w_start 5087.w128: 5088 mov dword [stk+0xf0], 16 5089 movifprep tmp_stridem, 256 5090.w_start: 5091%if ARCH_X86_64 5092 %ifidn %1, put 5093 movifnidn dsm, dsq 5094 %endif 5095 mova [rsp+0x10], m11 5096 %define hround m11 5097 shr t0d, 16 5098 movd m15, t0d 5099 %if isprep 5100 mova m13, [base+pd_m524256] 5101 %endif 5102%else 5103 %define hround [esp+0x00] 5104 %define m12 [esp+0x10] 5105 %define m10 [base+pd_0x3ff] 5106 %define m8 m0 5107 %xdefine m14 m4 5108 %define m15 m3 5109 %if isprep 5110 %define ssq ssm 5111 %endif 5112 mov r4, [esp+0x1f0] 5113 shr r4, 16 5114 movd m15, r4 5115 mov r0, r0m 5116 mov myd, mym 5117%endif 5118 sub srcq, 6 5119 pslld m7, m8, 2 ; dx*4 5120 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] 5121 pshufd m15, m15, q0000 5122 paddd m14, m8 ; mx+dx*[0-3] 5123 mova [stk+0x100], m7 5124 mova [stk+0x120], m15 5125 mov [stk+0x0f8], srcq 5126 mov [stk+0x130], r0q ; dstq / tmpq 5127%if ARCH_X86_64 && UNIX64 5128 mov hm, hd 5129%elif ARCH_X86_32 5130 mov r5, hm 5131 mov [stk+0x0f4], myd 5132 mov [stk+0x134], r5 5133%endif 5134 jmp .hloop 5135.hloop_prep: 5136 dec dword [stk+0x0f0] 5137 jz .ret 5138%if ARCH_X86_64 5139 add qword [stk+0x130], 16 5140 mov hd, hm 5141%else 5142 add dword [stk+0x130], 16 5143 mov myd, [stk+0x0f4] 5144 mov r5, [stk+0x134] 5145 mov r0, [stk+0x130] 5146%endif 5147 mova m7, [stk+0x100] 5148 mova m14, [stk+0x110] 5149%if ARCH_X86_64 5150 mova m10, [base+pd_0x3ff] 5151 mova m11, [rsp+0x10] 5152%endif 5153 mova m15, [stk+0x120] 5154 mov srcq, [stk+0x0f8] 5155%if ARCH_X86_64 5156 mov r0q, [stk+0x130] ; dstq / tmpq 5157%else 5158 mov mym, myd 5159 mov hm, r5 5160 mov r0m, r0 5161 mov r3, r3m 5162%endif 5163 paddd m14, m7 5164.hloop: 5165%if ARCH_X86_64 5166 mova m9, [base+pq_0x40000000] 5167%else 5168 %define m9 [base+pq_0x40000000] 5169%endif 5170 pxor m1, m1 5171 psrld m2, m14, 10 5172 mova 
    mova [stk], m2
    pand m6, m14, m10
    psrld m6, 6
    paddd m5, m15, m6
    pcmpeqd m6, m1
    pshufd m2, m5, q1032
%if ARCH_X86_64
    movd r4d, m5
    movd r6d, m2
    pshufd m5, m5, q0321
    pshufd m2, m2, q0321
    movd r7d, m5
    movd r9d, m2
    movq m0, [base+subpel_filters+r4*8]
    movq m1, [base+subpel_filters+r6*8]
    movhps m0, [base+subpel_filters+r7*8]
    movhps m1, [base+subpel_filters+r9*8]
%else
    movd r0, m5
    movd rX, m2
    pshufd m5, m5, q0321
    pshufd m2, m2, q0321
    movd r4, m5
    movd r5, m2
    movq m0, [base+subpel_filters+r0*8]
    movq m1, [base+subpel_filters+rX*8]
    movhps m0, [base+subpel_filters+r4*8]
    movhps m1, [base+subpel_filters+r5*8]
%endif
    paddd m14, m7 ; mx+dx*[4-7]
    pand m5, m14, m10
    psrld m5, 6
    paddd m15, m5
    pxor m2, m2
    pcmpeqd m5, m2
    mova [stk+0x110], m14
    pshufd m4, m15, q1032
%if ARCH_X86_64
    movd r10d, m15
    movd r11d, m4
    pshufd m15, m15, q0321
    pshufd m4, m4, q0321
    movd r13d, m15
    movd rXd, m4
    movq m2, [base+subpel_filters+r10*8]
    movq m3, [base+subpel_filters+r11*8]
    movhps m2, [base+subpel_filters+r13*8]
    movhps m3, [base+subpel_filters+ rX*8]
    psrld m14, 10
    movq r11, m14
    punpckhqdq m14, m14
    movq rX, m14
    mov r10d, r11d
    shr r11, 32
    mov r13d, rXd
    shr rX, 32
    mov r4d, [stk+ 0]
    mov r6d, [stk+ 4]
    mov r7d, [stk+ 8]
    mov r9d, [stk+12]
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m14, m5, q1100
    pshufd m5, m5, q3322
    pand m7, m9, m4
    pand m8, m9, m6
    pand m15, m9, m14
    pand m9, m9, m5
    pandn m4, m0
    pandn m6, m1
    pandn m14, m2
    pandn m5, m3
    por m7, m4
    por m8, m6
    por m15, m14
    por m9, m5
    punpcklbw m0, m7, m7
    punpckhbw m7, m7
    punpcklbw m1, m8, m8
    punpckhbw m8, m8
    psraw m0, 8
    psraw m7, 8
    psraw m1, 8
    psraw m8, 8
    punpcklbw m2, m15, m15
    punpckhbw m15, m15
    punpcklbw m3, m9, m9
    punpckhbw m9, m9
    psraw m2, 8
    psraw m15, 8
    psraw m3, 8
    psraw m9, 8
    mova [stk+0x10], m0
    mova [stk+0x20], m7
    mova [stk+0x30], m1
    mova [stk+0x40], m8
    mova [stk+0x50], m2
    mova [stk+0x60], m15
    mova [stk+0x70], m3
    mova [stk+0x80], m9
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova m5, [stk+0xd0]
    mova m1, [stk+0x90]
    mova m2, [stk+0xa0]
    mova m3, [stk+0xb0]
    mova m9, [stk+0xc0]
    mov myd, mym
    mov dyd, dym
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m8 ; 67a
    punpckhwd m7, m8 ; 67b
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m9 ; 23a
    punpckhwd m3, m9 ; 23b
    mova [stk+0x90], m4
    mova [stk+0xa0], m5
    mova [stk+0xb0], m6
    mova [stk+0xc0], m7
%define hround [rsp+0x10]
.vloop:
    and myd, 0x3ff
    mov r6d, 64 << 24
    mov r4d, myd
    shr r4d, 6
    lea r4d, [t1+r4]
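    ; r6d holds the unit filter (64 << 24); cmovnz swaps in the real
    ; 8-tap set when the shifted phase is nonzero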
    cmovnz r6q, [base+subpel_filters+r4*8]
    movq m11, r6q
    punpcklbw m11, m11
    psraw m11, 8
    pshufd m5, m11, q0000
    pshufd m7, m11, q1111
    pshufd m10, m11, q2222
    pshufd m11, m11, q3333
    pmaddwd m4, m5, m0
    pmaddwd m5, m5, m1
    pmaddwd m6, m7, m2
    pmaddwd m7, m7, m3
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
    pmaddwd m6, [stk+0x90], m10
    pmaddwd m7, [stk+0xa0], m10
    pmaddwd m8, [stk+0xb0], m11
    pmaddwd m9, [stk+0xc0], m11
    paddd m4, m6
    paddd m5, m7
%if isput
    pshufd m6, m12, q1032
%endif
    paddd m4, m8
    paddd m5, m9
%else
    movd r0, m15
    movd rX, m4
    pshufd m15, m15, q0321
    pshufd m4, m4, q0321
    movd r4, m15
    movd r5, m4
    mova m14, [stk+0x110]
    movq m2, [base+subpel_filters+r0*8]
    movq m3, [base+subpel_filters+rX*8]
    movhps m2, [base+subpel_filters+r4*8]
    movhps m3, [base+subpel_filters+r5*8]
    psrld m14, 10
    mova [stk+16], m14
    mov r0, [stk+ 0]
    mov rX, [stk+ 4]
    mov r4, [stk+ 8]
    mov r5, [stk+12]
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m0, m9, m4
    pand m1, m9, m6
    pand m2, m9, m7
    pand m3, m9, m5
    pandn m4, [stk+0x20]
    pandn m6, [stk+0x30]
    pandn m7, [stk+0x40]
    pandn m5, [stk+0x50]
    por m0, m4
    por m1, m6
    por m2, m7
    por m3, m5
    punpcklbw m4, m0, m0
    punpckhbw m0, m0
    punpcklbw m5, m1, m1
    punpckhbw m1, m1
    psraw m4, 8
    psraw m0, 8
    psraw m5, 8
    psraw m1, 8
    punpcklbw m6, m2, m2
    punpckhbw m2, m2
    punpcklbw m7, m3, m3
    punpckhbw m3, m3
    psraw m6, 8
    psraw m2, 8
    psraw m7, 8
    psraw m3, 8
    mova [stk+0x0a0], m4
    mova [stk+0x0b0], m0
    mova [stk+0x0c0], m5
    mova [stk+0x0d0], m1
    mova [stk+0x140], m6
    mova [stk+0x150], m2
    mova [stk+0x160], m7
    mova [stk+0x170], m3
    MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
    MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
    MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
    MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
    MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
    MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
    MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
    mova m5, [stk+0x60]
    mova m6, [stk+0x70]
    mova m7, [stk+0x80]
    mova m0, [stk+0x90]
    mov myd, mym
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0 ; 67b
    mova [stk+0x60], m4
    mova [stk+0x70], m5
    mova [stk+0x80], m6
    mova [stk+0x90], m7
    mova m1, [stk+0x20]
    mova m2, [stk+0x30]
    mova m3, [stk+0x40]
    mova m4, [stk+0x50]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m4 ; 23a
    punpckhwd m3, m4 ; 23b
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
.vloop:
    mov r0, r0m
    mov r5, [esp+0x1f4]
    and myd, 0x3ff
    mov mym, myd
    xor r3, r3
    shr r4, 6
    lea r5, [r5+r4]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r5*8+0]
    cmovnz r3, [base+subpel_filters+r5*8+4]
    movd m7, r4
    movd m6, r3
    punpckldq m7, m6
    punpcklbw m7, m7
    psraw m7, 8
    pshufd m4, m7, q0000
    pshufd m5, m7, q1111
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m5
    pmaddwd m3, m5
    pshufd m6, m7, q2222
    pshufd m7, m7, q3333
    paddd m0, m2
    paddd m1, m3
    pmaddwd m2, [stk+0x60], m6
    pmaddwd m3, [stk+0x70], m6
    pmaddwd m4, [stk+0x80], m7
    pmaddwd m5, [stk+0x90], m7
%if isput
    movd m6, [esp+0x18]
%endif
    paddd m0, m2
    paddd m1, m3
    paddd m0, vrnd_mem
    paddd m1, vrnd_mem
    paddd m4, m0
    paddd m5, m1
%endif
%ifidn %1, put
    psrad m4, m6
    psrad m5, m6
    packssdw m4, m5
    pxor m7, m7
    pmaxsw m4, m7
    pminsw m4, pxmaxm
    mova [dstq], m4
    add dstq, dsm
%else
    psrad m4, 6
    psrad m5, 6
    packssdw m4, m5
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
    dec hd
    jz .hloop_prep
%if ARCH_X86_64
    add myd, dyd
    test myd, ~0x3ff
    jz .vloop
    test myd, 0x400
    mov [stk+0x140], myd
    mov r4d, [stk+ 0]
    mov r6d, [stk+ 4]
    mov r7d, [stk+ 8]
    mov r9d, [stk+12]
    jz .skip_line
    mova m14, [base+unpckw]
    movu m8, [srcq+r10*2]
    movu m9, [srcq+r11*2]
    movu m10, [srcq+r13*2]
    movu m11, [srcq+ rX*2]
    movu m4, [srcq+ r4*2]
    movu m5, [srcq+ r6*2]
    movu m6, [srcq+ r7*2]
    movu m7, [srcq+ r9*2]
    add srcq, ssq
    mov myd, [stk+0x140]
    mov dyd, dym
    pshufd m15, m14, q1032
    pshufb m0, m14 ; 0a 1a
    pshufb m1, m14 ; 0b 1b
    pshufb m2, m15 ; 3a 2a
    pshufb m3, m15 ; 3b 2b
    pmaddwd m8, [stk+0x50]
    pmaddwd m9, [stk+0x60]
    pmaddwd m10, [stk+0x70]
    pmaddwd m11, [stk+0x80]
    pmaddwd m4, [stk+0x10]
    pmaddwd m5, [stk+0x20]
    pmaddwd m6, [stk+0x30]
    pmaddwd m7, [stk+0x40]
    phaddd m8, m9
    phaddd m10, m11
    mova m11, hround
    phaddd m4, m5
    phaddd m6, m7
    phaddd m8, m10
    phaddd m4, m6
    paddd m4, m11
    paddd m8, m11
    psrad m4, m12
    psrad m8, m12
    packssdw m4, m8
    pshufb m5, [stk+0x90], m14 ; 4a 5a
    pshufb m6, [stk+0xa0], m14 ; 4b 5b
    pshufb m7, [stk+0xb0], m15 ; 7a 6a
    pshufb m8, [stk+0xc0], m15 ; 7b 6b
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    punpcklwd m2, m5 ; 34a
    punpcklwd m3, m6 ; 34b
    punpckhwd m5, m7 ; 56a
    punpckhwd m6, m8 ; 56b
    punpcklwd m7, m4 ; 78a
    punpckhqdq m4, m4
    punpcklwd m8, m4 ; 78b
    mova [stk+0x90], m5
    mova [stk+0xa0], m6
    mova [stk+0xb0], m7
    mova [stk+0xc0], m8
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
    mov myd, [stk+0x140]
    mov dyd, dym
    mova m0, m2 ; 01a
    mova m1, m3 ; 01b
    mova m2, [stk+0x90] ; 23a
    mova m3, [stk+0xa0] ; 23b
    mova m5, [stk+0xb0] ; 45a
    mova m6, [stk+0xc0] ; 45b
    punpcklwd m7, m4, m8 ; 67a
    punpckhwd m4, m8 ; 67b
    mova [stk+0x90], m5
    mova [stk+0xa0], m6
    mova [stk+0xb0], m7
    mova [stk+0xc0], m4
%else
    mov r0m, r0
    mov myd, mym
    mov r3, r3m
    add myd, dym
    test myd, ~0x3ff
    mov mym, myd
    jnz .next_line
    mova m0, [stk+0x20]
    mova m1, [stk+0x30]
    mova m2, [stk+0x40]
    mova m3, [stk+0x50]
    jmp .vloop
.next_line:
    test myd, 0x400
    mov r0, [stk+ 0]
    mov rX, [stk+ 4]
    mov r4, [stk+ 8]
    mov r5, [stk+12]
    jz .skip_line
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    mova m7, [base+unpckw]
    pshufd m4, m7, q1032
    pshufb m0, [stk+0x20], m7 ; 0a 1a
    pshufb m1, [stk+0x30], m7 ; 0b 1b
    pshufb m2, [stk+0x40], m4 ; 3a 2a
    pshufb m3, [stk+0x50], m4 ; 3b 2b
    pshufb m5, [stk+0x60], m7 ; 4a 5a
    pshufb m6, [stk+0x70], m7 ; 4b 5b
    pshufb m7, [stk+0x80], m4 ; 7a 6a
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    punpcklwd m2, m5 ; 34a
    punpcklwd m3, m6 ; 34b
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    punpckhwd m5, m7 ; 56a
    mova [stk+0x60], m5
    pshufb m5, [stk+0x90], m4 ; 7b 6b
    punpcklwd m7, [stk+0xe0] ; 78a
    punpckhwd m6, m5 ; 56b
    mova [stk+0x70], m6
    movq m6, [stk+0xe8]
    mova [stk+0x80], m7
    punpcklwd m5, m6
    mov myd, mym
    mova [stk+0x90], m5
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    MC_8TAP_SCALED_H 0xa0, 0 ; 9
    mova m7, [stk+0xe0]
    mova m2, [stk+0x60] ; 23a
    mova m3, [stk+0x70] ; 23b
    mova m4, [stk+0x80] ; 45a
    mova m5, [stk+0x90] ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0 ; 67b
    mova m0, [stk+0x40] ; 01a
    mova m1, [stk+0x50] ; 01b
    mov myd, mym
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    mova [stk+0x60], m4
    mova [stk+0x70], m5
    mova [stk+0x80], m6
    mova [stk+0x90], m7
    mova [stk+0x20], m0
    mova [stk+0x30], m1
%endif
    jmp .vloop
INIT_XMM ssse3
.dy1:
    movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add wq, base_reg
    jmp wq
%if isput
.dy1_w2:
%if ARCH_X86_64
    mov myd, mym
    movzx t0d, t0b
    sub srcq, 2
    movd m15, t0d
%else
%define m8 m0
%define m9 m1
%define m14 m4
%define m15 m3
%define m11 [esp+0x00]
%define m12 [esp+0x10]
%define m13 [esp+0x20]
    movzx r5, byte [esp+0x1f0]
    sub srcq, 2
    movd m15, r5
    mov r1, r1m
%endif
    pxor m9, m9
    punpckldq m9, m8
    paddd m14, m9 ; mx+dx*[0-1]
%if ARCH_X86_64
    mova m9, [base+pd_0x4000]
%endif
    pshufd m15, m15, q0000
    pand m8, m14, m10
    psrld m8, 6
    paddd m15, m8
    movd r4d, m15
    pshufd m15, m15, q0321
%if ARCH_X86_64
    movd r6d, m15
%else
    movd r3d, m15
%endif
    mova m5, [base+bdct_lb_q]
    mova m6, [base+spel_s_shuf2]
    movd m15, [base+subpel_filters+r4*8+2]
%if ARCH_X86_64
    movd m7, [base+subpel_filters+r6*8+2]
%else
    movd m7, [base+subpel_filters+r3*8+2]
%endif
    pxor m2, m2
    pcmpeqd m8, m2
    psrld m14, 10
    paddd m14, m14
%if ARCH_X86_32
    mov r3, r3m
    pshufb m14, m5
    paddb m14, m6
    mova [stk], m14
    SWAP m5, m0
    SWAP m6, m3
%define m15 m6
%endif
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    movu m2, [srcq+ssq*2]
    movu m3, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    punpckldq m15, m7
%if ARCH_X86_64
    pshufb m14, m5
    paddb m14, m6
    pand m9, m8
    pandn m8, m15
    SWAP m15, m8
    por m15, m9
    movu m4, [srcq+ssq*0]
    movu m5, [srcq+ssq*1]
    movu m6, [srcq+ssq*2]
    add srcq, ss3q
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
%else
    pand m7, m5, [base+pd_0x4000]
    pandn m5, m15
    por m5, m7
%define m15 m5
    mov myd, mym
    mov r5, [esp+0x1f4]
    xor r3, r3
    shr myd, 6
    lea r5, [r5+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r5*8+0]
    cmovnz r3, [base+subpel_filters+r5*8+4]
    mov [stk+0x20], r3
    mov r3, r3m
%endif
    punpcklbw m15, m15
    psraw m15, 8
    REPX {pshufb x, m14}, m0, m1, m2, m3
    REPX {pmaddwd x, m15}, m0, m1, m2, m3
%if ARCH_X86_64
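    ; 64-bit: finish the horizontal pass for rows 4-6; the loop below then
    ; feeds in two new rows per iteration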
    REPX {pshufb x, m14}, m4, m5, m6
    REPX {pmaddwd x, m15}, m4, m5, m6
    phaddd m0, m1
    phaddd m2, m3
    phaddd m4, m5
    phaddd m6, m6
    REPX {paddd x, m11}, m0, m2, m4, m6
    REPX {psrad x, m12}, m0, m2, m4, m6
    packssdw m0, m2 ; 0 1 2 3
    packssdw m4, m6 ; 4 5 6
    SWAP m1, m4
    movq m10, r4
%else
    mova [stk+0x10], m15
    phaddd m0, m1
    phaddd m2, m3
    movu m1, [srcq+ssq*0]
    movu m7, [srcq+ssq*1]
    movu m6, [srcq+ssq*2]
    add srcq, ss3q
    REPX {pshufb x, m14}, m1, m7, m6
    REPX {pmaddwd x, m15}, m1, m7, m6
%define m14 [stk+0x00]
%define m15 [stk+0x10]
    phaddd m1, m7
    phaddd m6, m6
    REPX {paddd x, m11}, m0, m2, m1, m6
    REPX {psrad x, m12}, m0, m2, m1, m6
    packssdw m0, m2
    packssdw m1, m6
%define m8 m6
%define m9 m4
%define m10 m5
    movd m10, r4
    movd m9, [stk+0x20]
    punpckldq m10, m9
%endif
    punpcklbw m10, m10
    psraw m10, 8
    pshufd m7, m10, q0000
    pshufd m8, m10, q1111
    pshufd m9, m10, q2222
    pshufd m10, m10, q3333
%if ARCH_X86_32
    mova [stk+0x50], m7
    mova [stk+0x60], m8
    mova [stk+0x70], m9
    mova [stk+0x80], m10
%define m7 [stk+0x50]
%define m8 [stk+0x60]
%define m9 [stk+0x70]
%define m10 [stk+0x80]
%endif
    palignr m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd m3, m0, m2 ; 01 12
    punpckhwd m0, m2 ; 23 34
    pshufd m4, m1, q2121 ; 5 6 5 6
    punpcklwd m2, m1, m4 ; 45 56
%if ARCH_X86_32
    mov r0, r0m
%endif
.dy1_w2_loop:
    movu m1, [srcq+ssq*0]
    movu m6, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m5, m3, m7
    mova m3, m0
    pmaddwd m0, m8
    pshufb m1, m14
    pshufb m6, m14
    pmaddwd m1, m15
    pmaddwd m6, m15
    phaddd m1, m6
    paddd m1, m11
    psrad m1, m12
    packssdw m1, m1
    paddd m5, m0
    mova m0, m2
    pmaddwd m2, m9
    paddd m5, m2
    palignr m2, m1, m4, 12
    punpcklwd m2, m1 ; 67 78
    pmaddwd m4, m2, m10
    paddd m5, m13
    paddd m5, m4
    pxor m6, m6
    mova m4, m1
    pshufd m1, m12, q1032
    psrad m5, m1
    packssdw m5, m5
    pmaxsw m5, m6
    pminsw m5, pxmaxm
    movd [dstq+dsq*0], m5
    pshuflw m5, m5, q1032
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy1_w4:
%if ARCH_X86_64
    mov myd, mym
    mova [rsp+0x10], m11
    mova [rsp+0x20], m12
%if isput
    mova [rsp+0x30], m13
%define vrnd_mem [rsp+0x30]
%define stk rsp+0x40
%else
%define vrnd_mem [base+pd_m524256]
%define stk rsp+0x30
%endif
    movzx t0d, t0b
    sub srcq, 2
    movd m15, t0d
%else
%define m10 [base+pd_0x3ff]
%define m9 [base+pd_0x4000]
%define m8 m0
%xdefine m14 m4
%define m15 m3
%if isprep
%define ssq r3
%endif
    movzx r5, byte [esp+0x1f0]
    sub srcq, 2
    movd m15, r5
%endif
    pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
    mova m9, [base+pd_0x4000]
%endif
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
    pand m0, m14, m10
    psrld m0, 6
    paddd m15, m0
    pshufd m7, m15, q1032
%if ARCH_X86_64
    movd r4d, m15
    movd r11d, m7
    pshufd m15, m15, q0321
    pshufd m7, m7, q0321
    movd r6d, m15
    movd r13d, m7
    mova m10, [base+bdct_lb_q+ 0]
    mova m11, [base+bdct_lb_q+16]
    movd m13, [base+subpel_filters+ r4*8+2]
    movd m2, [base+subpel_filters+ r6*8+2]
    movd m15, [base+subpel_filters+r11*8+2]
    movd m4, [base+subpel_filters+r13*8+2]
%else
    movd r0, m15
    movd r4, m7
    pshufd m15, m15, q0321
    pshufd m7, m7, q0321
    movd rX, m15
    movd r5, m7
    mova m5, [base+bdct_lb_q+ 0]
    mova m6, [base+bdct_lb_q+16]
    movd m1, [base+subpel_filters+r0*8+2]
    movd m2, [base+subpel_filters+rX*8+2]
    movd m3, [base+subpel_filters+r4*8+2]
    movd m7, [base+subpel_filters+r5*8+2]
    SWAP m4, m7
%if isprep
    mov r3, r3m
%endif
%define m10 m5
%define m11 m6
%define m12 m1
%define m13 m1
%endif
    psrld m14, 10
    paddd m14, m14
    punpckldq m13, m2
    punpckldq m15, m4
    punpcklqdq m13, m15
    pxor m2, m2
    pcmpeqd m0, m2
%if ARCH_X86_64
    pand m9, m0
%else
    pand m2, m9, m0
%define m9 m2
    SWAP m7, m4
%endif
    pandn m0, m13
%if ARCH_X86_64
    SWAP m13, m0
%else
%define m13 m0
%endif
    por m13, m9
    punpckhbw m15, m13, m13
    punpcklbw m13, m13
    psraw m15, 8
    psraw m13, 8
    pshufb m12, m14, m10
    pshufb m14, m11
    mova m10, [base+spel_s_shuf2]
    movd r4d, m14
    shr r4d, 24
%if ARCH_X86_32
    mova [stk+0x40], m13
    mova [stk+0x50], m15
    pxor m2, m2
%endif
    pshufb m7, m14, m2
    psubb m14, m7
    paddb m12, m10
    paddb m14, m10
%if ARCH_X86_64
    lea r6, [r4+ssq*1]
    lea r11, [r4+ssq*2]
    lea r13, [r4+ss3q ]
    movu m7, [srcq+ssq*0]
    movu m9, [srcq+ssq*1]
    movu m8, [srcq+ssq*2]
    movu m10, [srcq+ss3q ]
    movu m1, [srcq+r4 ]
    movu m3, [srcq+r6 ]
    movu m2, [srcq+r11 ]
    movu m4, [srcq+r13 ]
    lea srcq, [srcq+ssq*4]
    REPX {pshufb x, m12}, m7, m9, m8, m10
    REPX {pmaddwd x, m13}, m7, m9, m8, m10
    REPX {pshufb x, m14}, m1, m3, m2, m4
    REPX {pmaddwd x, m15}, m1, m3, m2, m4
    mova m5, [rsp+0x10]
    movd xm6, [rsp+0x20]
    phaddd m7, m1
    phaddd m9, m3
    phaddd m8, m2
    phaddd m10, m4
    movu m1, [srcq+ssq*0]
    movu m2, [srcq+ssq*1]
    movu m3, [srcq+ssq*2]
    REPX {paddd x, m5}, m7, m9, m8, m10
    REPX {psrad x, xm6}, m7, m9, m8, m10
    packssdw m7, m9 ; 0 1
    packssdw m8, m10 ; 2 3
    movu m0, [srcq+r4 ]
    movu m9, [srcq+r6 ]
    movu m10, [srcq+r11 ]
    add srcq, ss3q
    REPX {pshufb x, m12}, m1, m2, m3
    REPX {pmaddwd x, m13}, m1, m2, m3
    REPX {pshufb x, m14}, m0, m9, m10
    REPX {pmaddwd x, m15}, m0, m9, m10
    phaddd m1, m0
    phaddd m2, m9
    phaddd m3, m10
    shr myd, 6
    mov r13d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r13q, [base+subpel_filters+myq*8]
    REPX {paddd x, m5}, m1, m2, m3
    REPX {psrad x, xm6}, m1, m2, m3
    packssdw m1, m2 ; 4 5
    packssdw m3, m3 ; 6 6
    SWAP m9, m1
    shufps m4, m7, m8, q1032 ; 1 2
    shufps m5, m8, m9, q1032 ; 3 4
    shufps m6, m9, m3, q1032 ; 5 6
    punpcklwd m0, m7, m4 ; 01
    punpckhwd m7, m4 ; 12
    punpcklwd m1, m8, m5 ; 23
    punpckhwd m8, m5 ; 34
    punpcklwd m2, m9, m6 ; 45
    punpckhwd m9, m6 ; 56
    movq m10, r13
    mova [stk+0x00], m1
    mova [stk+0x10], m8
    mova [stk+0x20], m2
    mova [stk+0x30], m9
    mova [stk+0x40], m3
%define hrnd_mem [rsp+0x10]
%define hsh_mem [rsp+0x20]
%define vsh_mem [rsp+0x28]
%if isput
%define vrnd_mem [rsp+0x30]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%else
    mova [stk+0x20], m12
    mova [stk+0x30], m14
    add r4, srcq
    MC_4TAP_SCALED_H 0x60 ; 0 1
    MC_4TAP_SCALED_H 0x70 ; 2 3
    MC_4TAP_SCALED_H 0x80 ; 4 5
    movu m7, [srcq]
    movu m2, [r4]
    add srcq, ssq
    add r4, ssq
    mov [stk+0xb0], r4
    pshufb m7, m12
    pshufb m2, m14
    pmaddwd m7, m13
    pmaddwd m2, m15
    phaddd m7, m2
    paddd m7, [esp+0x00]
    psrad m7, [esp+0x10]
    packssdw m7, m7 ; 6 6
    mova m4, [stk+0x60]
    mova m5, [stk+0x70]
    mova m6, [stk+0x80]
    mov myd, mym
    mov rX, [esp+0x1f4]
    xor r5, r5
    shr myd, 6
    lea rX, [rX+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+rX*8+0]
    cmovnz r5, [base+subpel_filters+rX*8+4]
    mov r3, r3m
    shufps m1, m4, m5, q1032 ; 1 2
    shufps m2, m5, m6, q1032 ; 3 4
    shufps m3, m6, m7, q1032 ; 5 6
    mova [stk+0xa0], m7
    punpcklwd m0, m4, m1 ; 01
    punpckhwd m4, m1 ; 12
    punpcklwd m1, m5, m2 ; 23
    punpckhwd m5, m2 ; 34
    punpcklwd m2, m6, m3 ; 45
    punpckhwd m6, m3 ; 56
    movd m7, r4
    movd m3, r5
    mov r0, r0m
%if isput
    mov r1, r1m
%endif
    mov r4, [stk+0xb0]
    mova [stk+0xc0], m4 ; 12
    mova [stk+0x60], m1 ; 23
    mova [stk+0x70], m2 ; 45
    mova [stk+0x80], m5 ; 34
    mova [stk+0x90], m6 ; 56
%define m12 [stk+0x20]
%define m14 [stk+0x30]
%define m13 [stk+0x40]
%define m15 [stk+0x50]
%define hrnd_mem [esp+0x00]
%define hsh_mem [esp+0x10]
%define vsh_mem [esp+0x18]
%if isput
%define vrnd_mem [esp+0x20]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%define m10 m7
    punpckldq m10, m3
%endif
    punpcklbw m10, m10
    psraw m10, 8
    pshufd m3, m10, q0000
    pshufd m4, m10, q1111
    pshufd m5, m10, q2222
    pshufd m10, m10, q3333
%if ARCH_X86_32
%xdefine m8 m3
%xdefine m9 m6
%xdefine m11 m5
%xdefine m6 m4
    mova [stk+0x100], m3
    mova [stk+0x110], m4
    mova [stk+0x120], m5
    mova [stk+0x130], m10
%define m3 [stk+0x100]
%define m4 [stk+0x110]
%define m5 [stk+0x120]
%define m10 [stk+0x130]
    mova m7, [stk+0xc0]
    mova m8, [stk+0x80]
%endif
.dy1_w4_loop:
    movu m11, [srcq+ssq*0]
    movu m6, [srcq+ssq*1]
    pmaddwd m0, m3
    pmaddwd m7, m3
    pmaddwd m1, m4
    pmaddwd m8, m4
    pmaddwd m2, m5
    pmaddwd m9, m5
    paddd m1, m0
    paddd m8, m7
%if ARCH_X86_64
    movu m0, [srcq+r4]
    movu m7, [srcq+r6]
%else
    movu m0, [r4+ssq*0]
    movu m7, [r4+ssq*1]
    lea r4, [r4+ssq*2]
%endif
    lea srcq, [srcq+ssq*2]
    paddd m1, m2
    paddd m8, m9
    pshufb m11, m12
    pshufb m6, m12
    pmaddwd m11, m13
    pmaddwd m6, m13
    pshufb m0, m14
    pshufb m7, m14
    pmaddwd m0, m15
    pmaddwd m7, m15
    phaddd m11, m0
    phaddd m6, m7
    paddd m11, hrnd_mem
    paddd m6, hrnd_mem
    psrad m11, hsh_mem
    psrad m6, hsh_mem
    packssdw m11, m6 ; 7 8
%if ARCH_X86_64
    shufps m9, [stk+0x40], m11, q1032 ; 6 7
    mova m0, [stk+0x00]
    mova [stk+0x40], m11
%else
    shufps m9, [stk+0xa0], m11, q1032 ; 6 7
    mova m0, [stk+0x60]
    mova [stk+0xa0], m11
%endif
    punpcklwd m2, m9, m11 ; 67
    punpckhwd m9, m11 ; 78
    pmaddwd m6, m2, m10
    pmaddwd m7, m9, m10
%if isput
    movd m11, vsh_mem
%endif
    paddd m1, vrnd_mem
    paddd m8, vrnd_mem
    paddd m1, m6
    paddd m8, m7
%if ARCH_X86_64
    mova m7, [stk+0x10]
%else
    mova m7, [stk+0x80]
%endif
%if isput
    psrad m1, m11
    psrad m8, m11
%else
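    ; prep path: fixed 6-bit intermediate shift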
    psrad m1, 6
    psrad m8, 6
%endif
    packssdw m1, m8
%if ARCH_X86_64
    mova m8, [stk+0x30]
%else
    mova m8, [stk+0x90]
%endif
%if isput
    pxor m6, m6
    pmaxsw m1, m6
    pminsw m1, pxmaxm
    movq [dstq+dsq*0], m1
    movhps [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
%else
    mova [tmpq], m1
    add tmpq, 16
%endif
%if ARCH_X86_64
    mova m1, [stk+0x20]
    mova [stk+0x10], m8
    mova [stk+0x00], m1
    mova [stk+0x20], m2
    mova [stk+0x30], m9
%else
    mova m1, [stk+0x70]
    mova [stk+0x80], m8
    mova [stk+0x60], m1
    mova [stk+0x70], m2
    mova [stk+0x90], m9
%endif
    sub hd, 2
    jg .dy1_w4_loop
    MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy1_w8:
    mov dword [stk+0xf0], 1
    movifprep tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov dword [stk+0xf0], 2
    movifprep tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov dword [stk+0xf0], 4
    movifprep tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov dword [stk+0xf0], 8
    movifprep tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov dword [stk+0xf0], 16
    movifprep tmp_stridem, 256
.dy1_w_start:
    mov myd, mym
%if ARCH_X86_64
%ifidn %1, put
    movifnidn dsm, dsq
%endif
    mova [rsp+0x10], m11
    mova [rsp+0x20], m12
%define hround m11
%if isput
    mova [rsp+0x30], m13
%else
    mova m13, [base+pd_m524256]
%endif
    shr t0d, 16
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    movd m15, t0d
%else
%define hround [esp+0x00]
%define m12 [esp+0x10]
%define m10 [base+pd_0x3ff]
%define m8 m0
%xdefine m14 m4
%xdefine m15 m3
%if isprep
%define ssq ssm
%endif
    mov r5, [esp+0x1f0]
    mov r3, [esp+0x1f4]
    shr r5, 16
    movd m15, r5
    xor r5, r5
    shr myd, 6
    lea r3, [r3+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r3*8+0]
    cmovnz r5, [base+subpel_filters+r3*8+4]
    mov r0, r0m
    mov r3, r3m
%endif
    sub srcq, 6
    pslld m7, m8, 2 ; dx*4
    pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq m3, r4q
%else
    movd m5, r4
    movd m6, r5
    punpckldq m5, m6
    SWAP m3, m5
%endif
    punpcklbw m3, m3
    psraw m3, 8
    mova [stk+0x100], m7
    mova [stk+0x120], m15
    mov [stk+0x0f8], srcq
    mov [stk+0x130], r0q ; dstq / tmpq
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
%if ARCH_X86_64
    mova [stk+0x140], m0
    mova [stk+0x150], m1
    mova [stk+0x160], m2
    mova [stk+0x170], m3
%if UNIX64
    mov hm, hd
%endif
%else
    mova [stk+0x180], m0
    mova [stk+0x190], m1
    mova [stk+0x1a0], m2
    mova [stk+0x1b0], m3
    SWAP m5, m3
    mov r5, hm
    mov [stk+0x134], r5
%endif
    jmp .dy1_hloop
.dy1_hloop_prep:
    dec dword [stk+0x0f0]
    jz .ret
%if ARCH_X86_64
    add qword [stk+0x130], 16
    mov hd, hm
%else
    add dword [stk+0x130], 16
    mov r5, [stk+0x134]
    mov r0, [stk+0x130]
%endif
    mova m7, [stk+0x100]
    mova m14, [stk+0x110]
%if ARCH_X86_64
    mova m10, [base+pd_0x3ff]
    mova m11, [rsp+0x10]
%endif
    mova m15, [stk+0x120]
    mov srcq, [stk+0x0f8]
%if ARCH_X86_64
    mov r0q, [stk+0x130] ; dstq / tmpq
%else
    mov hm, r5
    mov r0m, r0
    mov r3, r3m
%endif
    paddd m14, m7
.dy1_hloop:
%if ARCH_X86_64
    mova m9, [base+pq_0x40000000]
%else
%define m9 [base+pq_0x40000000]
%endif
    pxor m1, m1
    psrld m2, m14, 10
    mova [stk], m2
    pand m6, m14, m10
    psrld m6, 6
    paddd m5, m15, m6
    pcmpeqd m6, m1
    pshufd m2, m5, q1032
%if ARCH_X86_64
    movd r4d, m5
    movd r6d, m2
    pshufd m5, m5, q0321
    pshufd m2, m2, q0321
    movd r7d, m5
    movd r9d, m2
    movq m0, [base+subpel_filters+r4*8]
    movq m1, [base+subpel_filters+r6*8]
    movhps m0, [base+subpel_filters+r7*8]
    movhps m1, [base+subpel_filters+r9*8]
%else
    movd r0, m5
    movd rX, m2
    pshufd m5, m5, q0321
    pshufd m2, m2, q0321
    movd r4, m5
    movd r5, m2
    movq m0, [base+subpel_filters+r0*8]
    movq m1, [base+subpel_filters+rX*8]
    movhps m0, [base+subpel_filters+r4*8]
    movhps m1, [base+subpel_filters+r5*8]
%endif
    paddd m14, m7 ; mx+dx*[4-7]
    pand m5, m14, m10
    psrld m5, 6
    paddd m15, m5
    pxor m2, m2
    pcmpeqd m5, m2
    mova [stk+0x110], m14
    pshufd m4, m15, q1032
%if ARCH_X86_64
    movd r10d, m15
    movd r11d, m4
    pshufd m15, m15, q0321
    pshufd m4, m4, q0321
    movd r13d, m15
    movd rXd, m4
    movq m2, [base+subpel_filters+r10*8]
    movq m3, [base+subpel_filters+r11*8]
    movhps m2, [base+subpel_filters+r13*8]
    movhps m3, [base+subpel_filters+ rX*8]
    psrld m14, 10
    movq r11, m14
    punpckhqdq m14, m14
    movq rX, m14
    mov r10d, r11d
    shr r11, 32
    mov r13d, rXd
    shr rX, 32
    mov r4d, [stk+ 0]
    mov r6d, [stk+ 4]
    mov r7d, [stk+ 8]
    mov r9d, [stk+12]
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m14, m5, q1100
    pshufd m5, m5, q3322
    pand m7, m9, m4
    pand m8, m9, m6
    pand m15, m9, m14
    pand m9, m9, m5
    pandn m4, m0
    pandn m6, m1
    pandn m14, m2
    pandn m5, m3
    por m7, m4
    por m8, m6
    por m15, m14
    por m9, m5
    punpcklbw m0, m7, m7
    punpckhbw m7, m7
    punpcklbw m1, m8, m8
    punpckhbw m8, m8
    psraw m0, 8
    psraw m7, 8
    psraw m1, 8
    psraw m8, 8
    punpcklbw m2, m15, m15
    punpckhbw m15, m15
    punpcklbw m3, m9, m9
    punpckhbw m9, m9
    psraw m2, 8
    psraw m15, 8
    psraw m3, 8
    psraw m9, 8
    mova [stk+0x10], m0
    mova [stk+0x20], m7
    mova [stk+0x30], m1
    mova [stk+0x40], m8
    mova [stk+0x50], m2
    mova [stk+0x60], m15
    mova [stk+0x70], m3
    mova [stk+0x80], m9
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova m5, [stk+0xd0]
    mova m1, [stk+0x90]
    mova m2, [stk+0xa0]
    mova m3, [stk+0xb0]
    mova m9, [stk+0xc0]
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m8 ; 67a
    punpckhwd m7, m8 ; 67b
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m9 ; 23a
    punpckhwd m3, m9 ; 23b
    mova m10, [stk+0x140]
    mova m11, [stk+0x150]
    mova m14, [stk+0x160]
    mova m15, [stk+0x170]
    mova [stk+0x90], m4
    mova [stk+0xa0], m5
    mova [stk+0xb0], m6
    mova [stk+0xc0], m7
%define hround [rsp+0x10]
%define shift [rsp+0x20]
%if isput
%define vround [rsp+0x30]
%else
%define vround [base+pd_m524256]
%endif
.dy1_vloop:
    pmaddwd m4, m0, m10
    pmaddwd m5, m1, m10
    pmaddwd m6, m2, m11
    pmaddwd m7, m3, m11
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
    pmaddwd m6, [stk+0x90], m14
    pmaddwd m7, [stk+0xa0], m14
    pmaddwd m8, [stk+0xb0], m15
    pmaddwd m9, [stk+0xc0], m15
    paddd m4, m6
    paddd m5, m7
%if isput
    pshufd m6, m12, q1032
%endif
    paddd m4, m8
    paddd m5, m9
%else
    movd r0, m15
    movd rX, m4
    pshufd m15, m15, q0321
    pshufd m4, m4, q0321
    movd r4, m15
    movd r5, m4
    mova m14, [stk+0x110]
    movq m2, [base+subpel_filters+r0*8]
    movq m3, [base+subpel_filters+rX*8]
    movhps m2, [base+subpel_filters+r4*8]
    movhps m3, [base+subpel_filters+r5*8]
    psrld m14, 10
    mova [stk+16], m14
    mov r0, [stk+ 0]
    mov rX, [stk+ 4]
    mov r4, [stk+ 8]
    mov r5, [stk+12]
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m0, m9, m4
    pand m1, m9, m6
    pand m2, m9, m7
    pand m3, m9, m5
    pandn m4, [stk+0x20]
    pandn m6, [stk+0x30]
    pandn m7, [stk+0x40]
    pandn m5, [stk+0x50]
    por m0, m4
    por m1, m6
    por m2, m7
    por m3, m5
    punpcklbw m4, m0, m0
    punpckhbw m0, m0
    punpcklbw m5, m1, m1
    punpckhbw m1, m1
    psraw m4, 8
    psraw m0, 8
    psraw m5, 8
    psraw m1, 8
    punpcklbw m6, m2, m2
    punpckhbw m2, m2
    punpcklbw m7, m3, m3
    punpckhbw m3, m3
    psraw m6, 8
    psraw m2, 8
    psraw m7, 8
    psraw m3, 8
    mova [stk+0x0a0], m4
    mova [stk+0x0b0], m0
    mova [stk+0x0c0], m5
    mova [stk+0x0d0], m1
    mova [stk+0x140], m6
    mova [stk+0x150], m2
    mova [stk+0x160], m7
    mova [stk+0x170], m3
    MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
    MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
    MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
    MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
    MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
    MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
    MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
    mova m5, [stk+0x60]
    mova m6, [stk+0x70]
    mova m7, [stk+0x80]
    mova m0, [stk+0x90]
    mov r0, r0m
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0 ; 67b
    mova [stk+0x60], m4
    mova [stk+0x70], m5
    mova [stk+0x80], m6
    mova [stk+0x90], m7
    mova m1, [stk+0x20]
    mova m2, [stk+0x30]
    mova m3, [stk+0x40]
    mova m4, [stk+0x50]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m4 ; 23a
    punpckhwd m3, m4 ; 23b
    mova m4, [stk+0x180]
    mova m5, [stk+0x190]
    mova m6, [stk+0x1a0]
    mova m7, [stk+0x1b0]
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
.dy1_vloop:
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m5
    pmaddwd m3, m5
    paddd m0, m2
    paddd m1, m3
    pmaddwd m2, [stk+0x60], m6
    pmaddwd m3, [stk+0x70], m6
    pmaddwd m4, [stk+0x80], m7
    pmaddwd m5, [stk+0x90], m7
%if isput
    movd m6, [esp+0x18]
%endif
    paddd m0, m2
    paddd m1, m3
    paddd m0, vrnd_mem
    paddd m1, vrnd_mem
    paddd m4, m0
    paddd m5, m1
%endif
%ifidn %1, put
    psrad m4, m6
    psrad m5, m6
    packssdw m4, m5
    pxor m7, m7
    pmaxsw m4, m7
    pminsw m4, pxmaxm
    mova [dstq], m4
    add dstq, dsm
%else
    psrad m4, 6
    psrad m5, 6
    packssdw m4, m5
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
    dec hd
    jz .dy1_hloop_prep
%if ARCH_X86_64
    movu m8, [srcq+r10*2]
    movu m9, [srcq+r11*2]
    movu m12, [srcq+r13*2]
    movu m13, [srcq+ rX*2]
    movu m4, [srcq+ r4*2]
    movu m5, [srcq+ r6*2]
    movu m6, [srcq+ r7*2]
    movu m7, [srcq+ r9*2]
    add srcq, ssq
    pmaddwd m8, [stk+0x50]
    pmaddwd m9, [stk+0x60]
    pmaddwd m12, [stk+0x70]
    pmaddwd m13, [stk+0x80]
    pmaddwd m4, [stk+0x10]
    pmaddwd m5, [stk+0x20]
    pmaddwd m6, [stk+0x30]
    pmaddwd m7, [stk+0x40]
    phaddd m8, m9
    phaddd m12, m13
    mova m9, [base+unpckw]
    mova m13, hround
    phaddd m4, m5
    phaddd m6, m7
    phaddd m8, m12
    phaddd m4, m6
    pshufd m5, m9, q1032
    pshufb m0, m9 ; 0a 1a
    pshufb m1, m9 ; 0b 1b
    pshufb m2, m5 ; 3a 2a
    pshufb m3, m5 ; 3b 2b
    mova m12, shift
    paddd m4, m13
    paddd m8, m13
    psrad m4, m12
    psrad m8, m12
    packssdw m4, m8
    pshufb m6, [stk+0x90], m9 ; 4a 5a
    pshufb m7, [stk+0xa0], m9 ; 4b 5b
    pshufb m8, [stk+0xb0], m5 ; 7a 6a
    pshufb m13, [stk+0xc0], m5 ; 7b 6b
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    punpcklwd m2, m6 ; 34a
    punpcklwd m3, m7 ; 34b
    punpckhwd m6, m8 ; 56a
    punpckhwd m7, m13 ; 56b
    punpcklwd m8, m4 ; 78a
    punpckhqdq m4, m4
    punpcklwd m13, m4 ; 78b
    mova [stk+0x90], m6
    mova [stk+0xa0], m7
    mova [stk+0xb0], m8
    mova [stk+0xc0], m13
    mova m13, vround
%else
    mov r0m, r0
    mov r3, r3m
    mov r0, [stk+ 0]
    mov rX, [stk+ 4]
    mov r4, [stk+ 8]
    mov r5, [stk+12]
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    mova m7, [base+unpckw]
    pshufd m4, m7, q1032
    pshufb m0, [stk+0x20], m7 ; 0a 1a
    pshufb m1, [stk+0x30], m7 ; 0b 1b
    pshufb m2, [stk+0x40], m4 ; 3a 2a
    pshufb m3, [stk+0x50], m4 ; 3b 2b
    pshufb m5, [stk+0x60], m7 ; 4a 5a
    pshufb m6, [stk+0x70], m7 ; 4b 5b
    pshufb m7, [stk+0x80], m4 ; 7a 6a
    punpckhwd m0, m2 ; 12a
    punpckhwd m1, m3 ; 12b
    punpcklwd m2, m5 ; 34a
    punpcklwd m3, m6 ; 34b
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    punpckhwd m5, m7 ; 56a
    mova [stk+0x60], m5
    pshufb m5, [stk+0x90], m4 ; 7b 6b
    punpcklwd m7, [stk+0xe0] ; 78a
    mova m4, [stk+0x180]
    punpckhwd m6, m5 ; 56b
    mova [stk+0x70], m6
    movq m6, [stk+0xe8]
    mova [stk+0x80], m7
    mova m7, [stk+0x1b0]
    punpcklwd m5, m6
    mova m6, [stk+0x1a0]
    mova [stk+0x90], m5
    mova m5, [stk+0x190]
    mov r0, r0m
%endif
    jmp .dy1_vloop
INIT_XMM ssse3
%if ARCH_X86_64
%define stk rsp+0x20
%endif
.dy2:
    movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add wq, base_reg
    jmp wq
%if isput
.dy2_w2:
%if ARCH_X86_64
    mov myd, mym
    mova [rsp+0x10], m13
%define vrnd_mem [rsp+0x10]
    movzx t0d, t0b
    sub srcq, 2
    movd m15, t0d
%else
%define m8 m0
%define m9 m1
%define m14 m4
%define m15 m3
%define m11 [esp+0x00]
%define m12 [esp+0x10]
%define vrnd_mem [esp+0x20]
    mov r1, r1m
    movzx r5, byte [esp+0x1f0]
    sub srcq, 2
    movd m15, r5
%endif
    pxor m9, m9
    punpckldq m9, m8
    paddd m14, m9 ; mx+dx*[0-1]
%if ARCH_X86_64
    mova m9, [base+pd_0x4000]
%endif
    pshufd m15, m15, q0000
    pand m8, m14, m10
    psrld m8, 6
    paddd m15, m8
    movd r4d, m15
    pshufd m15, m15, q0321
%if ARCH_X86_64
    movd r6d, m15
%else
    movd r3d, m15
%endif
    mova m5, [base+bdct_lb_q]
    mova m6, [base+spel_s_shuf2]
    movd m15, [base+subpel_filters+r4*8+2]
%if ARCH_X86_64
    movd m7, [base+subpel_filters+r6*8+2]
%else
    movd m7, [base+subpel_filters+r3*8+2]
%endif
    pxor m2, m2
    pcmpeqd m8, m2
    psrld m14, 10
    paddd m14, m14
%if ARCH_X86_32
    mov r3, r3m
    pshufb m14, m5
    paddb m14, m6
    mova [stk], m14
    SWAP m5, m0
    SWAP m6, m3
%define m15 m6
%endif
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*2]
    movu m2, [srcq+ssq*4]
    punpckldq m15, m7
%if ARCH_X86_64
    pshufb m14, m5
    paddb m14, m6
    pand m9, m8
    pandn m8, m15
    SWAP m15, m8
    por m15, m9
    movu m4, [srcq+ssq*1]
    movu m5, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    movu m6, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
%else
    pand m7, m5, [base+pd_0x4000]
    pandn m5, m15
    por m5, m7
%define m15 m5
    mov myd, mym
    mov r5, [esp+0x1f4]
    xor r3, r3
    shr myd, 6
    lea r5, [r5+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r5*8+0]
    cmovnz r3, [base+subpel_filters+r5*8+4]
    mov [stk+0x20], r3
    mov r3, r3m
%endif
    punpcklbw m15, m15
    psraw m15, 8
    REPX {pshufb x, m14}, m0, m1, m2
    REPX {pmaddwd x, m15}, m0, m1, m2
%if ARCH_X86_64
    REPX {pshufb x, m14}, m4, m5, m6
    REPX {pmaddwd x, m15}, m4, m5, m6
    phaddd m0, m1
    phaddd m1, m2
    phaddd m4, m5
    phaddd m5, m6
    REPX {paddd x, m11}, m0, m1, m4, m5
    REPX {psrad x, m12}, m0, m1, m4, m5
    packssdw m0, m1 ; 0 2 2 4
    packssdw m4, m5 ; 1 3 3 5
    SWAP m2, m4
    movq m10, r4
%else
    mova [stk+0x10], m15
    phaddd m0, m1
    phaddd m1, m2
    movu m2, [srcq+ssq*1]
    movu m7, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    movu m6, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    REPX {pshufb x, m14}, m2, m7, m6
    REPX {pmaddwd x, m15}, m2, m7, m6
%define m14 [stk+0x00]
%define m15 [stk+0x10]
    phaddd m2, m7
    phaddd m7, m6
    REPX {paddd x, m11}, m0, m1, m2, m7
    REPX {psrad x, m12}, m0, m1, m2, m7
    packssdw m0, m1
    packssdw m2, m7
%define m8 m6
%define m9 m4
%define m10 m5
    movd m10, r4
    movd m9, [stk+0x20]
    punpckldq m10, m9
%endif
    punpcklbw m10, m10
    psraw m10, 8
    pshufd m7, m10, q0000
    pshufd m8, m10, q1111
    pshufd m9, m10, q2222
    pshufd m10, m10, q3333
%if ARCH_X86_32
    mova [stk+0x50], m7
    mova [stk+0x60], m8
    mova [stk+0x70], m9
    mova [stk+0x80], m10
%xdefine m13 m7
%define m7 [stk+0x50]
%define m8 [stk+0x60]
%define m9 [stk+0x70]
%define m10 [stk+0x80]
%endif
    punpcklwd m1, m0, m2 ; 01 23
    punpckhwd m3, m0, m2 ; 23 45
%if ARCH_X86_32
    mov r4, r0m
%define dstq r4
    mova [stk+0x20], m3
    mova [stk+0x30], m0
%endif
.dy2_w2_loop:
    movu m4, [srcq+ssq*0]
    movu m5, [srcq+ssq*1]
    movu m6, [srcq+ssq*2]
    movu m13, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    pmaddwd m3, m8
    REPX {pshufb x, m14}, m4, m5, m6, m13
    REPX {pmaddwd x, m15}, m4, m5, m6, m13
    phaddd m4, m5
    phaddd m6, m13
    pmaddwd m5, m1, m7
    paddd m4, m11
    paddd m6, m11
    psrad m4, m12
    psrad m6, m12
    packssdw m4, m6 ; 6 7 8 9
    paddd m5, m3
    pshufd m3, m4, q2200
    pshufd m4, m4, q3311
    palignr m3, m0, 12 ; 4 6 6 8
    palignr m4, m2, 12 ; 5 7 7 9
    mova m0, m3
    mova m2, m4
    punpcklwd m1, m3, m4
    punpckhwd m3, m4
    pmaddwd m6, m1, m9
    pmaddwd m4, m3, m10
    paddd m5, vrnd_mem
    paddd m6, m4
    paddd m5, m6
    pshufd m4, m12, q1032
    pxor m6, m6
    psrad m5, m4
    packssdw m5, m5
    pmaxsw m5, m6
    pminsw m5, pxmaxm
    movd [dstq+dsq*0], m5
    pshuflw m5, m5, q1032
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .dy2_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy2_w4:
%if ARCH_X86_64
    mov myd, mym
    mova [rsp+0x10], m11
    mova [rsp+0x20], m12
%if isput
    mova [rsp+0x30], m13
%define vrnd_mem [rsp+0x30]
%define stk rsp+0x40
%else
%define vrnd_mem [base+pd_m524256]
%define stk rsp+0x30
%endif
    movzx t0d, t0b
    sub srcq, 2
    movd m15, t0d
%else
%define m10 [base+pd_0x3ff]
%define m9 [base+pd_0x4000]
%define m8 m0
%xdefine m14 m4
%define m15 m3
%if isprep
%define ssq r3
%endif
    movzx r5, byte [esp+0x1f0]
    sub srcq, 2
    movd m15, r5
%endif
    pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
    mova m9, [base+pd_0x4000]
%endif
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
    pand m0, m14, m10
    psrld m0, 6
    paddd m15, m0
    pshufd m7, m15, q1032
%if ARCH_X86_64
    movd r4d, m15
    movd r11d, m7
    pshufd m15, m15, q0321
    pshufd m7, m7, q0321
    movd r6d, m15
    movd r13d, m7
    mova m10, [base+bdct_lb_q+ 0]
    mova m11, [base+bdct_lb_q+16]
    movd m13, [base+subpel_filters+ r4*8+2]
    movd m2, [base+subpel_filters+ r6*8+2]
    movd m15, [base+subpel_filters+r11*8+2]
    movd m4, [base+subpel_filters+r13*8+2]
%else
    movd r1, m15
    movd r4, m7
    pshufd m15, m15, q0321
    pshufd m7, m7, q0321
    movd r3, m15
    movd r5, m7
    mova m5, [base+bdct_lb_q+ 0]
    mova m6, [base+bdct_lb_q+16]
    movd m1, [base+subpel_filters+r1*8+2]
    movd m2, [base+subpel_filters+r3*8+2]
    movd m3, [base+subpel_filters+r4*8+2]
    movd m7, [base+subpel_filters+r5*8+2]
    SWAP m4, m7
    mov r3, r3m
%if isprep
    lea ss3q, [ssq*3]
%endif
%define m10 m5
%define m11 m6
%define m12 m1
%define m13 m1
%endif
    psrld m14, 10
    paddd m14, m14
    punpckldq m13, m2
    punpckldq m15, m4
    punpcklqdq m13, m15
    pxor m2, m2
    pcmpeqd m0, m2
%if ARCH_X86_64
    pand m9, m0
%else
    pand m2, m9, m0
%define m9 m2
    SWAP m7, m4
%endif
    pandn m0, m13
%if ARCH_X86_64
    SWAP m13, m0
%else
%define m13 m0
%endif
    por m13, m9
    punpckhbw m15, m13, m13
    punpcklbw m13, m13
    psraw m15, 8
    psraw m13, 8
    pshufb m12, m14, m10
    pshufb m14, m11
    mova m10, [base+spel_s_shuf2]
    movd r4d, m14
    shr r4d, 24
%if ARCH_X86_32
    mova [stk+0x40], m13
    mova [stk+0x50], m15
    pxor m2, m2
%endif
    pshufb m7, m14, m2
    psubb m14, m7
    paddb m12, m10
    paddb m14, m10
%if ARCH_X86_64
    lea r6, [r4+ssq*1]
    lea r11, [r4+ssq*2]
    lea r13, [r4+ss3q ]
    movu m1, [srcq+ssq*0]
    movu m8, [srcq+ssq*2]
    movu m9, [srcq+ssq*1]
    movu m10, [srcq+ss3q ]
    movu m7, [srcq+r4 ]
    movu m2, [srcq+r11 ]
    movu m3, [srcq+r6 ]
    movu m4, [srcq+r13 ]
    lea srcq, [srcq+ssq*4]
    REPX {pshufb x, m12}, m1, m9, m8, m10
    REPX {pmaddwd x, m13}, m1, m9, m8, m10
    REPX {pshufb x, m14}, m7, m3, m2, m4
    REPX {pmaddwd x, m15}, m7, m3, m2, m4
    mova m5, [rsp+0x10]
    movd xm6, [rsp+0x20]
    phaddd m1, m7
    phaddd m8, m2
    phaddd m9, m3
    phaddd m10, m4
    movu m2, [srcq+ssq*0]
    movu m3, [srcq+ssq*1]
    REPX {paddd x, m5}, m1, m9, m8, m10
    REPX {psrad x, xm6}, m1, m9, m8, m10
    packssdw m1, m8 ; 0 2
    packssdw m9, m10 ; 1 3
    movu m0, [srcq+r4 ]
    movu m8, [srcq+r6 ]
    lea srcq, [srcq+ssq*2]
    REPX {pshufb x, m12}, m2, m3
    REPX {pmaddwd x, m13}, m2, m3
    REPX {pshufb x, m14}, m0, m8
    REPX {pmaddwd x, m15}, m0, m8
    phaddd m2, m0
    phaddd m3, m8
    shr myd, 6
    mov r9d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r9q, [base+subpel_filters+myq*8]
    REPX {paddd x, m5}, m2, m3
    REPX {psrad x, xm6}, m2, m3
    packssdw m2, m3 ; 4 5
    pshufd m3, m2, q1032 ; 5 _
    punpcklwd m0, m1, m9 ; 01
    punpckhwd m1, m9 ; 23
    punpcklwd m2, m3 ; 45
    movq m10, r9
%define hrnd_mem [rsp+0x10]
%define hsh_mem [rsp+0x20]
%define vsh_mem [rsp+0x28]
%if isput
%define vrnd_mem [rsp+0x30]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%else
    mova [stk+0x20], m12
    mova [stk+0x30], m14
    add r4, srcq
    MC_4TAP_SCALED_H 0x60 ; 0 1
    MC_4TAP_SCALED_H 0x70 ; 2 3
    MC_4TAP_SCALED_H 0x80 ; 4 5
    mov [stk+0xe0], r4
    mova m3, [base+spel_s_shuf8]
    mova m0, [stk+0x60]
    mova m1, [stk+0x70]
    mova m2, [stk+0x80]
    mov myd, mym
    mov rX, [esp+0x1f4]
    xor r5, r5
    shr myd, 6
    lea rX, [rX+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+rX*8+0]
    cmovnz r5, [base+subpel_filters+rX*8+4]
    mov r3, r3m
    pshufb m0, m3 ; 01
    pshufb m1, m3 ; 23
    pshufb m2, m3 ; 45
    movd m7, r4
    movd m4, r5
    mov r5, r0m
%if isput
    mov r1, r1m
%endif
    mov r4, [stk+0xe0]
%define dstq r5
%define tmpq r5
%define m12 [stk+0x20]
%define m14 [stk+0x30]
%define m13 [stk+0x40]
%define m15 [stk+0x50]
%define hrnd_mem [esp+0x00]
%define hsh_mem [esp+0x10]
%define vsh_mem [esp+0x18]
%if isput
%define vrnd_mem [esp+0x20]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%define m10 m7
    punpckldq m10, m4
%endif
    punpcklbw m10, m10
    psraw m10, 8
    pshufd m3, m10, q0000
    pshufd m4, m10, q1111
    pshufd m5, m10, q2222
    pshufd m10, m10, q3333
%if ARCH_X86_32
%xdefine m8 m3
%xdefine m9 m6
%xdefine m11 m5
%xdefine m6 m4
    mova [stk+0x100], m3
    mova [stk+0x110], m4
    mova [stk+0x120], m5
    mova [stk+0x130], m10
%define m3 [stk+0x100]
%define m4 [stk+0x110]
%define m5 [stk+0x120]
%define m10 [stk+0x130]
%endif
.dy2_w4_loop:
    pmaddwd m8, m0, m3
    pmaddwd m9, m1, m3
    mova m0, m2
    pmaddwd m1, m4
    pmaddwd m11, m2, m4
    paddd m8, vrnd_mem
    paddd m9, vrnd_mem
    pmaddwd m2, m5
    paddd m8, m1
    paddd m9, m11
    paddd m8, m2
    movu m6, [srcq+ssq*0]
    movu m1, [srcq+ssq*2]
%if ARCH_X86_64
    movu m11, [srcq+r4 ]
    movu m2, [srcq+r11]
%else
    movu m11, [r4+ssq*0]
    movu m2, [r4+ssq*2]
%endif
    pshufb m6, m12
    pshufb m1, m12
    pmaddwd m6, m13
    pmaddwd m1, m13
    pshufb m11, m14
    pshufb m2, m14
    pmaddwd m11, m15
    pmaddwd m2, m15
    phaddd m6, m11
    phaddd m1, m2
    paddd m6, hrnd_mem
    paddd m1, hrnd_mem
    psrad m6, hsh_mem
    psrad m1, hsh_mem
    movu m7, [srcq+ssq*1]
    movu m11, [srcq+ss3q ]
    packssdw m6, m1 ; 6 8
%if ARCH_X86_64
    movu m2, [srcq+r6 ]
    movu m1, [srcq+r13]
%else
    movu m2, [r4+ssq*1]
    movu m1, [r4+ss3q ]
%endif
    pshufb m7, m12
    pshufb m11, m12
    pmaddwd m7, m13
    pmaddwd m11, m13
    pshufb m2, m14
    pshufb m1, m14
    pmaddwd m2, m15
    pmaddwd m1, m15
    phaddd m7, m2
    phaddd m11, m1
    paddd m7, hrnd_mem
    paddd m11, hrnd_mem
    psrad m7, hsh_mem
    psrad m11, hsh_mem
    packssdw m7, m11 ; 7 9
%if ARCH_X86_32
    lea r4, [r4+ssq*4]
%endif
    lea srcq, [srcq+ssq*4]
    punpcklwd m1, m6, m7 ; 67
    punpckhwd m6, m7 ; 89
    mova m2, m6
    pmaddwd m11, m1, m5
    pmaddwd m7, m1, m10
    pmaddwd m6, m10
    paddd m9, m11
%if isput
    movd m11, vsh_mem
%endif
    paddd m8, m7
    paddd m9, m6
%if isput
    psrad m8, m11
    psrad m9, m11
    packssdw m8, m9
    pxor m7, m7
    pmaxsw m8, m7
    pminsw m8, pxmaxm
    movq [dstq+dsq*0], m8
    movhps [dstq+dsq*1], m8
    lea dstq, [dstq+dsq*2]
%else
    psrad m8, 6
    psrad m9, 6
    packssdw m8, m9
    mova [tmpq], m8
    add tmpq, 16
%endif
    sub hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET ; why not jz .ret?
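; the wider .dy2 sizes below are handled as a series of 8-pixel columns;
; [stk+0xf0] holds the column count that .dy2_hloop_prep counts down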
INIT_XMM ssse3
.dy2_w8:
    mov dword [stk+0xf0], 1
    movifprep tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov dword [stk+0xf0], 2
    movifprep tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov dword [stk+0xf0], 4
    movifprep tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov dword [stk+0xf0], 8
    movifprep tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov dword [stk+0xf0], 16
    movifprep tmp_stridem, 256
.dy2_w_start:
    mov myd, mym
%if ARCH_X86_64
%ifidn %1, put
    movifnidn dsm, dsq
%endif
    mova [rsp+0x10], m11
    mova [rsp+0x20], m12
%define hround m11
%if isput
    mova [rsp+0x30], m13
%else
    mova m13, [base+pd_m524256]
%endif
    shr t0d, 16
    shr myd, 6
    mov r4d, 64 << 24
    lea myd, [t1+myq]
    cmovnz r4q, [base+subpel_filters+myq*8]
    movd m15, t0d
%else
%define hround [esp+0x00]
%define m12 [esp+0x10]
%define m10 [base+pd_0x3ff]
%define m8 m0
%xdefine m14 m4
%xdefine m15 m3
%if isput
%define dstq r0
%else
%define tmpq r0
%define ssq ssm
%endif
    mov r5, [esp+0x1f0]
    mov r3, [esp+0x1f4]
    shr r5, 16
    movd m15, r5
    xor r5, r5
    shr myd, 6
    lea r3, [r3+myd]
    mov r4, 64 << 24
    cmovnz r4, [base+subpel_filters+r3*8+0]
    cmovnz r5, [base+subpel_filters+r3*8+4]
    mov r0, r0m
    mov r3, r3m
%endif
    sub srcq, 6
    pslld m7, m8, 2 ; dx*4
    pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
    pshufd m15, m15, q0000
    paddd m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq m3, r4q
%else
    movd m5, r4
    movd m6, r5
    punpckldq m5, m6
    SWAP m3, m5
%endif
    punpcklbw m3, m3
    psraw m3, 8
    mova [stk+0x100], m7
    mova [stk+0x120], m15
    mov [stk+0x0f8], srcq
    mov [stk+0x130], r0q ; dstq / tmpq
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
%if ARCH_X86_64
    mova [stk+0x140], m0
    mova [stk+0x150], m1
    mova [stk+0x160], m2
    mova [stk+0x170], m3
%if UNIX64
    mov hm, hd
%endif
%else
    mova [stk+0x180], m0
    mova [stk+0x190], m1
    mova [stk+0x1a0], m2
    mova [stk+0x1b0], m3
    SWAP m5, m3
    mov r5, hm
    mov [stk+0x134], r5
%endif
    jmp .dy2_hloop
.dy2_hloop_prep:
    dec dword [stk+0x0f0]
    jz .ret
%if ARCH_X86_64
    add qword [stk+0x130], 16
    mov hd, hm
%else
    add dword [stk+0x130], 16
    mov r5, [stk+0x134]
    mov r0, [stk+0x130]
%endif
    mova m7, [stk+0x100]
    mova m14, [stk+0x110]
%if ARCH_X86_64
    mova m10, [base+pd_0x3ff]
    mova m11, [rsp+0x10]
%endif
    mova m15, [stk+0x120]
    mov srcq, [stk+0x0f8]
%if ARCH_X86_64
    mov r0q, [stk+0x130] ; dstq / tmpq
%else
    mov hm, r5
    mov r0m, r0
    mov r3, r3m
%endif
    paddd m14, m7
.dy2_hloop:
%if ARCH_X86_64
    mova m9, [base+pq_0x40000000]
%else
%define m9 [base+pq_0x40000000]
%endif
    pxor m1, m1
    psrld m2, m14, 10
    mova [stk], m2
    pand m6, m14, m10
    psrld m6, 6
    paddd m5, m15, m6
    pcmpeqd m6, m1
    pshufd m2, m5, q1032
%if ARCH_X86_64
    movd r4d, m5
    movd r6d, m2
    pshufd m5, m5, q0321
    pshufd m2, m2, q0321
    movd r7d, m5
    movd r9d, m2
    movq m0, [base+subpel_filters+r4*8]
    movq m1, [base+subpel_filters+r6*8]
    movhps m0, [base+subpel_filters+r7*8]
    movhps m1, [base+subpel_filters+r9*8]
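    ; the 32-bit branch below performs the same filter gather with the
    ; smaller register file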
%else
    movd r0, m5
    movd rX, m2
    pshufd m5, m5, q0321
    pshufd m2, m2, q0321
    movd r4, m5
    movd r5, m2
    movq m0, [base+subpel_filters+r0*8]
    movq m1, [base+subpel_filters+rX*8]
    movhps m0, [base+subpel_filters+r4*8]
    movhps m1, [base+subpel_filters+r5*8]
%endif
    paddd m14, m7 ; mx+dx*[4-7]
    pand m5, m14, m10
    psrld m5, 6
    paddd m15, m5
    pxor m2, m2
    pcmpeqd m5, m2
    mova [stk+0x110], m14
    pshufd m4, m15, q1032
%if ARCH_X86_64
    movd r10d, m15
    movd r11d, m4
    pshufd m15, m15, q0321
    pshufd m4, m4, q0321
    movd r13d, m15
    movd rXd, m4
    movq m2, [base+subpel_filters+r10*8]
    movq m3, [base+subpel_filters+r11*8]
    movhps m2, [base+subpel_filters+r13*8]
    movhps m3, [base+subpel_filters+ rX*8]
    psrld m14, 10
    movq r11, m14
    punpckhqdq m14, m14
    movq rX, m14
    mov r10d, r11d
    shr r11, 32
    mov r13d, rXd
    shr rX, 32
    mov r4d, [stk+ 0]
    mov r6d, [stk+ 4]
    mov r7d, [stk+ 8]
    mov r9d, [stk+12]
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m14, m5, q1100
    pshufd m5, m5, q3322
    pand m7, m9, m4
    pand m8, m9, m6
    pand m15, m9, m14
    pand m9, m9, m5
    pandn m4, m0
    pandn m6, m1
    pandn m14, m2
    pandn m5, m3
    por m7, m4
    por m8, m6
    por m15, m14
    por m9, m5
    punpcklbw m0, m7, m7
    punpckhbw m7, m7
    punpcklbw m1, m8, m8
    punpckhbw m8, m8
    psraw m0, 8
    psraw m7, 8
    psraw m1, 8
    psraw m8, 8
    punpcklbw m2, m15, m15
    punpckhbw m15, m15
    punpcklbw m3, m9, m9
    punpckhbw m9, m9
    psraw m2, 8
    psraw m15, 8
    psraw m3, 8
    psraw m9, 8
    mova [stk+0x10], m0
    mova [stk+0x20], m7
    mova [stk+0x30], m1
    mova [stk+0x40], m8
    mova [stk+0x50], m2
    mova [stk+0x60], m15
    mova [stk+0x70], m3
    mova [stk+0x80], m9
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova m5, [stk+0xd0]
    mova m1, [stk+0x90]
    mova m2, [stk+0xa0]
    mova m3, [stk+0xb0]
    mova m9, [stk+0xc0]
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6 ; 45b
    punpcklwd m6, m7, m8 ; 67a
    punpckhwd m7, m8 ; 67b
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2 ; 01b
    punpcklwd m2, m3, m9 ; 23a
    punpckhwd m3, m9 ; 23b
    mova m10, [stk+0x140]
    mova m11, [stk+0x150]
    mova m14, [stk+0x160]
    mova m15, [stk+0x170]
    mova [stk+0x90], m4
    mova [stk+0xa0], m5
    mova [stk+0xb0], m6
    mova [stk+0xc0], m7
%define hround [rsp+0x10]
%define shift [rsp+0x20]
%if isput
%define vround [rsp+0x30]
%else
%define vround [base+pd_m524256]
%endif
.dy2_vloop:
    pmaddwd m4, m0, m10
    pmaddwd m5, m1, m10
    pmaddwd m6, m2, m11
    pmaddwd m7, m3, m11
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
    pmaddwd m6, [stk+0x90], m14
    pmaddwd m7, [stk+0xa0], m14
    pmaddwd m8, [stk+0xb0], m15
    pmaddwd m9, [stk+0xc0], m15
.dy2_vloop:
    pmaddwd m4, m0, m10
    pmaddwd m5, m1, m10
    pmaddwd m6, m2, m11
    pmaddwd m7, m3, m11
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
    pmaddwd m6, [stk+0x90], m14
    pmaddwd m7, [stk+0xa0], m14
    pmaddwd m8, [stk+0xb0], m15
    pmaddwd m9, [stk+0xc0], m15
    paddd m4, m6
    paddd m5, m7
 %if isput
    pshufd m6, m12, q1032
 %endif
    paddd m4, m8
    paddd m5, m9
%else
    movd r0, m15
    movd rX, m4
    pshufd m15, m15, q0321
    pshufd m4, m4, q0321
    movd r4, m15
    movd r5, m4
    mova m14, [stk+0x110]
    movq m2, [base+subpel_filters+r0*8]
    movq m3, [base+subpel_filters+rX*8]
    movhps m2, [base+subpel_filters+r4*8]
    movhps m3, [base+subpel_filters+r5*8]
    psrld m14, 10
    mova [stk+16], m14
    mov r0, [stk+ 0]
    mov rX, [stk+ 4]
    mov r4, [stk+ 8]
    mov r5, [stk+12]
    mova [stk+0x20], m0
    mova [stk+0x30], m1
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    pshufd m4, m6, q1100
    pshufd m6, m6, q3322
    pshufd m7, m5, q1100
    pshufd m5, m5, q3322
    pand m0, m9, m4
    pand m1, m9, m6
    pand m2, m9, m7
    pand m3, m9, m5
    pandn m4, [stk+0x20]
    pandn m6, [stk+0x30]
    pandn m7, [stk+0x40]
    pandn m5, [stk+0x50]
    por m0, m4
    por m1, m6
    por m2, m7
    por m3, m5
    punpcklbw m4, m0, m0
    punpckhbw m0, m0
    punpcklbw m5, m1, m1
    punpckhbw m1, m1
    psraw m4, 8
    psraw m0, 8
    psraw m5, 8
    psraw m1, 8
    punpcklbw m6, m2, m2
    punpckhbw m2, m2
    punpcklbw m7, m3, m3
    punpckhbw m3, m3
    psraw m6, 8
    psraw m2, 8
    psraw m7, 8
    psraw m3, 8
    mova [stk+0x0a0], m4
    mova [stk+0x0b0], m0
    mova [stk+0x0c0], m5
    mova [stk+0x0d0], m1
    mova [stk+0x140], m6
    mova [stk+0x150], m2
    mova [stk+0x160], m7
    mova [stk+0x170], m3
    MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
    MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
    MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
    MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
    MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
    MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
    MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
    mova m5, [stk+0x60]
    mova m6, [stk+0x70]
    mova m7, [stk+0x80]
    mova m0, [stk+0x90]
    mov r0, r0m
    punpcklwd m4, m5, m6 ; 45a
    punpckhwd m5, m6     ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0     ; 67b
    mova [stk+0x60], m4
    mova [stk+0x70], m5
    mova [stk+0x80], m6
    mova [stk+0x90], m7
    mova m1, [stk+0x20]
    mova m2, [stk+0x30]
    mova m3, [stk+0x40]
    mova m4, [stk+0x50]
    punpcklwd m0, m1, m2 ; 01a
    punpckhwd m1, m2     ; 01b
    punpcklwd m2, m3, m4 ; 23a
    punpckhwd m3, m4     ; 23b
    mova m4, [stk+0x180]
    mova m5, [stk+0x190]
    mova m6, [stk+0x1a0]
    mova m7, [stk+0x1b0]
    mova [stk+0x40], m2
    mova [stk+0x50], m3
.dy2_vloop:
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m5
    pmaddwd m3, m5
    paddd m0, m2
    paddd m1, m3
    pmaddwd m2, [stk+0x60], m6
    pmaddwd m3, [stk+0x70], m6
    pmaddwd m4, [stk+0x80], m7
    pmaddwd m5, [stk+0x90], m7
 %if isput
    movd m6, [esp+0x18]
 %endif
    paddd m0, m2
    paddd m1, m3
    paddd m0, vrnd_mem
    paddd m1, vrnd_mem
    paddd m4, m0
    paddd m5, m1
%endif
%ifidn %1, put
    psrad m4, m6
    psrad m5, m6
    packssdw m4, m5
    pxor m7, m7
    pmaxsw m4, m7
    pminsw m4, pxmaxm
    mova [dstq], m4
    add dstq, dsm
%else
    psrad m4, 6
    psrad m5, 6
    packssdw m4, m5
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
    dec hd
    jz .dy2_hloop_prep
%if ARCH_X86_64
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
    mova [stk+0xd0], m4
    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
    mova m4, [stk+0xd0]
    mova m0, m2 ; 01a
    mova m1, m3 ; 01b
    mova m2, [stk+0x90] ; 23a
    mova m3, [stk+0xa0] ; 23b
    mova m5, [stk+0xb0] ; 45a
    mova m6, [stk+0xc0] ; 45b
    punpcklwd m7, m4, m8 ; 67a
    punpckhwd m4, m8     ; 67b
    mova [stk+0x90], m5
    mova [stk+0xa0], m6
    mova [stk+0xb0], m7
    mova [stk+0xc0], m4
%else
    mov r0m, r0
    mov r3, r3m
    MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
    MC_8TAP_SCALED_H 0xa0, 0 ; 9
    mova m7, [stk+0xe0]
    mova m2, [stk+0x60] ; 23a
    mova m3, [stk+0x70] ; 23b
    mova m4, [stk+0x80] ; 45a
    mova m5, [stk+0x90] ; 45b
    punpcklwd m6, m7, m0 ; 67a
    punpckhwd m7, m0     ; 67b
    mova m0, [stk+0x40] ; 01a
    mova m1, [stk+0x50] ; 01b
    mova [stk+0x40], m2
    mova [stk+0x50], m3
    mova [stk+0x60], m4
    mova [stk+0x70], m5
    mova m4, [stk+0x180]
    mova m5, [stk+0x190]
    mova [stk+0x80], m6
    mova [stk+0x90], m7
    mova m6, [stk+0x1a0]
    mova m7, [stk+0x1b0]
    mov r0, r0m
%endif
    jmp .dy2_vloop
INIT_XMM ssse3
.ret:
    MC_8TAP_SCALED_RET 0
%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
 %define r0m [rstk+stack_offset+ 4]
 %define r1m [rstk+stack_offset+ 8]
 %define r2m [rstk+stack_offset+12]
 %define r3m [rstk+stack_offset+16]
%endif
%undef isput
%undef isprep
%endmacro

%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_16bpc
    mov t0d, (5*15 << 16) | 5*15
    mov t1d, (5*15 << 16) | 5*15
    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif

%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif

%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED prep

%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif

%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for
; that by allocating 16 bytes more stack space so that stack offsets match up.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
                                          mx, tmp, alpha, beta, \
                                          filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                 filter, mx, my
%define m8 [esp+16*13]
%define m9 [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq 0
%if STACK_ALIGNMENT < 16
 %define dstm [esp+4*65]
 %define dsm [esp+4*66]
%else
 %define dstm r0m
 %define dsm r1m
%endif
%endif
%define base filterq-$$
    mov t0d, r7m
    LEA filterq, $$
    shr t0d, 11
%if ARCH_X86_64
    movddup m8, [base+warp8x8t_rnd]
%else
    movddup m1, [base+warp8x8t_rnd]
    mov r1, r1m
    add r1, r1
    mova m8, m1
    mov r1m, r1 ; ds *= 2
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
    jmp .start
.loop:
%if ARCH_X86_64
    lea dstq, [dstq+dsq*4]
%else
    add dstq, dsm
    mov dstm, dstq
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
    mov dstq, dstm
%endif
    paddd m1, m8
    paddd m2, m8
    psrad m1, 15
    psrad m2, 15
    packssdw m1, m2
    mova [dstq+dsq*0], m1
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
    mov dstq, dstm
    add dstq, dsm
%endif
    paddd m1, m8
    paddd m2, m8
    psrad m1, 15
    psrad m2, 15
    packssdw m1, m2
    mova [dstq+dsq*2], m1
    dec cntd
    jg .loop
    RET

%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
                                                 mx, tmp, alpha, beta, \
                                                 filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                filter, mx, my
%endif
    mov t0d, r7m
    LEA filterq, $$
    shr t0d, 11
%if ARCH_X86_64
    movddup m8, [base+warp8x8_rnd2+t0*8]
    movd m9, r7m ; pixel_max
    pshufb m9, [base+pw_256]
%else
    movddup m1, [base+warp8x8_rnd2+t0*8]
    movd m2, r7m ; pixel_max
    pshufb m2, [base+pw_256]
    mova m8, m1
    mova m9, m2
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_64
    lea dstq, [dstq+dsq*2]
%else
    add dstq, dsm
    mov dstm, dstq
%endif
    call .main2
.start:
%if ARCH_X86_32
    mov dstq, dstm
%endif
    psrad m1, 16
    psrad m2, 16
    packssdw m1, m2
    pmaxsw m1, m6
    pmulhrsw m1, m8
    pminsw m1, m9
    mova [dstq+dsq*0], m1
    call .main3
%if ARCH_X86_32
    mov dstq, dstm
    add dstq, dsm
%endif
    psrad m1, 16
    psrad m2, 16
    packssdw m1, m2
    pmaxsw m1, m6
    pmulhrsw m1, m8
    pminsw m1, m9
    mova [dstq+dsq*1], m1
    dec cntd
    jg .loop
    RET
ALIGN function_align
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov deltaq, r5m
    mov mxd, r6m
%endif
    movd m0, [base+warp8x8_shift+t0*4]
    movddup m7, [base+warp8x8_rnd1+t0*8]
    add filterq, mc_warp_filter-$$
%if ARCH_X86_64
    movsx alphad, word [deltaq+2*0]
    movsx betad, word [deltaq+2*1]
    movsx gammad, word [deltaq+2*2]
    movsx deltad, word [deltaq+2*3]
    lea tmpq, [ssq*3]
    add mxd, 512+(64<<10)
    sub srcq, tmpq ; src -= ss*3
    imul tmpd, alphad, -7
    mov myd, r7m
    add betad, tmpd ; beta -= alpha*7
    imul tmpd, gammad, -7
    add myd, 512+(64<<10)
    mov cntd, 4
    add deltad, tmpd ; delta -= gamma*7
%else
 %if STACK_ALIGNMENT < 16
  %assign stack_offset stack_offset - gprsize
 %endif
    mov r3d, r5m ; abcd
 %if STACK_ALIGNMENT < 16
    mov r0, r1m ; dst
    mov r1, r2m ; ds
    mov [esp+gprsize+4*65], r0
    mov [esp+gprsize+4*66], r1
 %endif
    movsx alphad, word [r3+2*0]
    movsx r2d, word [r3+2*1]
    movsx gammad, word [r3+2*2]
    movsx r3d, word [r3+2*3]
    imul r5d, alphad, -7
    add r2d, r5d ; beta -= alpha*7
    imul r5d, gammad, -7
    mov [esp+gprsize+4*60], r2d
    add r3d, r5d ; delta -= gamma*7
    mov [esp+gprsize+4*61], r3d
    mov r3d, r4m ; ss
    mov srcq, r3m
    mov mxd, r6m
    mov myd, r7m
    mov dword [esp+gprsize+4*63], 4 ; cnt
    mov [esp+gprsize+4*62], r3
    lea r3, [r3*3]
    add mxd, 512+(64<<10)
    add myd, 512+(64<<10)
    sub srcq, r3 ; src -= ss*3
 %if STACK_ALIGNMENT < 16
  %assign stack_offset stack_offset + gprsize
 %endif
%endif
    mova [rsp+gprsize], m0
    pxor m6, m6
    call .h
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 01
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 1], m1
    mova [rsp+gprsize+16* 4], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 12
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 7], m1
    mova [rsp+gprsize+16*10], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 23
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 2], m1
    mova [rsp+gprsize+16* 5], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 34
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 8], m1
    mova [rsp+gprsize+16*11], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 45
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 3], m1
    mova [rsp+gprsize+16* 6], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 56
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 9], m1
    mova [rsp+gprsize+16*12], m5
    mova m5, m0
.main2:
    call .h
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
    lea tmpd, [myq+gammaq]
    shr myd, 10
    movq m4, [filterq+myq*8] ; a
    lea myd, [tmpq+gammaq]
    shr tmpd, 10
    movq m2, [filterq+tmpq*8] ; b
    lea tmpd, [myq+gammaq]
    shr myd, 10
    movq m3, [filterq+myq*8] ; c
    lea myd, [tmpq+gammaq]
    shr tmpd, 10
    movq m1, [filterq+tmpq*8] ; d
    lea tmpd, [myq+gammaq]
    shr myd, 10
    punpcklwd m4, m2
    punpcklwd m3, m1
    punpckldq m2, m4, m3
    punpckhdq m4, m3
    punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    pmaddwd m1, [rsp+gprsize+16*%1]
    punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    mova m2, [rsp+gprsize+16*%2]
    pmaddwd m3, m2
    mova [rsp+gprsize+16*%1], m2
    paddd m1, m3
    punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    mova m2, [rsp+gprsize+16*%3]
    pmaddwd m3, m2
    mova [rsp+gprsize+16*%2], m2
    paddd m1, m3
    punpcklwd m3, m5, m0 ; 67
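; The two warp entry points above share .main/.main2/.main3/.h. In loose C
; terms (a hedged sketch; filter8(), mid[] and the constants' placement are
; illustrative names, not dav1d's):
;
;   // .h - horizontal pass, one 8-pixel row of the intermediate:
;   int mx = mx0 + 512 + (64 << 10) + y*beta;     // beta pre-biased by
;   for (int x = 0; x < 8; x++, mx += alpha)      //   -alpha*7 per row
;       mid[y][x] = (filter8(&src[x-3], mc_warp_filter[mx >> 10]) + rnd1)
;                   >> warp8x8_shift;
;   // WARP_V - vertical pass; my steps by gamma per column and by
;   // delta (pre-biased by -gamma*7) per row:
;   dst[y][x] = filter8(&mid[y][x], mc_warp_filter[my >> 10]);
;
; warp8x8t keeps a rounded intermediate (psrad by 15), while warp8x8
; rounds with pmulhrsw and clips against pixel_max.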
    punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd m2, m3
    mova [rsp+gprsize+16*%3], m3
    paddd m1, m2
    movq m4, [filterq+myq*8] ; e
    lea myd, [tmpq+gammaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8] ; f
    lea tmpd, [myq+gammaq]
    shr myd, 10
    movq m2, [filterq+myq*8] ; g
%if ARCH_X86_64
    lea myd, [tmpq+deltaq] ; my += delta
%else
    mov myd, [esp+gprsize+4*61]
    add myd, tmpd
%endif
    shr tmpd, 10
    punpcklwd m4, m3
    movq m3, [filterq+tmpq*8] ; h
    punpcklwd m2, m3
    punpckldq m3, m4, m2
    punpckhdq m4, m2
    punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
    pmaddwd m2, [rsp+gprsize+16*%4]
    punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
    mova m3, [rsp+gprsize+16*%5]
    pmaddwd m6, m3
    mova [rsp+gprsize+16*%4], m3
    pxor m3, m3
    paddd m2, m6
    punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
    mova m6, [rsp+gprsize+16*%6]
    pmaddwd m3, m6
    mova [rsp+gprsize+16*%5], m6
    punpckhwd m5, m0
    pxor m6, m6
    paddd m2, m3
    punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
    pmaddwd m3, m5
    mova [rsp+gprsize+16*%6], m5
    mova m5, m0
    paddd m2, m3
%endmacro
    WARP_V 1, 2, 3, 4, 5, 6
    ret
.main3:
    call .h
    WARP_V 7, 8, 9, 10, 11, 12
    ret
ALIGN function_align
.h:
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    punpcklbw m0, m6, m3
    movu m3, [srcq-6]
    pmaddwd m0, m3 ; 0
    lea mxd, [tmpq+alphaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    punpcklbw m2, m6, m3
    movu m3, [srcq-4]
    pmaddwd m2, m3 ; 1
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    phaddd m0, m2 ; 0 1
    punpcklbw m2, m6, m3
    movu m3, [srcq-2]
    pmaddwd m2, m3 ; 2
    lea mxd, [tmpq+alphaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    punpcklbw m1, m6, m3
    movu m3, [srcq+0]
    pmaddwd m1, m3 ; 3
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    phaddd m2, m1 ; 2 3
    punpcklbw m1, m6, m3
    movu m3, [srcq+2]
    pmaddwd m1, m3 ; 4
    lea mxd, [tmpq+alphaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    phaddd m0, m2 ; 0 1 2 3
    punpcklbw m2, m6, m3
    movu m3, [srcq+4]
    pmaddwd m2, m3 ; 5
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    phaddd m1, m2 ; 4 5
    punpcklbw m2, m6, m3
    movu m3, [srcq+6]
    pmaddwd m2, m3 ; 6
%if ARCH_X86_64
    lea mxd, [tmpq+betaq] ; mx += beta
%else
    mov mxd, [esp+gprsize*2+4*60]
    add mxd, tmpd
%endif
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    punpcklbw m4, m6, m3
    movu m3, [srcq+8]
%if ARCH_X86_64
    add srcq, ssq
%else
    add srcq, [esp+gprsize*2+4*62]
%endif
    pmaddwd m3, m4 ; 7
    phaddd m2, m3 ; 6 7
    phaddd m1, m2 ; 4 5 6 7
    paddd m0, m7
    paddd m1, m7
    psrad m0, [rsp+gprsize*2]
    psrad m1, [rsp+gprsize*2]
    packssdw m0, m1
    ret

%macro BIDIR_FN 0
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w4:
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jg .w4_loop
.ret:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w8:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jne .w8_loop
    RET
.w16_loop:
    call .main
    add dstq, strideq
.w16:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    dec hd
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add dstq, strideq
.w32:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    call .main
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    call .main
    mova [dstq+16*6], m0
    mova [dstq+16*7], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+16* 0], m0
    mova [dstq+16* 1], m1
    call .main
    mova [dstq+16* 2], m0
    mova [dstq+16* 3], m1
    call .main
    mova [dstq+16* 4], m0
    mova [dstq+16* 5], m1
    call .main
    mova [dstq+16* 6], m0
    mova [dstq+16* 7], m1
    call .main
    mova [dstq+16* 8], m0
    mova [dstq+16* 9], m1
    call .main
    mova [dstq+16*10], m0
    mova [dstq+16*11], m1
    call .main
    mova [dstq+16*12], m0
    mova [dstq+16*13], m1
    call .main
    mova [dstq+16*14], m0
    mova [dstq+16*15], m1
    dec hd
    jg .w128_loop
    RET
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
    LEA r6, avg_ssse3_table
    tzcnt wd, wm
    mov t0d, r6m ; pixel_max
    movsxd wq, [r6+wq*4]
    shr t0d, 11
    movddup m2, [base+bidir_rnd+t0*8]
    movddup m3, [base+bidir_mul+t0*8]
    movifnidn hd, hm
    add wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    mova m0, [tmp1q+16*0]
    paddsw m0, [tmp2q+16*0]
    mova m1, [tmp1q+16*1]
    paddsw m1, [tmp2q+16*1]
    add tmp1q, 16*2
    add tmp2q, 16*2
    pmaxsw m0, m2
    pmaxsw m1, m2
    psubsw m0, m2
    psubsw m1, m2
    pmulhw m0, m3
    pmulhw m1, m3
    ret
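; avg C model (hedged; the constants follow from bidir_rnd/bidir_mul):
;
;   // 10-bit case: intermediate_bits ib = 4 (12-bit: ib = 2)
;   int rnd = (1 << ib) + 2*8192;  // 8192 = PREP_BIAS
;   dst[x] = iclip((tmp1[x] + tmp2[x] + rnd) >> (ib + 1), 0, pixel_max);
;
; The asm avoids widening: paddsw saturates the biased sum, pmaxsw/psubsw
; against bidir_rnd (-16400/-16388 = -(2*8192 + (1 << ib))) clamps the low
; end and re-centers, and pmulhw by bidir_mul performs the final shift.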
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
    LEA r6, w_avg_ssse3_table
    tzcnt wd, wm
    mov t0d, r6m ; weight
    movd m6, r7m ; pixel_max
    movddup m5, [base+pd_65538]
    movsxd wq, [r6+wq*4]
    pshufb m6, [base+pw_256]
    add wq, r6
    lea r6d, [t0-16]
    shl t0d, 16
    sub t0d, r6d ; 16-weight, weight
    paddw m5, m6
    mov r6d, t0d
    shl t0d, 2
    test dword r7m, 0x800
    cmovnz r6d, t0d
    movifnidn hd, hm
    movd m4, r6d
    pslld m5, 7
    pxor m7, m7
    pshufd m4, m4, q0000
    BIDIR_FN
ALIGN function_align
.main:
    mova m2, [tmp1q+16*0]
    mova m0, [tmp2q+16*0]
    punpckhwd m3, m0, m2
    punpcklwd m0, m2
    mova m2, [tmp1q+16*1]
    mova m1, [tmp2q+16*1]
    add tmp1q, 16*2
    add tmp2q, 16*2
    pmaddwd m3, m4
    pmaddwd m0, m4
    paddd m3, m5
    paddd m0, m5
    psrad m3, 8
    psrad m0, 8
    packssdw m0, m3
    punpckhwd m3, m1, m2
    punpcklwd m1, m2
    pmaddwd m3, m4
    pmaddwd m1, m4
    paddd m3, m5
    paddd m1, m5
    psrad m3, 8
    psrad m1, 8
    packssdw m1, m3
    pminsw m0, m6
    pminsw m1, m6
    pmaxsw m0, m7
    pmaxsw m1, m7
    ret
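; w_avg C model (hedged sketch; 10-bit shown, the 0x800 pixel_max test
; rescales the packed weights for 12-bit):
;
;   dst[x] = iclip((tmp1[x]*w + tmp2[x]*(16 - w) + rnd) >> 8, 0, pixel_max);
;
; One pmaddwd per word pair does both products at once: m4 holds the packed
; (16-weight, weight) factors and m5 the prebuilt bias+rounding constant.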
%if ARCH_X86_64
cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%else
cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
%define hd dword r5m
%define m8 [base+pw_64]
%endif
%define base r6-mask_ssse3_table
    LEA r6, mask_ssse3_table
    tzcnt wd, wm
    mov t0d, r7m ; pixel_max
    shr t0d, 11
    movsxd wq, [r6+wq*4]
    movddup m6, [base+bidir_rnd+t0*8]
    movddup m7, [base+bidir_mul+t0*8]
%if ARCH_X86_64
    mova m8, [base+pw_64]
    movifnidn hd, hm
%endif
    add wq, r6
    mov maskq, r6mp
    BIDIR_FN
ALIGN function_align
.main:
    movq m3, [maskq+8*0]
    mova m0, [tmp1q+16*0]
    mova m4, [tmp2q+16*0]
    pxor m5, m5
    punpcklbw m3, m5
    punpckhwd m2, m0, m4
    punpcklwd m0, m4
    psubw m1, m8, m3
    punpckhwd m4, m3, m1 ; m, 64-m
    punpcklwd m3, m1
    pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
    pmaddwd m0, m3
    movq m3, [maskq+8*1]
    mova m1, [tmp1q+16*1]
    mova m4, [tmp2q+16*1]
    add maskq, 8*2
    add tmp1q, 16*2
    add tmp2q, 16*2
    psrad m2, 5
    psrad m0, 5
    packssdw m0, m2
    punpcklbw m3, m5
    punpckhwd m2, m1, m4
    punpcklwd m1, m4
    psubw m5, m8, m3
    punpckhwd m4, m3, m5 ; m, 64-m
    punpcklwd m3, m5
    pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
    pmaddwd m1, m3
    psrad m2, 5
    psrad m1, 5
    packssdw m1, m2
    pmaxsw m0, m6
    pmaxsw m1, m6
    psubsw m0, m6
    psubsw m1, m6
    pmulhw m0, m7
    pmulhw m1, m7
    ret
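; mask C model (hedged sketch, restating the inline comments in .main above):
;
;   int v = (tmp1[x]*m[x] + tmp2[x]*(64 - m[x])) >> 5;  // pmaddwd pairs
;   dst[x] = finish(v);  // finish() = the same bidir_rnd clamp and
;                        // bidir_mul scale used by avg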
cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA t0, w_mask_420_ssse3_table
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movd m0, r7m ; sign
    shr r6d, 11
    movsxd wq, [t0+wq*4]
%if ARCH_X86_64
    mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m4, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*4
    mova [rsp+16*0], m1
    mova [rsp+16*1], m2
    mova [rsp+16*2], m3
    mova [rsp+16*3], m4
 %define m8 [rsp+gprsize+16*0]
 %define m9 [rsp+gprsize+16*1]
 %define m10 [rsp+gprsize+16*2]
 %define m11 [rsp+gprsize+16*3]
%endif
    movd m7, [base+pw_2]
    psubw m7, m0
    pshufb m7, [base+pw_256]
    add wq, t0
    movifnidn hd, r5m
    mov maskq, r6mp
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 4
.w4:
    movq [dstq+strideq*0], m0
    phaddw m2, m3
    movhps [dstq+strideq*1], m0
    phaddd m2, m2
    lea dstq, [dstq+strideq*2]
    paddw m2, m7
    movq [dstq+strideq*0], m1
    psrlw m2, 2
    movhps [dstq+strideq*1], m1
    packuswb m2, m2
    movd [maskq], m2
    sub hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 4
.w8:
    mova [dstq+strideq*0], m0
    paddw m2, m3
    phaddw m2, m2
    mova [dstq+strideq*1], m1
    paddw m2, m7
    psrlw m2, 2
    packuswb m2, m2
    movd [maskq], m2
    sub hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 8
.w16:
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    paddw m2, [dstq+strideq*1+16*0]
    paddw m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*1], m1
    paddw m2, m7
    psrlw m2, 2
    packuswb m2, m2
    movq [maskq], m2
    sub hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 16
.w32:
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*0+16*2], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*3], m1
    call .main
    paddw m2, [dstq+strideq*1+16*0]
    paddw m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*2]
    paddw m2, [dstq+strideq*1+16*3]
    mova [dstq+strideq*1+16*2], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb m3, m2
    mova [maskq], m3
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 16*2
.w64:
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*2], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*2], m0
    mova [dstq+strideq*1+16*4], m3
    mova [dstq+strideq*0+16*3], m1
    call .main
    mova [dstq+strideq*1+16*5], m2
    mova [dstq+strideq*0+16*4], m0
    mova [dstq+strideq*1+16*6], m3
    mova [dstq+strideq*0+16*5], m1
    call .main
    mova [dstq+strideq*0+16*6], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*7], m2
    mova [dstq+strideq*0+16*7], m1
    call .main
    paddw m2, [dstq+strideq*1+16*1]
    paddw m3, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    paddw m2, [dstq+strideq*1+16*3]
    paddw m3, [dstq+strideq*1+16*4]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*2], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb m3, m2
    mova [maskq+16*0], m3
    call .main
    paddw m2, [dstq+strideq*1+16*5]
    paddw m3, [dstq+strideq*1+16*6]
    mova [dstq+strideq*1+16*4], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*6], m2
    mova [dstq+strideq*1+16*5], m1
    call .main
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*6]
    paddw m2, [dstq+strideq*1+16*7]
    mova [dstq+strideq*1+16*6], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*7], m1
    packuswb m3, m2
    mova [maskq+16*1], m3
    sub hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 16*4
.w128:
    mova [dstq+strideq*1+16* 1], m2
    mova [dstq+strideq*0+16* 0], m0
    mova [dstq+strideq*1+16* 2], m3
    mova [dstq+strideq*0+16* 1], m1
    call .main
    mova [dstq+strideq*1+16* 3], m2
    mova [dstq+strideq*0+16* 2], m0
    mova [dstq+strideq*1+16* 4], m3
    mova [dstq+strideq*0+16* 3], m1
    call .main
    mova [dstq+strideq*1+16* 5], m2
    mova [dstq+strideq*0+16* 4], m0
    mova [dstq+strideq*1+16* 6], m3
    mova [dstq+strideq*0+16* 5], m1
    call .main
    mova [dstq+strideq*1+16* 7], m2
    mova [dstq+strideq*0+16* 6], m0
    mova [dstq+strideq*1+16* 8], m3
    mova [dstq+strideq*0+16* 7], m1
    call .main
    mova [dstq+strideq*1+16* 9], m2
    mova [dstq+strideq*0+16* 8], m0
    mova [dstq+strideq*1+16*10], m3
    mova [dstq+strideq*0+16* 9], m1
    call .main
    mova [dstq+strideq*1+16*11], m2
    mova [dstq+strideq*0+16*10], m0
    mova [dstq+strideq*1+16*12], m3
    mova [dstq+strideq*0+16*11], m1
    call .main
    mova [dstq+strideq*1+16*13], m2
    mova [dstq+strideq*0+16*12], m0
    mova [dstq+strideq*1+16*14], m3
    mova [dstq+strideq*0+16*13], m1
    call .main
    mova [dstq+strideq*0+16*14], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*15], m2
    mova [dstq+strideq*0+16*15], m1
    call .main
    paddw m2, [dstq+strideq*1+16* 1]
    paddw m3, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16* 2], m2
    mova [dstq+strideq*1+16* 1], m1
    call .main
    paddw m2, [dstq+strideq*1+16* 3]
    paddw m3, [dstq+strideq*1+16* 4]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 2], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16* 3], m1
    packuswb m3, m2
    mova [maskq+16*0], m3
    call .main
    paddw m2, [dstq+strideq*1+16* 5]
    paddw m3, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 4], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16* 6], m2
    mova [dstq+strideq*1+16* 5], m1
    call .main
    paddw m2, [dstq+strideq*1+16* 7]
    paddw m3, [dstq+strideq*1+16* 8]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 6], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16* 7], m1
    packuswb m3, m2
    mova [maskq+16*1], m3
    call .main
    paddw m2, [dstq+strideq*1+16* 9]
    paddw m3, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16* 8], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*10], m2
    mova [dstq+strideq*1+16* 9], m1
    call .main
    paddw m2, [dstq+strideq*1+16*11]
    paddw m3, [dstq+strideq*1+16*12]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16*10], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*11], m1
    packuswb m3, m2
    mova [maskq+16*2], m3
    call .main
    paddw m2, [dstq+strideq*1+16*13]
    paddw m3, [dstq+strideq*1+16*14]
    mova [dstq+strideq*1+16*12], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*14], m2
    mova [dstq+strideq*1+16*13], m1
    call .main
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*14]
    paddw m2, [dstq+strideq*1+16*15]
    mova [dstq+strideq*1+16*14], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*15], m1
    packuswb m3, m2
    mova [maskq+16*3], m3
    sub hd, 2
    jg .w128_loop
    RET
ALIGN function_align
.main:
%macro W_MASK 2 ; dst/tmp_offset, mask
    mova m%1, [tmp1q+16*%1]
    mova m%2, [tmp2q+16*%1]
    punpcklwd m4, m%2, m%1
    punpckhwd m5, m%2, m%1
    psubsw m%1, m%2
    pabsw m%1, m%1
    psubusw m6, m8, m%1
    psrlw m6, 10      ; 64-m
    psubw m%2, m9, m6 ; m
    punpcklwd m%1, m6, m%2
    punpckhwd m6, m%2
    pmaddwd m%1, m4
    pmaddwd m6, m5
    psrad m%1, 5
    psrad m6, 5
    packssdw m%1, m6
    pmaxsw m%1, m10
    psubsw m%1, m10
    pmulhw m%1, m11
%endmacro
    W_MASK 0, 2
    W_MASK 1, 3
    add tmp1q, 16*2
    add tmp2q, 16*2
    ret
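; W_MASK C model (hedged sketch; 10-bit case). The blend mask is derived
; from the difference of the two intermediates, then applied as in
; mask_16bpc:
;
;   int d = abs(tmp1[x] - tmp2[x]);
;   int m = imin(38 + ((d + 32) >> 10), 64); // via psubusw against pw_27615
;   dst[x] = finish((tmp1[x]*m + tmp2[x]*(64 - m)) >> 5);
;
; For 420 output the four ms of each 2x2 block are summed and rounded as
; (sum + 2 - sign) >> 2, which is what the pw_2-derived m7 and the
; phaddw/paddw/psrlw sequences in the width loops implement.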
cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA t0, w_mask_422_ssse3_table
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movd m7, r7m ; sign
    shr r6d, 11
    movsxd wq, [t0+wq*4]
%if ARCH_X86_64
    mova m8, [base+pw_27615]
    mova m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    mova m1, [base+pw_27615]
    mova m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m4, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*4
    mova [rsp+16*0], m1
    mova [rsp+16*1], m2
    mova [rsp+16*2], m3
    mova [rsp+16*3], m4
%endif
    pxor m0, m0
    add wq, t0
    pshufb m7, m0
    movifnidn hd, r5m
    mov maskq, r6mp
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w4:
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w8:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add dstq, strideq
.w32:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    call .main
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    call .main
    mova [dstq+16*6], m0
    mova [dstq+16*7], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+16* 0], m0
    mova [dstq+16* 1], m1
    call .main
    mova [dstq+16* 2], m0
    mova [dstq+16* 3], m1
    call .main
    mova [dstq+16* 4], m0
    mova [dstq+16* 5], m1
    call .main
    mova [dstq+16* 6], m0
    mova [dstq+16* 7], m1
    call .main
    mova [dstq+16* 8], m0
    mova [dstq+16* 9], m1
    call .main
    mova [dstq+16*10], m0
    mova [dstq+16*11], m1
    call .main
    mova [dstq+16*12], m0
    mova [dstq+16*13], m1
    call .main
    mova [dstq+16*14], m0
    mova [dstq+16*15], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    W_MASK 0, 2
    W_MASK 1, 3
    phaddw m2, m3
    add tmp1q, 16*2
    add tmp2q, 16*2
    packuswb m2, m2
    pxor m3, m3
    psubb m2, m7
    pavgb m2, m3
    movq [maskq], m2
    add maskq, 8
    ret

cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA t0, w_mask_444_ssse3_table
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    shr r6d, 11
    movsxd wq, [t0+wq*4]
%if ARCH_X86_64
    mova m8, [base+pw_27615]
    mova m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    mova m1, [base+pw_27615]
    mova m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m7, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*3
    mova [rsp+16*0], m1
    mova [rsp+16*1], m2
    mova [rsp+16*2], m3
 %define m11 m7
%endif
    add wq, t0
    movifnidn hd, r5m
    mov maskq, r6mp
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w4:
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w8:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add dstq, strideq
.w32:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    call .main
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    call .main
    mova [dstq+16*6], m0
    mova [dstq+16*7], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+16* 0], m0
    mova [dstq+16* 1], m1
    call .main
    mova [dstq+16* 2], m0
    mova [dstq+16* 3], m1
    call .main
    mova [dstq+16* 4], m0
    mova [dstq+16* 5], m1
    call .main
    mova [dstq+16* 6], m0
    mova [dstq+16* 7], m1
    call .main
    mova [dstq+16* 8], m0
    mova [dstq+16* 9], m1
    call .main
    mova [dstq+16*10], m0
    mova [dstq+16*11], m1
    call .main
    mova [dstq+16*12], m0
    mova [dstq+16*13], m1
    call .main
    mova [dstq+16*14], m0
    mova [dstq+16*15], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    W_MASK 0, 2
    W_MASK 1, 3
    packuswb m2, m3
    add tmp1q, 16*2
    add tmp2q, 16*2
    mova [maskq], m2
    add maskq, 16
    ret

; (a * (64 - m) + b * m + 32) >> 6
; = (((b - a) * m + 32) >> 6) + a
; = (((b - a) * (m << 9) + 16384) >> 15) + a
; except m << 9 overflows int16_t when m == 64 (which is possible),
; but if we negate m it works out (-64 << 9 == -32768).
; = (((a - b) * (m * -512) + 16384) >> 15) + a
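; pmulhrsw computes (x*y + 16384) >> 15, which is why the mask is scaled by
; -512 (pw_m512) below; in hedged C terms:
;
;   dst = a + (((a - b) * -(m << 9) + 16384) >> 15)  // what the asm does
;       = a + (((b - a) * m + 32) >> 6)              // the target formula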
cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
%define base r6-blend_ssse3_table
    LEA r6, blend_ssse3_table
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    movifnidn maskq, maskmp
    mova m7, [base+pw_m512]
    add wq, r6
    lea stride3q, [strideq*3]
    pxor m6, m6
    jmp wq
.w4:
    mova m5, [maskq]
    movq m0, [dstq+strideq*0]
    movhps m0, [dstq+strideq*1]
    movq m1, [dstq+strideq*2]
    movhps m1, [dstq+stride3q ]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    add maskq, 16
    add tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [dstq+strideq*2], m1
    movhps [dstq+stride3q ], m1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w4
    RET
.w8:
    mova m5, [maskq]
    mova m0, [dstq+strideq*0]
    mova m1, [dstq+strideq*1]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    add maskq, 16
    add tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8
    RET
.w16:
    mova m5, [maskq]
    mova m0, [dstq+16*0]
    mova m1, [dstq+16*1]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    add maskq, 16
    add tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    add dstq, strideq
    dec hd
    jg .w16
    RET
.w32:
    mova m5, [maskq+16*0]
    mova m0, [dstq+16*0]
    mova m1, [dstq+16*1]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova m5, [maskq+16*1]
    mova m0, [dstq+16*2]
    mova m1, [dstq+16*3]
    psubw m2, m0, [tmpq+16*2]
    psubw m3, m1, [tmpq+16*3]
    add maskq, 32
    add tmpq, 64
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    add dstq, strideq
    dec hd
    jg .w32
    RET

cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
%define base r5-blend_v_ssse3_table
    LEA r5, blend_v_ssse3_table
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r5+wq*4]
    add wq, r5
    jmp wq
.w2:
    movd m4, [base+obmc_masks+2*2]
.w2_loop:
    movd m0, [dstq+strideq*0]
    movd m2, [tmpq+4*0]
    movd m1, [dstq+strideq*1]
    movd m3, [tmpq+4*1]
    add tmpq, 4*2
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    paddw m0, m2
    paddw m1, m3
    movd [dstq+strideq*0], m0
    movd [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w2_loop
    RET
.w4:
    movddup m2, [base+obmc_masks+4*2]
.w4_loop:
    movq m0, [dstq+strideq*0]
    movhps m0, [dstq+strideq*1]
    mova m1, [tmpq]
    add tmpq, 8*2
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w4_loop
    RET
.w8:
    mova m4, [base+obmc_masks+8*2]
.w8_loop:
    mova m0, [dstq+strideq*0]
    mova m2, [tmpq+16*0]
    mova m1, [dstq+strideq*1]
    mova m3, [tmpq+16*1]
    add tmpq, 16*2
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    paddw m0, m2
    paddw m1, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8_loop
    RET
.w16:
    mova m4, [base+obmc_masks+16*2]
    movq m5, [base+obmc_masks+16*3]
.w16_loop:
    mova m0, [dstq+16*0]
    mova m2, [tmpq+16*0]
    mova m1, [dstq+16*1]
    mova m3, [tmpq+16*1]
    add tmpq, 16*2
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    add dstq, strideq
    dec hd
    jg .w16_loop
    RET
.w32:
%if WIN64
    movaps [rsp+8], m6
%endif
    mova m4, [base+obmc_masks+16*4]
    mova m5, [base+obmc_masks+16*5]
    mova m6, [base+obmc_masks+16*6]
.w32_loop:
    mova m0, [dstq+16*0]
    mova m2, [tmpq+16*0]
    mova m1, [dstq+16*1]
    mova m3, [tmpq+16*1]
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    mova m2, [dstq+16*2]
    paddw m1, m3
    mova m3, [tmpq+16*2]
    add tmpq, 16*4
    psubw m3, m2
    pmulhrsw m3, m6
    paddw m2, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    add dstq, strideq
    dec hd
    jg .w32_loop
%if WIN64
    movaps m6, [rsp+8]
%endif
    RET

%macro BLEND_H_ROW 2-3 0 ; dst_off, tmp_off, inc_tmp
    mova m0, [dstq+16*(%1+0)]
    mova m2, [tmpq+16*(%2+0)]
    mova m1, [dstq+16*(%1+1)]
    mova m3, [tmpq+16*(%2+1)]
%if %3
    add tmpq, 16*%3
%endif
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m5
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*(%1+0)], m0
    mova [dstq+16*(%1+1)], m1
%endmacro

cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_ssse3_table
    LEA r6, blend_h_ssse3_table
    tzcnt wd, wm
    mov hd, hm
    movsxd wq, [r6+wq*4]
    movddup m4, [base+blend_shuf]
    lea maskq, [base+obmc_masks+hq*2]
    lea hd, [hq*3]
    add wq, r6
    shr hd, 2 ; h * 3/4
    lea maskq, [maskq+hq*2]
    neg hq
    jmp wq
.w2:
    movd m0, [dstq+dsq*0]
    movd m2, [dstq+dsq*1]
    movd m3, [maskq+hq*2]
    movq m1, [tmpq]
    add tmpq, 4*2
    punpckldq m0, m2
    punpcklwd m3, m3
    psubw m1, m0
    pmulhrsw m1, m3
    paddw m0, m1
    movd [dstq+dsq*0], m0
    psrlq m0, 32
    movd [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w2
    RET
.w4:
    mova m3, [base+blend_shuf]
.w4_loop:
    movq m0, [dstq+dsq*0]
    movhps m0, [dstq+dsq*1]
    movd m2, [maskq+hq*2]
    mova m1, [tmpq]
    add tmpq, 8*2
    psubw m1, m0
    pshufb m2, m3
    pmulhrsw m1, m2
    paddw m0, m1
    movq [dstq+dsq*0], m0
    movhps [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w4_loop
    RET
.w8:
    movddup m5, [base+blend_shuf+8]
%if WIN64
    movaps [rsp+ 8], m6
    movaps [rsp+24], m7
%endif
.w8_loop:
    movd m7, [maskq+hq*2]
    mova m0, [dstq+dsq*0]
    mova m2, [tmpq+16*0]
    mova m1, [dstq+dsq*1]
    mova m3, [tmpq+16*1]
    add tmpq, 16*2
    pshufb m6, m7, m4
    psubw m2, m0
    pshufb m7, m5
    psubw m3, m1
    pmulhrsw m2, m6
    pmulhrsw m3, m7
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w8_loop
%if WIN64
    movaps m6, [rsp+ 8]
    movaps m7, [rsp+24]
%endif
    RET
.w16:
    movd m5, [maskq+hq*2]
    pshufb m5, m4
    BLEND_H_ROW 0, 0, 2
    add dstq, dsq
    inc hq
    jl .w16
    RET
.w32:
    movd m5, [maskq+hq*2]
    pshufb m5, m4
    BLEND_H_ROW 0, 0
    BLEND_H_ROW 2, 2, 4
    add dstq, dsq
    inc hq
    jl .w32
    RET
.w64:
    movd m5, [maskq+hq*2]
    pshufb m5, m4
    BLEND_H_ROW 0, 0
    BLEND_H_ROW 2, 2
    BLEND_H_ROW 4, 4
    BLEND_H_ROW 6, 6, 8
    add dstq, dsq
    inc hq
    jl .w64
    RET
.w128:
    movd m5, [maskq+hq*2]
    pshufb m5, m4
    BLEND_H_ROW 0, 0
    BLEND_H_ROW 2, 2
    BLEND_H_ROW 4, 4
    BLEND_H_ROW 6, 6, 16
    BLEND_H_ROW 8, -8
    BLEND_H_ROW 10, -6
    BLEND_H_ROW 12, -4
    BLEND_H_ROW 14, -2
    add dstq, dsq
    inc hq
    jl .w128
    RET

; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh total filled size
; iw, ih, copied block -> fill bottom, right
; x, y, offset in bw/bh -> fill top, left
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
                                   y, dst, dstride, src, sstride, \
                                   bottomext, rightext, blk
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

%if ARCH_X86_64
 %define reg_zero r12q
 %define reg_tmp r10
 %define reg_src srcq
 %define reg_bottomext bottomextq
 %define reg_rightext rightextq
 %define reg_blkm r9m
%else
 %define reg_zero r6
 %define reg_tmp r0
 %define reg_src r1
 %define reg_bottomext r0
 %define reg_rightext r1
 %define reg_blkm r2m
%endif
    ;
    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor reg_zero, reg_zero
    lea reg_tmp, [ihq-1]
    cmp yq, ihq
    cmovs reg_tmp, yq
    test yq, yq
    cmovs reg_tmp, reg_zero
%if ARCH_X86_64
    imul reg_tmp, sstrideq
    add srcq, reg_tmp
%else
    imul reg_tmp, sstridem
    mov reg_src, srcm
    add reg_src, reg_tmp
%endif
    ;
    ; ref += iclip(x, 0, iw - 1)
    lea reg_tmp, [iwq-1]
    cmp xq, iwq
    cmovs reg_tmp, xq
    test xq, xq
    cmovs reg_tmp, reg_zero
    lea reg_src, [reg_src+reg_tmp*2]
%if ARCH_X86_32
    mov srcm, reg_src
%endif
    ;
    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
    mov r1, r1m ; restore bh
%endif
    lea reg_bottomext, [yq+bhq]
    sub reg_bottomext, ihq
    lea r3, [bhq-1]
    cmovs reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; top_ext = iclip(-y, 0, bh - 1)
    neg topextq
    cmovs topextq, reg_zero
    cmp reg_bottomext, bhq
    cmovns reg_bottomext, r3
    cmp topextq, bhq
    cmovg topextq, r3
%if ARCH_X86_32
    mov r4m, reg_bottomext
    ;
    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    mov r0, r0m ; restore bw
%endif
    lea reg_rightext, [xq+bwq]
    sub reg_rightext, iwq
    lea r2, [bwq-1]
    cmovs reg_rightext, reg_zero

    DEFINE_ARGS bw, bh, iw, ih, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; left_ext = iclip(-x, 0, bw - 1)
    neg leftextq
    cmovs leftextq, reg_zero
    cmp reg_rightext, bwq
    cmovns reg_rightext, r2
%if ARCH_X86_32
    mov r3m, r1
%endif
    cmp leftextq, bwq
    cmovns leftextq, r2

%undef reg_zero
%undef reg_tmp
%undef reg_src
%undef reg_bottomext
%undef reg_rightext

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; center_h = bh - top_ext - bottom_ext
%if ARCH_X86_64
    lea r3, [bottomextq+topextq]
    sub centerhq, r3
%else
    mov r1, centerhm ; restore r1
    sub centerhq, topextq
    sub centerhq, r4m
    mov r1m, centerhq
%endif
    ;
    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov r2, topextq
%if ARCH_X86_64
    imul r2, dstrideq
%else
    mov r6, r6m ; restore dstq
    imul r2, dstridem
%endif
    add dstq, r2
    mov reg_blkm, dstq ; save pointer for ext
    ;
    ; center_w = bw - left_ext - right_ext
    mov centerwq, bwq
%if ARCH_X86_64
    lea r3, [rightextq+leftextq]
    sub centerwq, r3
%else
    sub centerwq, r3m
    sub centerwq, leftextq
%endif

; v_loop macro
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
%if ARCH_X86_64
 %define reg_tmp r12
%else
 %define reg_tmp r0
%endif
.v_loop_%3:
%if ARCH_X86_32
    mov r0, r0m
    mov r1, r1m
%endif
%if %1
    ; left extension
 %if ARCH_X86_64
    movd m0, [srcq]
 %else
    mov r3, srcm
    movd m0, [r3]
 %endif
    pshuflw m0, m0, q0000
    punpcklqdq m0, m0
    xor r3, r3
.left_loop_%3:
    mova [dstq+r3*2], m0
    add r3, mmsize/2
    cmp r3, leftextq
    jl .left_loop_%3
    ; body
    lea reg_tmp, [dstq+leftextq*2]
%endif
    xor r3, r3
.body_loop_%3:
%if ARCH_X86_64
    movu m0, [srcq+r3*2]
%else
    mov r1, srcm
    movu m0, [r1+r3*2]
%endif
%if %1
    movu [reg_tmp+r3*2], m0
%else
    movu [dstq+r3*2], m0
%endif
    add r3, mmsize/2
    cmp r3, centerwq
    jl .body_loop_%3
%if %2
    ; right extension
 %if %1
    lea reg_tmp, [reg_tmp+centerwq*2]
 %else
    lea reg_tmp, [dstq+centerwq*2]
 %endif
 %if ARCH_X86_64
    movd m0, [srcq+centerwq*2-2]
 %else
    mov r3, srcm
    movd m0, [r3+centerwq*2-2]
 %endif
    pshuflw m0, m0, q0000
    punpcklqdq m0, m0
    xor r3, r3
.right_loop_%3:
    movu [reg_tmp+r3*2], m0
    add r3, mmsize/2
 %if ARCH_X86_64
    cmp r3, rightextq
 %else
    cmp r3, r3m
 %endif
    jl .right_loop_%3
%endif
%if ARCH_X86_64
    add dstq, dstrideq
    add srcq, sstrideq
    dec centerhq
    jg .v_loop_%3
%else
    add dstq, dstridem
    mov r0, sstridem
    add srcm, r0
    sub dword centerhm, 1
    jg .v_loop_%3
    mov r0, r0m ; restore r0
%endif
%endmacro ; v_loop macro

    test leftextq, leftextq
    jnz .need_left_ext
%if ARCH_X86_64
    test rightextq, rightextq
    jnz .need_right_ext
%else
    cmp leftextq, r3m ; leftextq == 0
    jne .need_right_ext
%endif
    v_loop 0, 0, 0
    jmp .body_done

    ; left/right extensions
.need_left_ext:
%if ARCH_X86_64
    test rightextq, rightextq
%else
    mov r3, r3m
    test r3, r3
%endif
    jnz .need_left_right_ext
    v_loop 1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop 1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop 0, 1, 3

.body_done:
; r0 ; bw
; r1 ;; x loop
; r4 ;; y loop
; r5 ; topextq
; r6 ; dstq
; r7 ; dstrideq
; r8 ; srcq
%if ARCH_X86_64
 %define reg_dstride dstrideq
%else
 %define reg_dstride r2
%endif
    ;
    ; bottom edge extension
%if ARCH_X86_64
    test bottomextq, bottomextq
    jz .top
%else
    xor r1, r1
    cmp r1, r4m
    je .top
%endif
    ;
%if ARCH_X86_64
    mov srcq, dstq
    sub srcq, dstrideq
    xor r1, r1
%else
    mov r3, dstq
    mov reg_dstride, dstridem
    sub r3, reg_dstride
    mov srcm, r3
%endif
    ;
.bottom_x_loop:
%if ARCH_X86_64
    mova m0, [srcq+r1*2]
    lea r3, [dstq+r1*2]
    mov r4, bottomextq
%else
    mov r3, srcm
    mova m0, [r3+r1*2]
    lea r3, [dstq+r1*2]
    mov r4, r4m
%endif
    ;
.bottom_y_loop:
    mova [r3], m0
    add r3, reg_dstride
    dec r4
    jg .bottom_y_loop
    add r1, mmsize/2
    cmp r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    test topextq, topextq
    jz .end
%if ARCH_X86_64
    mov srcq, reg_blkm
%else
    mov r3, reg_blkm
    mov reg_dstride, dstridem
%endif
    mov dstq, dstm
    xor r1, r1
    ;
.top_x_loop:
%if ARCH_X86_64
    mova m0, [srcq+r1*2]
%else
    mov r3, reg_blkm
    mova m0, [r3+r1*2]
%endif
    lea r3, [dstq+r1*2]
    mov r4, topextq
    ;
.top_y_loop:
    mova [r3], m0
    add r3, reg_dstride
    dec r4
    jg .top_y_loop
    add r1, mmsize/2
    cmp r1, bwq
    jl .top_x_loop

.end:
    RET

%undef reg_dstride
%undef reg_blkm
%undef reg_tmp
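; emu_edge reference sketch (hedged C, restating the iclip comments above;
; iclip() and the loop structure are illustrative, not dav1d's exact code):
;
;   src += iclip(y, 0, ih - 1)*src_stride + iclip(x, 0, iw - 1);
;   blk  = dst + top_ext*dst_stride;
;   // center rows: copy center_w pixels per row, replicating the edge
;   // pixel into the left_ext / right_ext columns;
;   // then replicate the first center row upwards top_ext times and the
;   // last center row downwards bottom_ext times.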
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
 %define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro
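; resize reference sketch (a rough, hedged C model of the loop below; the
; mx0/src_w pre-adjustments and the resize_shuf edge emulation are folded
; in, and clip_max stands in for the precomputed src_w-based bound):
;
;   for (int x = 0; x < dst_w; x++, mx += dx) {
;       const int8_t *F = resize_filter[(mx >> 8) & 63]; // pd_63 mask
;       int sx = iclip(mx >> 14, 0, clip_max);           // clipped src_x
;       int sum = 0;
;       for (int k = 0; k < 8; k++)
;           sum += F[k] * (int)src[sx + k];              // edge-replicated
;       dst[x] = iclip((64 - sum) >> 7, 0, pixel_max);   // pd_64, psrad 7
;   }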
%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
                                       dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
                                     dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
                                     dst_w, h, src_w, dx, mx0, pxmax
%endif
    movifnidn dstq, dstmp
    movifnidn srcq, srcmp
%if STACK_ALIGNMENT >= 16
    movifnidn dst_wd, dst_wm
%endif
%if ARCH_X86_64
    movifnidn hd, hm
%endif
    sub dword mx0m, 4<<14
    sub dword src_wm, 8
    movd m4, pxmaxm
    movd m7, dxm
    movd m6, mx0m
    movd m5, src_wm
    punpcklwd m4, m4
    pshufd m4, m4, q0000
    pshufd m7, m7, q0000
    pshufd m6, m6, q0000
    pshufd m5, m5, q0000
    mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA r7, $$
 %define base r7-$$
%else
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
 %define hd dword r5m
 %if STACK_ALIGNMENT >= 16
    LEA r6, $$
  %define base r6-$$
 %else
    LEA r4, $$
  %define base r4-$$
 %endif
%endif
%if ARCH_X86_64
    mova m12, [base+pd_64]
    mova m11, [base+pd_63]
%else
 %define m12 [base+pd_64]
 %define m11 [base+pd_63]
%endif
    pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
    pslld m7, 2 ; dx*4
    pslld m5, 14
    paddd m6, m4 ; mx+[0..3]*dx
    SCRATCH 7, 15, 0
    SCRATCH 6, 14, 1
    SCRATCH 5, 13, 2
    pxor m1, m1
.loop_y:
    xor xd, xd
    mova m0, m14 ; per-line working version of mx
.loop_x:
    pcmpgtd m1, m0
    pandn m1, m0
    psrad m2, m0, 8 ; filter offset (unmasked)
    pcmpgtd m3, m13, m1
    pand m1, m3
    pandn m3, m13
    por m1, m3
    psubd m3, m0, m1 ; pshufb offset
    psrad m1, 14 ; clipped src_x offset
    psrad m3, 14 ; pshufb edge_emu offset
    pand m2, m11 ; filter offset (masked)
    ; load source pixels
%if ARCH_X86_64
    movd r8d, m1
    pshuflw m1, m1, q3232
    movd r9d, m1
    punpckhqdq m1, m1
    movd r10d, m1
    psrlq m1, 32
    movd r11d, m1
    movu m4, [srcq+r8*2]
    movu m5, [srcq+r9*2]
    movu m6, [srcq+r10*2]
    movu m7, [srcq+r11*2]
    ; if no emulation is required, we don't need to shuffle or emulate edges
    packssdw m3, m3
    movq r11, m3
    test r11, r11
    jz .filter
    movsx r8, r11w
    sar r11, 16
    movsx r9, r11w
    sar r11, 16
    movsx r10, r11w
    sar r11, 16
    movu m1, [base+resize_shuf+8+r8*2]
    movu m3, [base+resize_shuf+8+r9*2]
    movu m8, [base+resize_shuf+8+r10*2]
    movu m9, [base+resize_shuf+8+r11*2]
    pshufb m4, m1
    pshufb m5, m3
    pshufb m6, m8
    pshufb m7, m9
.filter:
    movd r8d, m2
    pshuflw m2, m2, q3232
    movd r9d, m2
    punpckhqdq m2, m2
    movd r10d, m2
    psrlq m2, 32
    movd r11d, m2
    movq m8, [base+resize_filter+r8*8]
    movq m2, [base+resize_filter+r9*8]
    pxor m9, m9
    punpcklbw m1, m9, m8
    punpcklbw m3, m9, m2
    psraw m1, 8
    psraw m3, 8
    movq m10, [base+resize_filter+r10*8]
    movq m2, [base+resize_filter+r11*8]
    punpcklbw m8, m9, m10
    punpcklbw m9, m2
    psraw m8, 8
    psraw m9, 8
    pmaddwd m4, m1
    pmaddwd m5, m3
    pmaddwd m6, m8
    pmaddwd m7, m9
    phaddd m4, m5
%else
    movd r3, m1
    pshuflw m1, m1, q3232
    movd r1, m1
    punpckhqdq m1, m1
    movu m4, [srcq+r3*2]
    movu m5, [srcq+r1*2]
    movd r3, m1
    psrlq m1, 32
    movd r1, m1
    movu m6, [srcq+r3*2]
    movu m7, [srcq+r1*2]
    ; if no emulation is required, we don't need to shuffle or emulate edges
    pxor m1, m1
    pcmpeqb m1, m3
    pmovmskb r3d, m1
    cmp r3d, 0xffff
    je .filter
    movd r3, m3
    movu m1, [base+resize_shuf+8+r3*2]
    pshuflw m3, m3, q3232
    movd r1, m3
    pshufb m4, m1
    movu m1, [base+resize_shuf+8+r1*2]
    punpckhqdq m3, m3
    movd r3, m3
    pshufb m5, m1
    movu m1, [base+resize_shuf+8+r3*2]
    psrlq m3, 32
    movd r1, m3
    pshufb m6, m1
    movu m1, [base+resize_shuf+8+r1*2]
    pshufb m7, m1
.filter:
    mova [esp+4*16], m6
    mova [esp+5*16], m7
    movd r3, m2
    pshuflw m2, m2, q3232
    movd r1, m2
    movq m6, [base+resize_filter+r3*8]
    movq m7, [base+resize_filter+r1*8]
    pxor m3, m3
    punpcklbw m1, m3, m6
    punpcklbw m3, m7
    psraw m1, 8
    psraw m3, 8
    pmaddwd m4, m1
    pmaddwd m5, m3
    punpckhqdq m2, m2
    movd r3, m2
    psrlq m2, 32
    movd r1, m2
    phaddd m4, m5
    movq m2, [base+resize_filter+r3*8]
    movq m5, [base+resize_filter+r1*8]
    mova m6, [esp+4*16]
    mova m7, [esp+5*16]
    pxor m3, m3
    punpcklbw m1, m3, m2
    punpcklbw m3, m5
    psraw m1, 8
    psraw m3, 8
    pmaddwd m6, m1
    pmaddwd m7, m3
%endif
    phaddd m6, m7
    phaddd m4, m6
    pxor m1, m1
    psubd m2, m12, m4
    psrad m2, 7
    packssdw m2, m2
    pmaxsw m2, m1
    pminsw m2, [rsp+16*3*ARCH_X86_32]
    movq [dstq+xq*2], m2
    paddd m0, m15
    add xd, 4
%if STACK_ALIGNMENT >= 16
    cmp xd, dst_wd
%else
    cmp xd, dst_wm
%endif
    jl .loop_x
    add dstq, dst_stridemp
    add srcq, src_stridemp
    dec hd
    jg .loop_y
    RET