; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright © 2017-2021, The rav1e contributors
; Copyright © 2020, Nathan Egge
; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA
%macro COEF 1-2
pd_%1: times 4 dd %1
%if %0 == 2
pd_m%1: times 4 dd -%1
%endif
%endmacro

COEF 201
COEF 401
COEF 601, 1
COEF 799
COEF 995
COEF 1189, 1
COEF 1380, 1
COEF 1567
COEF 1751
COEF 1931
COEF 2106, 1
COEF 2276, 1
COEF 2440
COEF 2598, 1
COEF 2751, 1
COEF 2896
COEF 3035
COEF 3166
COEF 3290
COEF 3406
COEF 3513
COEF 3612
COEF 3703
COEF 3784
COEF 3857
COEF 3920
COEF 3973
COEF 4017
COEF 4052
COEF 4076
COEF 4091

deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15

%if ARCH_X86_32
pd_1: times 4 dd 1
%endif
pd_2: times 4 dd 2
pw_5: times 8 dw 5
pd_1321: times 4 dd 1321
pd_2482: times 4 dd 2482
pd_m3344: times 4 dd -3344
pd_2048: times 4 dd 2048
pw_4x2048_4xm2048: times 4 dw 2048
    times 4 dw -2048
pw_4xm2048_4x2048: times 4 dw -2048
    times 4 dw 2048
pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pd_3803: times 4 dd 3803
pw_4096: times 8 dw 4096
pd_5793: times 4 dd 5793
pd_6144: times 4 dd 6144
pw_8192: times 8 dw 8192
pd_10240: times 4 dd 10240
pd_11586: times 4 dd 11586
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_1697x16: times 8 dw 1697*16
pw_16384: times 8 dw 16384
pixel_10bpc_max: times 8 dw 0x03ff

pw_1567_3784: times 4 dw 1567, 3784
pw_m3784_1567: times 4 dw -3784, 1567
pw_2896_2896: times 4 dw 2896, 2896
pw_m2896_2896: times 4 dw -2896, 2896

clip_18b_min: times 4 dd -0x20000
clip_18b_max: times 4 dd 0x1ffff

idct64_mul_16bpc:
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406

cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
cextern iadst_4x4_internal_8bpc_ssse3.main
cextern idct_4x8_internal_8bpc_ssse3.main
cextern iadst_4x8_internal_8bpc_ssse3.main
cextern idct_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x4_internal_8bpc_ssse3.main
cextern iadst_8x4_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
cextern iadst_8x8_internal_8bpc_ssse3.main
cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x32_internal_8bpc_ssse3.main
cextern idct_8x32_internal_8bpc_ssse3.main_fast
cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
cextern idct_16x64_internal_8bpc_ssse3.main
cextern idct_16x64_internal_8bpc_ssse3.main_fast

tbl_4x16_2d: db 0, 13, 29, 45
tbl_4x16_h: db 0, 16, 32, 48
tbl_4x16_v: db 0, 4, 8, 12

tbl_8x16_2d: db 0, 14, 30, 46
tbl_8x16_v: db 0, 4, 8, 12
tbl_8x16_h: db 0, 32, 64, 96

tbl_16x16_2d: db 0, 10, 36, 78
tbl_16x16_v: db 0, 4, 8, 12
tbl_16x16_h: db 0, 64, 128, 192

tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203

tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343

tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406

tbl_Nx32_odd_offset: db 2*16, 2*23
    db 2*20, 2*19
    db 2*18, 2*21
    db 2*22, 2*17
    db 2*30, 2*25
    db 2*26, 2*29
    db 2*28, 2*27
    db 2*24, 2*31

tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
    db 2* 8, 2*40, 2*23, 2*38
    db 2* 1, 2*36, 2*20, 2*42
    db 2* 9, 2*44, 2*19, 2*34
    db 2* 2, 2*60, 2*18, 2*50
    db 2*10, 2*52, 2*21, 2*58
    db 2* 3, 2*56, 2*22, 2*54
    db 2*11, 2*48, 2*17, 2*62

SECTION .text

%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
%define m(x) m_suffix(x, SUFFIX)

; This refers to the first function in itx_sse i.e. the start of the text section
; which is needed as a base pointer for constants.
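; Note (descriptive addition): on x86-32 builds, constants are addressed through
; the o() macro defined below, which rebases the address on r6 (loaded with $$
; via LEA in INV_TXFM_FN) so that accesses stay position-independent.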
%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)

%if ARCH_X86_64
%define o(x) x
%else
%define o(x) r6-$$+x ; PIC
%endif

%macro IWHT4_1D 0
    ; m0 = in0, m1 = in1, m2 = in2, m3 = in3
    paddd m0, m1 ; in0 += in1
    psubd m4, m2, m3 ; tmp0 = in2 - in3
    psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
    psrad m5, 1
    psubd m2, m5, m1 ; in2 = tmp1 - in1
    psubd m5, m3 ; in1 = tmp1 - in3
    psubd m0, m5 ; in0 -= in1
    paddd m4, m2 ; in3 = tmp0 + in2
    ; m0 = out0, m1 = in1, m2 = out2, m3 = in3
    ; m4 = out3, m5 = out1
%endmacro

INIT_XMM sse2
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
    mova m0, [cq+16*0]
    mova m1, [cq+16*1]
    mova m2, [cq+16*2]
    mova m3, [cq+16*3]
    REPX {psrad x, 2}, m0, m1, m2, m3
    IWHT4_1D
    punpckldq m1, m0, m5
    punpckhdq m3, m0, m5
    punpckldq m5, m2, m4
    punpckhdq m2, m4
    punpcklqdq m0, m1, m5
    punpckhqdq m1, m5
    punpcklqdq m4, m3, m2
    punpckhqdq m3, m2
    mova m2, m4
    IWHT4_1D
    packssdw m0, m4 ; low: out3, high: out0
    packssdw m2, m5 ; low: out2, high: out1
    pxor m4, m4
    mova [cq+16*0], m4
    mova [cq+16*1], m4
    mova [cq+16*2], m4
    mova [cq+16*3], m4
    lea r2, [dstq+strideq*2]
    movq m1, [dstq+strideq*0]
    movhps m1, [r2 +strideq*1]
    movq m3, [r2 +strideq*0]
    movhps m3, [dstq+strideq*1]
    movd m5, bdmaxm
    pshuflw m5, m5, q0000 ; broadcast
    punpcklqdq m5, m5 ; broadcast
    paddsw m0, m1
    paddsw m2, m3
    pmaxsw m0, m4
    pmaxsw m2, m4
    pminsw m0, m5
    pminsw m2, m5
    movhps [r2 +strideq*1], m0 ; write out0
    movhps [dstq+strideq*1], m2 ; write out1
    movq [r2 +strideq*0], m2 ; write out2
    movq [dstq+strideq*0], m0 ; write out3
    RET

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 2 = inv_dst1, 4 = inv_dst2
; skip round/shift if rnd is not a number
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
; %1 dst/src[1]
; %2 dst/src[2]
; %3 tmp[1]
; %4 tmp[2]
; %5 tmp[3]
; %6 rnd
; %7 coef[1]
; %8 coef[2]
; %9 flags
%ifnidn %7,%8 ; optimize when coef1 == coef2
%if %8 < 32
    pmulld m%4, m%1, m%8
    pmulld m%3, m%2, m%8
%else
    mova m%3, [o(pd_%8)]
    pmulld m%4, m%1, m%3
    pmulld m%3, m%2
%endif
%endif
%if %7 < 32
    pmulld m%1, m%7
    pmulld m%2, m%7
%else
    mova m%5, [o(pd_%7)]
    pmulld m%1, m%5
    pmulld m%2, m%5
%endif
%if %9 & 4 ; invert dst2
    paddd m%4, m%2
    psubd m%2, m%6, m%4
%else
%ifnum %6
%ifnidn %7,%8
    paddd m%4, m%6
%else
    paddd m%1, m%6
%endif
%endif
%ifnidn %7,%8
    paddd m%2, m%4
%else
    mova m%3, m%2
    paddd m%2, m%1
%endif
%endif
%if %9 & 2 ; invert dst1
    psubd m%3, m%1
    paddd m%1, m%3, m%6
%else
%ifnum %6
%ifnidn %7,%8
    paddd m%1, m%6
%endif
%endif
    psubd m%1, m%3
%endif
%ifnum %6
    psrad m%2, 12
    psrad m%1, 12
%endif
%endmacro

%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
    %define %%p1 m(i%1_%4_internal_16bpc)
%if ARCH_X86_32
    LEA r6, $$
%endif
%if has_epilogue
%ifidn %1_%2, dct_dct
    test eobd, eobd
    jz %%end
%endif
    lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
%ifnum %3
%if %3
    add eobd, %3
%endif
%else
    lea r5, [o(%3)]
%endif
    call %%p1
    RET
%%end:
%else
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
%ifnum %3
%if %3
    add eobd, %3
%endif
%else
    lea r5, [o(%3)]
%endif
%ifidn %1_%2, dct_dct
    test eobd, eobd
    jnz %%p1
%else
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endif
%endmacro

%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 4
.dconly:
    add r5d, 128
    sar r5d, 8
.dconly2:
    imul r5d, 2896
    mova m2, [o(pixel_10bpc_max)]
    add r5d, 34816
    movd m0, r5d
    pshuflw m0, m0, q1111
    pxor m3, m3
    punpcklqdq m0, m0
.dconly_loop:
    movq m1, [dstq+strideq*0]
    movhps m1, [dstq+strideq*1]
    paddw m1, m0
    pminsw m1, m2
    pmaxsw m1, m3
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub r3d, 2
    jg .dconly_loop
    RET
%endif
%endmacro

%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
    ; butterfly rotation
    ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
    ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
    ; Hadamard rotation
    psubd m%5, m%1, m%2
    paddd m%2, m%1
    paddd m%1, m%3, m%4
    psubd m%3, m%4
    ; %1 (src1) = out0
    ; %2 (src2) = out1
    ; %3 (src3) = out3
    ; %5 (tmp1) = out2
%endmacro

INIT_XMM sse4

INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst

cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova m0, [cq+16*0]
    mova m1, [cq+16*1]
    mova m2, [cq+16*2]
    mova m3, [cq+16*3]
    mova m5, [o(pd_2048)]
    call .pass1_main
    packssdw m0, m1 ; out0 out1
    packssdw m4, m2 ; out2 out3
    ; transpose
    punpckhwd m2, m0, m4
    punpcklwd m0, m4
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass1_main:
    IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
    ret
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
    ; m5 = pd_2048
    punpckhwd m2, m1, m0
    punpcklwd m1, m0
    pmaddwd m4, m2, [o(pw_m3784_1567)]
    pmaddwd m2, [o(pw_1567_3784)]
    pmaddwd m0, m1, [o(pw_m2896_2896)]
    pmaddwd m1, [o(pw_2896_2896)]
    REPX {paddd x, m5}, m4, m2, m0, m1
    packssdw m5, m5 ; pw_2048
    REPX {psrad x, 12}, m4, m2, m0, m1
    packssdw m2, m4 ; t3 t2
    packssdw m1, m0 ; t0 t1
    paddsw m0, m1, m2 ; out0 out1
    psubsw m1, m2 ; out3 out2
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    movq m2, [dstq+strideq*0]
    movhps m2, [dstq+strideq*1]
    lea r5, [dstq+strideq*2]
    movq m3, [r5 +strideq*1]
    movhps m3, [r5 +strideq*0]
    mova m5, [o(pixel_10bpc_max)]
    pxor m4, m4
    mova [cq+16*0], m4
    mova [cq+16*1], m4
    mova [cq+16*2], m4
    mova [cq+16*3], m4
    paddw m0, m2
    paddw m1, m3
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m5
    pminsw m1, m5
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movhps [r5 +strideq*0], m1
    movq [r5 +strideq*1], m1
    RET

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    call .main
    packssdw m0, m2 ; out0 out1
    packssdw m1, m4 ; out2 out3
    ; transpose
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
.end:
    mova m4, [o(pw_2048)]
    movq m2, [dstq+strideq*0]
    movhps m2, [dstq+strideq*1]
    lea r5, [dstq+strideq*2]
    movq m3, [r5 +strideq*0]
    movhps m3, [r5 +strideq*1]
    mova m5, [o(pixel_10bpc_max)]
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    pxor m4, m4
    mova [cq+16*0], m4
    mova [cq+16*1], m4
    mova [cq+16*2], m4
    mova [cq+16*3], m4
    paddw m0, m2
    paddw m1, m3
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m5
    pminsw m1, m5
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [r5 +strideq*0], m1
    movhps [r5 +strideq*1], m1
    RET
ALIGN function_align
.main:
    mova m1, [cq+16*2]
    mova m3, [cq+16*3]
    mova m5, [cq+16*0]
    lea r3, [cq+16*1]
.main2:
    mova m0, [o(pd_1321)] ; SINPI_1_9
    mova m2, [o(pd_2482)] ; SINPI_2_9
    mova m6, [o(pd_3803)] ; SINPI_4_9
    pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
    pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
    pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2]
    pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0]
    psubd m1, m3 ; T[2] - T[3]
    pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3]
    pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0]
    paddd m0, m6 ; s[0] += s[3]
    paddd m0, m3 ; s[0] += s[5]
    mova m3, [o(pd_m3344)] ; -SINPI_3_9
    psubd m2, m4 ; s[1] -= s[4]
    psubd m2, m7 ; s[1] -= s[6]
    psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0]
    pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7
    pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
    mova m5, [o(pd_2048)]
    REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
    paddd m4, m0, m2 ; x[3] = s[0] + s[1]
    psubd m2, m3 ; x[1] = s[1] + s[3]
    psubd m0, m3 ; x[0] = s[0] + s[3]
    paddd m4, m3 ; x[3] -= s[3]
    paddd m2, m5 ; x[1] + 2048
    REPX {psrad x, 12}, m0, m2, m1, m4
    ret


INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    call m(iadst_4x4_internal_16bpc).main
    packssdw m0, m2 ; out0 out1
    packssdw m1, m4 ; out2 out3
    ; transpose
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
    mova m4, [o(pw_2048)]
    movq m3, [dstq+strideq*1]
    movhps m3, [dstq+strideq*0]
    lea r5, [dstq+strideq*2]
    movq m2, [r5 +strideq*1]
    movhps m2, [r5 +strideq*0]
    mova m5, [o(pixel_10bpc_max)]
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    pxor m4, m4
    mova [cq+16*0], m4
    mova [cq+16*1], m4
    mova [cq+16*2], m4
    mova [cq+16*3], m4
    paddw m0, m2
    paddw m1, m3
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m5
    pminsw m1, m5
    movhps [dstq+strideq*0], m1
    movq [dstq+strideq*1], m1
    movhps [r5 +strideq*0], m0
    movq [r5 +strideq*1], m0
    RET

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova m3, [o(pd_5793)]
    pmulld m0, m3, [cq+16*0]
    pmulld m1, m3, [cq+16*1]
    pmulld m2, m3, [cq+16*2]
    pmulld m3, [cq+16*3]
    mova m5, [o(pd_2048)]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    packssdw m0, m1
    packssdw m2, m3
    ; transpose
    punpckhwd m3, m0, m2
    punpcklwd m0, m2
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
    ; m5 = pd_2048
    mova m4, [o(pw_1697x8)]
    movq m2, [dstq+strideq*0]
    movhps m2, [dstq+strideq*1]
    lea r5, [dstq+strideq*2]
    pmulhrsw m3, m4, m0
    pmulhrsw m4, m1
    paddsw m0, m3
    paddsw m1, m4
    movq m3, [r5 +strideq*0]
    movhps m3, [r5 +strideq*1]
    mova m4, [o(pixel_10bpc_max)]
    packssdw m5, m5 ; pw_2048
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    pxor m5, m5
    mova [cq+16*0], m5
    mova [cq+16*1], m5
    mova [cq+16*2], m5
    mova [cq+16*3], m5
    paddw m0, m2
    paddw m1, m3
    pmaxsw m0, m5
    pmaxsw m1, m5
    pminsw m0, m4
    pminsw m1, m4
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [r5 +strideq*0], m1
    movhps [r5 +strideq*1], m1
    RET

%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 8
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro

INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity, 9
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
    mova m5, [o(pd_2048)]
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 13
    setge r5b
%else
    mov r5d, 1
    cmp eobd, 13
    sbb r5d, 0
%endif
    shl r5d, 4
.loop_pass1:
    mova m3, [o(pd_2896)]
    pmulld m0, m3, [cq+32*0+r5]
    pmulld m1, m3, [cq+32*1+r5]
    pmulld m2, m3, [cq+32*2+r5]
    pmulld m3, [cq+32*3+r5]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    call m(idct_4x4_internal_16bpc).pass1_main
    packssdw m0, m1 ; out0 out1
    packssdw m4, m2 ; out2 out3
    test r5d, r5d
    jz .end_pass1
    mova [cq+32*0+16], m0
    mova [cq+32*1+16], m4
    xor r5d, r5d
    jmp .loop_pass1
.end_pass1:
    punpckhwd m2, m0, m4
    punpcklwd m0, m4
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    mova m2, [cq+32*0+16]
    mova m6, [cq+32*1+16]
    punpckhwd m4, m2, m6
    punpcklwd m2, m6
    punpckhwd m3, m2, m4
    punpcklwd m2, m4
    ; m0-3 = packed & transposed output
    jmp tx2q
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
    ; m0-3 is now out0/1,3/2,4/5,7/6
    mova m4, [o(pw_2048)]
    shufps m1, m1, q1032
    shufps m3, m3, q1032
.end:
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    pxor m4, m4
    REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
    mova m7, [o(pixel_10bpc_max)]
    lea r2, [strideq*3]
    movq m5, [dstq+strideq*0]
    movq m6, [dstq+strideq*2]
    movhps m5, [dstq+strideq*1]
    movhps m6, [dstq+r2]
    lea r4, [dstq+strideq*4]
    paddw m0, m5
    paddw m1, m6
    movq m5, [r4+strideq*0]
    movq m6, [r4+strideq*2]
    movhps m5, [r4+strideq*1]
    movhps m6, [r4+r2]
    paddw m2, m5
    paddw m3, m6
    REPX {pminsw x, m7}, m0, m1, m2, m3
    REPX {pmaxsw x, m4}, m0, m1, m2, m3
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [dstq+strideq*2], m1
    movhps [dstq+r2 ], m1
    movq [r4 +strideq*0], m2
    movhps [r4 +strideq*1], m2
    movq [r4 +strideq*2], m3
    movhps [r4 +r2 ], m3
    RET

INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity, 9

cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    call .pass1_main
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    mova m2, [cq+32*2+16]
    mova m6, [cq+32*3+16]
    punpckhwd m4, m2, m6
    punpcklwd m2, m6
    punpckhwd m3, m2, m4
    punpcklwd m2, m4
    ; m0-3 = packed & transposed output
    jmp tx2q
.pass1_main:
%undef cmp
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 13
    setge r5b
%else
    mov r5d, 1
    cmp eobd, 13
    sbb r5d, 0
%endif
    shl r5d, 4
    lea r3, [cq+32*1+16]
.loop_pass1:
    mova m0, [o(pd_2048)]
    mova m3, [o(pd_2896)]
    pmulld m5, m3, [cq+32*0+r5]
    pmulld m2, m3, [cq+32*1+r5]
    pmulld m1, m3, [cq+32*2+r5]
    pmulld m3, [cq+32*3+r5]
    REPX {paddd x, m0}, m5, m2, m1, m3
    REPX {psrad x, 12}, m5, m2, m1, m3
    mova [r3], m2
    call m(iadst_4x4_internal_16bpc).main2
    packssdw m0, m2 ; out0 out1
    packssdw m1, m4 ; out2 out3
    test r5d, r5d
    jz .end_pass1
    mova [cq+32*2+16], m0
    mova [cq+32*3+16], m1
    xor r5d, r5d
    jmp .loop_pass1
.end_pass1:
    ret
.pass2:
    shufps m0, m0, q1032
    shufps m1, m1, q1032
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
    mova m4, [o(pw_4x2048_4xm2048)]
    jmp m(idct_4x8_internal_16bpc).end

INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity, 9

cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    call m(iadst_4x8_internal_16bpc).pass1_main
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    mova m6, [cq+32*2+16]
    mova m2, [cq+32*3+16]
    punpcklwd m4, m2, m6
    punpckhwd m2, m6
    punpckhwd m3, m2, m4
    punpcklwd m2, m4
    ; m0-3 = packed & transposed output
    jmp tx2q
.pass2:
    shufps m0, m0, q1032
    shufps m1, m1, q1032
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
    mova m4, m0
    mova m5, m1
    pshufd m0, m3, q1032
    pshufd m1, m2, q1032
    pshufd m2, m5, q1032
    pshufd m3, m4, q1032
    mova m4, [o(pw_4xm2048_4x2048)]
    jmp m(idct_4x8_internal_16bpc).end

INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity, 3

cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
    mova m5, [o(pd_2048)]
    mova m4, [o(pd_2896)]
    mova m6, [o(pd_5793)]
    ; clear m7 in case we skip the bottom square
    pxor m7, m7
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 16
    setge r5b
%else
    mov r5d, 1
    cmp eobd, 16
    sbb r5d, 0
%endif
    shl r5d, 4
.loop_pass1:
    pmulld m0, m4, [cq+32*0+r5]
    pmulld m1, m4, [cq+32*1+r5]
    pmulld m2, m4, [cq+32*2+r5]
    pmulld m3, m4, [cq+32*3+r5]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    REPX {pmulld x, m6}, m0, m1, m2, m3
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    packssdw m0, m1
    packssdw m2, m3
    test r5d, r5d
    jz .end_pass1
    mova [cq+32*0+16], m0
    mova m7, m2
    xor r5d, r5d
    jmp .loop_pass1
.end_pass1:
    punpckhwd m4, m0, m2
    punpcklwd m0, m2
    punpckhwd m1, m0, m4
    punpcklwd m0, m4
    mova m2, [cq+32*0+16]
    punpckhwd m4, m2, m7
    punpcklwd m2, m7
    punpckhwd m3, m2, m4
    punpcklwd m2, m4
    ; m0-3 = packed & transposed output
    jmp tx2q
.pass2:
    mova m4, [o(pw_4096)]
    jmp m(idct_4x8_internal_16bpc).end

%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
    INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 16
    add r5d, 384
    sar r5d, 9
    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity, v
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
    mov r5m, r6d
%endif
    mov r6d, 4
.zero_loop:
    dec r6d
    cmp eobb, byte [r5+r6]
    jl .zero_loop
    mov r5d, r6d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, r5m
%endif
    mova m5, [o(pd_2048)]
.loop_pass1:
    mova m0, [cq+64*0+r5]
    mova m1, [cq+64*1+r5]
    mova m2, [cq+64*2+r5]
    mova m3, [cq+64*3+r5]
    call m(idct_4x4_internal_16bpc).pass1_main
    pcmpeqd m3, m3
    REPX {psubd x, m3}, m0, m1, m4, m2
    REPX {psrad x, 1}, m0, m1, m4, m2
    packssdw m0, m1 ; out0 out1
    packssdw m4, m2 ; out2 out3
    punpckhwd m2, m0, m4
    punpcklwd m0, m4
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    test r5d, r5d
    jz .end_pass1
    mova [cq+64*0+r5], m0
    mova [cq+64*1+r5], m1
    sub r5d, 16
    jmp .loop_pass1
.end_pass1:
    mova m2, [cq+64*0+16]
    mova m3, [cq+64*1+16]
    mova m4, [cq+64*0+32]
    mova m5, [cq+64*1+32]
    mova m6, [cq+64*0+48]
    mova m7, [cq+64*1+48]
    ; m0-7 = packed & transposed output
    jmp tx2q
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
    ; m0-6 is out0-13 [with odd registers having inversed output]
    ; [coeffq+16*7] has out15/14
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [cq+16*7]
    REPX {shufps x, x, q1032}, m1, m3, m5, m7
    mova [cq+16*0], m4
    mova [cq+16*1], m5
    mova [cq+16*2], m6
    mova [cq+16*3], m7
.end:
    pxor m4, m4
    REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    mova m7, [o(pixel_10bpc_max)]
    mov r5d, 2
    lea r3, [strideq*3]
.loop:
    movq m5, [dstq+strideq*0]
    movq m6, [dstq+strideq*2]
    movhps m5, [dstq+strideq*1]
    movhps m6, [dstq+r3]
    lea r4, [dstq+strideq*4]
    paddw m0, m5
    paddw m1, m6
    movq m5, [r4+strideq*0]
    movq m6, [r4+strideq*2]
    movhps m5, [r4+strideq*1]
    movhps m6, [r4+r3]
    paddw m2, m5
    paddw m3, m6
    REPX {pminsw x, m7}, m0, m1, m2, m3
    REPX {pmaxsw x, m4}, m0, m1, m2, m3
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [dstq+strideq*2], m1
    movhps [dstq+r3 ], m1
    movq [r4 +strideq*0], m2
    movhps [r4 +strideq*1], m2
    movq [r4 +strideq*2], m3
    movhps [r4 +r3 ], m3
    dec r5d
    jz .end2
    lea dstq, [dstq+strideq*8]
    mova m0, [cq+0*16]
    mova m1, [cq+1*16]
    mova m2, [cq+2*16]
    mova m3, [cq+3*16]
    REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
    jmp .loop
.end2:
    RET

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity, v

cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
    mov r5m, r6d
%endif
    mov r6d, 4
.zero_loop:
    dec r6d
    cmp eobb, byte [r6+r5]
    jl .zero_loop
    mov r5d, r6d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, r5m
%endif
.loop_pass1:
    mova m5, [cq+64*0+r5]
    lea r3, [cq+64*1+r5]
    mova m1, [cq+64*2+r5]
    mova m3, [cq+64*3+r5]
    call m(iadst_4x4_internal_16bpc).main2
    pcmpeqd m3, m3
    REPX {psubd x, m3}, m0, m2, m1, m4
    REPX {psrad x, 1}, m0, m2, m1, m4
    packssdw m0, m2 ; out0 out1
    packssdw m1, m4 ; out2 out3
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    test r5d, r5d
    jz m(idct_4x16_internal_16bpc).end_pass1
    mova [cq+64*0+r5], m0
    mova [cq+64*1+r5], m1
    sub r5d, 16
    jmp .loop_pass1
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
    ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
    ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
    mova m1, [o(pw_4x2048_4xm2048)]
    REPX {pmulhrsw x, m1}, m7, m2, m0
    pshufd m6, m1, q1032 ; 4x-2048,4x2048
    pmulhrsw m1, [cq+16*7]
    REPX {pmulhrsw x, m6}, m5, m4, m3
    pmulhrsw m6, [cq+16*6]
    ; m7/5/2/4 = out4/11,5/10,6/9,7/8
    ; m0/3/6/1 = out0/15,3/12,1/14,2/13
    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
    movhps [cq+0*8], m4
    movhps [cq+1*8], m2
    movhps [cq+2*8], m5
    movhps [cq+3*8], m7
    movhps [cq+4*8], m3
    movhps [cq+5*8], m1
    movhps [cq+6*8], m6
    movhps [cq+7*8], m0
    punpcklqdq m0, m6
    punpcklqdq m1, m3
    punpcklqdq m3, m2, m4
    punpcklqdq m2, m7, m5
    jmp m(idct_4x16_internal_16bpc).end

INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity, v

cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
    mov r5m, r6d
%endif
    mov r6d, 4
.zero_loop:
    dec r6d
    cmp eobb, byte [r5+r6]
    jl .zero_loop
    mov r5d, r6d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, r5m
%endif
.loop_pass1:
    mova m5, [cq+64*0+r5]
    lea r3, [cq+64*1+r5]
    mova m1, [cq+64*2+r5]
    mova m3, [cq+64*3+r5]
    call m(iadst_4x4_internal_16bpc).main2
    pcmpeqd m3, m3
    REPX {psubd x, m3}, m0, m2, m1, m4
    REPX {psrad x, 1}, m0, m2, m1, m4
    packssdw m0, m2 ; out3 out2
    packssdw m1, m4 ; out1 out0
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    test r5d, r5d
    jz m(idct_4x16_internal_16bpc).end_pass1
    mova [cq+64*0+r5], m0
    mova [cq+64*1+r5], m1
    sub r5d, 16
    jmp .loop_pass1
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
    ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
    ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
    mova m1, [o(pw_4x2048_4xm2048)]
    REPX {pmulhrsw x, m1}, m7, m2, m0
    pshufd m6, m1, q1032 ; 4x-2048,4x2048
    pmulhrsw m1, [cq+16*7]
    REPX {pmulhrsw x, m6}, m5, m4, m3
    pmulhrsw m6, [cq+16*6]
    ; m7/5/2/4 = out11/4,10/5,9/6,8/7
    ; m0/3/6/1 = out15/0,12/3,14/1,13/2
    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
    movq [cq+0*8], m4
    movq [cq+1*8], m2
    movq [cq+2*8], m5
    movq [cq+3*8], m7
    movq [cq+4*8], m3
    movq [cq+5*8], m1
    movq [cq+6*8], m6
    movq [cq+7*8], m0
    punpckhqdq m0, m6
    punpckhqdq m1, m3
    punpckhqdq m3, m2, m4
    punpckhqdq m2, m7, m5
    jmp m(idct_4x16_internal_16bpc).end

INV_TXFM_4X16_FN identity, dct, h
INV_TXFM_4X16_FN identity, adst, h
INV_TXFM_4X16_FN identity, flipadst, h
INV_TXFM_4X16_FN identity, identity

cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
    mov r5m, r6d
%endif
    mov r6d, 4
.zero_loop:
    dec r6d
    cmp eobb, byte [r5+r6]
    jl .zero_loop
    mov r5d, r6d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, r5m
%endif
    mova m5, [o(pd_6144)]
    mova m4, [o(pd_5793)]
.loop_pass1:
    pmulld m0, m4, [cq+64*0+r5]
    pmulld m1, m4, [cq+64*1+r5]
    pmulld m2, m4, [cq+64*2+r5]
    pmulld m3, m4, [cq+64*3+r5]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 13}, m0, m1, m2, m3
    packssdw m0, m1
    packssdw m2, m3
    punpckhwd m3, m0, m2
    punpcklwd m0, m2
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    test r5d, r5d
    jz m(idct_4x16_internal_16bpc).end_pass1
    mova [cq+64*0+r5], m0
    mova [cq+64*1+r5], m1
    sub r5d, 16
    jmp .loop_pass1
.pass2:
    mova [cq+16*4], m0
    mova [cq+16*5], m1
    mova [cq+16*6], m2
    mova [cq+16*7], m7
    mova m0, [o(pw_1697x16)]
    mova m7, [o(pw_2048)]
    pmulhrsw m1, m0, m4
    pmulhrsw m2, m0, m5
    REPX {paddsw x, x}, m4, m5
    paddsw m4, m1
    paddsw m5, m2
    REPX {pmulhrsw x, m7}, m4, m5
    mova [cq+16*0], m4
    mova [cq+16*1], m5
    mova m4, [cq+16*7]
    pmulhrsw m1, m0, m6
    pmulhrsw m2, m0, m4
    REPX {paddsw x, x}, m6, m4
    paddsw m6, m1
    paddsw m4, m2
    REPX {pmulhrsw x, m7}, m6, m4
    mova [cq+16*2], m6
    mova [cq+16*3], m4
    mova m4, [cq+16*4]
    mova m1, [cq+16*5]
    mova m2, [cq+16*6]
    pmulhrsw m5, m0, m2
    pmulhrsw m6, m0, m3
    REPX {paddsw x, x}, m2, m3
    paddsw m2, m5
    paddsw m3, m6
    pmulhrsw m6, m0, m1
    pmulhrsw m0, m4
    REPX {paddsw x, x}, m1, m4
    paddsw m1, m6
    paddsw m0, m4
    REPX {pmulhrsw x, m7}, m2, m3, m1, m0
    jmp m(idct_4x16_internal_16bpc).end

%macro INV_TXFM_8X4_FN 2 ; type1, type2
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, 0, 8x4, 15
%else
    INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
    add r5d, 128
    sar r5d, 8
    imul r5d, 2896
    add r5d, 34816
    movd m0, r5d
    pshuflw m0, m0, q1111
    punpcklqdq m0, m0
    mova m6, [o(pixel_10bpc_max)]
    pxor m5, m5
    lea r2, [strideq*3]
    mova m1, [dstq+strideq*0]
    mova m2, [dstq+strideq*1]
    mova m3, [dstq+strideq*2]
    mova m4, [dstq+r2]
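    ; descriptive note: add the broadcast DC value (m0) to each row,
    ; then clamp to the [0, pixel_max] range before writing back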
    REPX {paddw x, m0}, m1, m2, m3, m4
    REPX {pmaxsw x, m5}, m1, m2, m3, m4
    REPX {pminsw x, m6}, m1, m2, m3, m4
    mova [dstq+strideq*0], m1
    mova [dstq+strideq*1], m2
    mova [dstq+strideq*2], m3
    mova [dstq+r2 ], m4
    RET
%endif
%endmacro

INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, identity
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst

cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
.pass1_entry:
%if ARCH_X86_32
    lea r3, [rsp+gprsize]
%else
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+0*16]
    mova m1, [cq+1*16]
    mova m2, [cq+2*16]
    mova m3, [cq+3*16]
    mova m4, [cq+4*16]
    mova m5, [cq+5*16]
    mova m6, [cq+6*16]
    mova m7, [cq+7*16]
    call .rect2_mul
    call r5
    call .transpose4x8packed
    ; m0-3 = packed & transposed output
    jmp tx2q
.transpose4x8packed:
    ; transpose
    punpcklwd m1, m2, m6
    punpckhwd m2, m6
    punpckhwd m6, m0, m4
    punpcklwd m0, m4

    punpckhwd m3, m0, m1
    punpcklwd m0, m1
    punpckhwd m4, m6, m2
    punpcklwd m6, m2

    punpcklwd m2, m3, m4
    punpckhwd m3, m4
    punpckhwd m1, m0, m6
    punpcklwd m0, m6
    ret
.main:
    call .main_pass1
    call .round
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    ret
.rect2_mul:
%if ARCH_X86_64
    REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
%else
    mova [r3], m7
    mova m7, [o(pd_2896)]
    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulld m7, [r3]
    mova [r3], m7
    mova m7, [o(pd_2048)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [r3]
%endif
    REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
    ret
%if ARCH_X86_64
.main_pass1_fast:
    pmulld m5, m3, [o(pd_m2276)]
    pmulld m3, [o(pd_3406)]
    pmulld m7, m1, [o(pd_4017)]
    pmulld m1, [o(pd_799)]
    pmulld m6, m2, [o(pd_3784)]
    pmulld m2, [o(pd_1567)]
    pmulld m0, m14
    pxor m4, m4
    jmp .main_pass1_fast2
.main_pass1:
    ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
    ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
    REPX {pmulld x, m14}, m0, m4
.main_pass1_fast2:
    REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
    REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
    paddd m8, m1, m5 ; t4
    psubd m1, m5 ; t5a
    paddd m9, m7, m3 ; t7
    psubd m7, m3 ; t6a
    REPX {pmaxsd x, m12}, m1, m8, m7, m9
    REPX {pminsd x, m13}, m1, m8, m7, m9
    REPX {pmulld x, m14}, m7, m1
    paddd m0, m11
    paddd m7, m11
    psubd m5, m0, m4
    paddd m0, m4
    psubd m4, m7, m1
    paddd m7, m1
    REPX {psrad x, 12 }, m5, m0, m4, m7
    psubd m3, m0, m6 ; dct4 out3
    paddd m0, m6 ; dct4 out0
    paddd m6, m5, m2 ; dct4 out1
    psubd m5, m2 ; dct4 out2
    REPX {pmaxsd x, m12}, m0, m6, m5, m3
    REPX {pminsd x, m13}, m0, m6, m5, m3
    ret
.round:
    paddd m1, m6, m7 ; out1
    psubd m6, m7 ; out6
    psubd m7, m0, m9 ; out7
    paddd m0, m9 ; out0
    paddd m2, m5, m4 ; out2
    psubd m5, m4 ; out5
    psubd m4, m3, m8 ; out4
    paddd m3, m8 ; out3
%else
.main_pass1_fast:
    pmulld m5, m3, [o(pd_m2276)]
    pmulld m3, [o(pd_3406)]
    pmulld m7, m1, [o(pd_4017)]
    pmulld m1, [o(pd_799)]
    pmulld m6, m2, [o(pd_3784)]
    pmulld m2, [o(pd_1567)]
    mova m4, [o(pd_2048)]
    mova [r3+0*16], m2
    REPX {paddd x, m4}, m5, m3, m7, m1
    REPX {psrad x, 12}, m5, m3, m7, m1
    paddd m2, m1, m5 ; t4
    psubd m1, m5 ; t5a
    pmulld m5, m0, [o(pd_2896)]
    mova m0, m4
    paddd m4, m7, m3 ; t7
    psubd m7, m3 ; t6a
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3 }, m1, m2, m7, m4
    mova m3, [o(clip_18b_max)]
    REPX {pminsd x, m3 }, m1, m2, m7, m4
    mova [r3+3*16], m2
    mova [r3+1*16], m4
    pxor m4, m4
    mova m2, [r3+0*16]
    mova m3, [o(pd_2896)]
    jmp .main_pass1_fast2
.main_pass1:
    mova [r3+0*16], m0
    mova [r3+1*16], m2
    mova [r3+2*16], m4
    mova [r3+3*16], m6
    mova m0, [o(pd_2048)]
    ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
    paddd m2, m1, m5 ; t4
    psubd m1, m5 ; t5a
    paddd m4, m7, m3 ; t7
    psubd m7, m3 ; t6a
    mova m6, [o(clip_18b_min)]
    REPX {pmaxsd x, m6 }, m1, m2, m7, m4
    mova m6, [o(clip_18b_max)]
    REPX {pminsd x, m6 }, m1, m2, m7, m4
    mova m6, [r3+3*16]
    mova [r3+3*16], m2
    mova m2, [r3+1*16]
    mova [r3+1*16], m4

    ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
    mova m3, [o(pd_2896)]
    mova m5, [r3+0*16]
    mova m4, [r3+2*16]
    REPX {pmulld x, m3 }, m5, m4
.main_pass1_fast2:
    REPX {paddd x, m0 }, m2, m6
    REPX {psrad x, 12 }, m2, m6
    REPX {pmulld x, m3 }, m7, m1
    paddd m7, m0
    paddd m0, m5

    psubd m5, m0, m4
    paddd m0, m4
    psubd m4, m7, m1
    paddd m7, m1
    REPX {psrad x, 12 }, m5, m0, m4, m7
    psubd m3, m0, m6 ; dct4 out3
    paddd m0, m6 ; dct4 out0
    paddd m6, m5, m2 ; dct4 out1
    psubd m5, m2 ; dct4 out2

    mova m1, [o(clip_18b_min)]
    REPX {pmaxsd x, m1 }, m0, m6, m5, m3
    mova m1, [o(clip_18b_max)]
    REPX {pminsd x, m1 }, m0, m6, m5, m3
    ret
.round:
    paddd m1, m6, m7 ; out1
    psubd m6, m7 ; out6
    mova [r3+0*16], m6
    mova m6, [r3+1*16]
    psubd m7, m0, m6 ; out7
    paddd m0, m6 ; out0
    paddd m2, m5, m4 ; out2
    psubd m5, m4 ; out5
    mova m6, [r3+3*16]
    psubd m4, m3, m6 ; out4
    paddd m3, m6 ; out3
    mova m6, [r3+0*16]
%endif
    ret

.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
.end:
    lea r3, [strideq*3]
    call .round2_and_write_8x4
    REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    RET
.round2_and_write_8x4:
    pxor m6, m6
    mova m5, [o(pixel_10bpc_max)]
    mova m4, [o(pw_2048)]
.round1_and_write_8x4:
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
.write_8x4:
    paddw m0, [dstq+strideq*0]
    paddw m1, [dstq+strideq*1]
    paddw m2, [dstq+strideq*2]
    paddw m3, [dstq+r3]
    REPX {pminsw x, m5}, m0, m1, m2, m3
    REPX {pmaxsw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r3 ], m3
    ret

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
    jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
    call .main_pass1
    call .round
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    ret
.main_pass1:
%if ARCH_X86_64
    ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
    ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
    ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
    ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
    psubd m8, m2, m6 ; t6
    paddd m2, m6 ; t2
    psubd m6, m0, m4 ; t4
    paddd m0, m4 ; t0
    psubd m4, m5, m1 ; t7
    paddd m5, m1 ; t3
    psubd m1, m7, m3 ; t5
    paddd m7, m3 ; t1
    REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
    REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
    ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
    psubd m9, m6, m8 ; t7
    paddd m6, m8 ; out6
    mova m8, [o(pd_2896)]
    psubd m3, m7, m5 ; t3
    paddd m7, m5 ; -out7
    psubd m5, m0, m2 ; t2
    paddd m0, m2 ; out0
    psubd m2, m1, m4 ; t6
    paddd m1, m4 ; -out1
    REPX {pmaxsd x, m12}, m5, m3, m2, m9
    REPX {pminsd x, m13}, m5, m3, m2, m9
    REPX {pmulld x, m14}, m5, m3, m2, m9
    psubd m4, m5, m3 ; (t2 - t3) * 2896
    paddd m3, m5 ; (t2 + t3) * 2896
    psubd m5, m2, m9 ; (t6 - t7) * 2896
    paddd m2, m9 ; (t6 + t7) * 2896
    ret
.round:

    ; m0=out0,m1=-out1,m6=out6,m7=-out7

    pcmpeqd m8, m8
    REPX {pxor x, m8 }, m1, m7, m3, m5
    REPX {psubd x, m8 }, m1, m7
    REPX {paddd x, m11}, m2, m3, m4, m5
    REPX {psrad x, 12 }, m2, m3, m4, m5
%else
    mova [r3+0*16], m2
    mova [r3+1*16], m3
    mova [r3+2*16], m4
    mova [r3+3*16], m5
    mova m5, [o(pd_2048)]

    ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
    ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
    mova m2, [r3+0*16]
    mova m3, [r3+1*16]
    mova m4, [r3+2*16]
    mova [r3+0*16], m0
    mova [r3+1*16], m1
    mova [r3+2*16], m6
    mova m1, [r3+3*16]
    mova [r3+3*16], m7
    ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
    ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
    mova m0, [r3+0*16]
    mova m6, [r3+2*16]
    psubd m7, m2, m6 ; t6
    paddd m2, m6 ; t2
    psubd m6, m0, m4 ; t4
    paddd m0, m4 ; t0
    mova [r3+0*16], m7
    mova m5, [r3+1*16]
    mova m7, [r3+3*16]
    psubd m4, m1, m5 ; t7
    paddd m5, m1 ; t3
    psubd m1, m7, m3 ; t5
    paddd m7, m3 ; t1
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
    mova [r3+1*16], m7
    mova m7, [o(clip_18b_max)]
    pmaxsd m3, [r3+0*16]
    REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
    pminsd m7, [r3+1*16]
    mova [r3+0*16], m0
    mova [r3+1*16], m2
    mova [r3+2*16], m5
    mova [r3+3*16], m7
    mova m0, [o(pd_2048)]
    ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
    mova m5, [r3+2*16]
    mova m7, [r3+3*16]
    psubd m2, m6, m3 ; t7
    paddd m6, m3 ; out6
    mova [r3+3*16], m6
    mova m0, [r3+0*16]
    mova m6, [r3+1*16]
    psubd m3, m7, m5 ; t3
    paddd m7, m5 ; -out7
    psubd m5, m0, m6 ; t2
    paddd m0, m6 ; out0
    psubd m6, m1, m4 ; t6
    paddd m1, m4 ; -out1
    mova m4, [o(clip_18b_min)]
    REPX {pmaxsd x, m4 }, m5, m3, m6, m2
    mova m4, [o(clip_18b_max)]
    REPX {pminsd x, m4 }, m5, m3, m6, m2
    mova m4, [o(pd_2896)]
    REPX {pmulld x, m4 }, m5, m3, m6, m2
    psubd m4, m5, m3 ; (t2 - t3) * 2896
    paddd m3, m5 ; (t2 + t3) * 2896
    psubd m5, m6, m2 ; (t6 - t7) * 2896
    paddd m2, m6 ; (t6 + t7) * 2896
    ret
.round:
    mova [r3+2*16], m0

    pcmpeqd m0, m0
    mova m6, [o(pd_2048)]
    REPX {pxor x, m0 }, m1, m7, m3, m5
    REPX {psubd x, m0 }, m1, m7
    REPX {paddd x, m6 }, m2, m3, m4, m5
    REPX {psrad x, 12 }, m2, m3, m4, m5

    mova m6, [r3+3*16]
    mova m0, [r3+2*16]
%endif
    ret

.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
    jmp m(idct_8x4_internal_16bpc).end

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
    jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
    call m(iadst_8x4_internal_16bpc).main_pass1
    call m(iadst_8x4_internal_16bpc).round
    packssdw m7, m6
    packssdw m5, m4
    packssdw m3, m2
    packssdw m1, m0
    mova m0, m7
    mova m2, m5
    mova m4, m3
    mova m6, m1
    ret
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
    lea r3, [strideq*3]
    add dstq, r3
    neg strideq
    jmp m(idct_8x4_internal_16bpc).end

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
    jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
    REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    ret
.pass2:
    mova m7, [o(pw_1697x8)]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
    pmulhrsw m7, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    jmp m(idct_8x4_internal_16bpc).end

%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
%else
    INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 2
.end:
    add r5d, 384
    sar r5d, 9
.end2:
    imul r5d, 2896
    add r5d, 34816
    movd m0, r5d
    pshuflw m0, m0, q1111
    punpcklqdq m0, m0
    mova m6, [o(pixel_10bpc_max)]
    pxor m5, m5
    lea r2, [strideq*3]
.loop:
    mova m1, [dstq+strideq*0]
    mova m2, [dstq+strideq*1]
    mova m3, [dstq+strideq*2]
    mova m4, [dstq+r2]
    REPX {paddw x, m0}, m1, m2, m3, m4
    REPX {pmaxsw x, m5}, m1, m2, m3, m4
    REPX {pminsw x, m6}, m1, m2, m3, m4
    mova [dstq+strideq*0], m1
    mova [dstq+strideq*1], m2
    mova [dstq+strideq*2], m3
    mova [dstq+r2 ], m4
    lea dstq, [dstq+strideq*4]
    dec r3d
    jg .loop
    RET
%endif
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity, 6
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    DECLARE_REG_TMP 1
    mov [rsp+4*16+1*gprsize], r1
%else
    DECLARE_REG_TMP 6
%endif
    lea t0, [o(.pass1_main)]

.pass1_full:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
%undef cmp
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 10
    setge r5b
%else
    mov r5d, 1
    cmp eobd, 10
    sbb r5d, 0
%endif
    shl r5d, 4
%if ARCH_X86_32
    lea r3, [rsp+gprsize]
%endif
.loop_pass1:
    mova m0, [cq+0*32+r5]
    mova m1, [cq+1*32+r5]
    mova m2, [cq+2*32+r5]
    mova m3, [cq+3*32+r5]
    mova m4, [cq+4*32+r5]
    mova m5, [cq+5*32+r5]
    mova m6, [cq+6*32+r5]
    mova m7, [cq+7*32+r5]
    call t0

    test r5d, r5d
    jz .end_pass1

    mova [cq+0*32+16], m0
    mova [cq+1*32+16], m1
    mova [cq+2*32+16], m2
    mova [cq+3*32+16], m3

    sub r5d, 16
    jmp .loop_pass1
.end_pass1:
    mova m4, [cq+0*32+16]
    mova m5, [cq+1*32+16]
    mova m6, [cq+2*32+16]
    mova m7, [cq+3*32+16]
%if ARCH_X86_32
    mov r1, [rsp+4*16+1*gprsize]
%endif
    jmp tx2q
.pass1_main:
    call m(idct_8x4_internal_16bpc).main_pass1
    pcmpeqd m1, m1
    REPX {psubd x, m1}, m0, m6, m5, m3
    call m(idct_8x4_internal_16bpc).round
    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
.pack_and_transpose:
    packssdw m2, m3
    packssdw m6, m7
    packssdw m0, m1
    packssdw m4, m5
    jmp m(idct_8x4_internal_16bpc).transpose4x8packed

.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    lea r3, [strideq*3]
%if ARCH_X86_64
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
%endif
    call .round3_and_write_8x8
.zero:
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%undef mzero
    RET

    ; round (rounded right-shift by 5) before writing
    ; data in m0-7
    ; on x86-64, pw_2048 is in m8
    ; .round1 is for m0-7
    ; .round2 is for m0-6 & [rsp+gprsize*2]
    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
    ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7
%if ARCH_X86_32
.round1_and_write_8x8:
    mova [rsp+gprsize*2], m7
.round2_and_write_8x8:
%endif
.round3_and_write_8x8:
    mova m7, [o(pw_2048)]
%if ARCH_X86_32
.round4_and_write_8x8:
%endif
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [rsp+gprsize*2]
%if ARCH_X86_64
    jmp .write_8x8
.round2_and_write_8x8:
    mova m7, [rsp+gprsize*2]
.round1_and_write_8x8:
    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
%endif

    ; m0-7 have to-be-written data [pre-rounded]
    ; on x86-64, m9-10 contain a zero/pixel_max
    ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
    ; r0,1,3 contain dstq/strideq/stride3q
    ; r5 is a scratch register
.write_8x8:
    lea r5, [dstq+strideq*4]
    paddw m0, [dstq+strideq*0]
    paddw m1, [dstq+strideq*1]
    paddw m2, [dstq+strideq*2]
    paddw m3, [dstq+r3]
    paddw m4, [r5 +strideq*0]
    paddw m5, [r5 +strideq*1]
    paddw m6, [r5 +strideq*2]
    paddw m7, [r5 +r3]
%if ARCH_X86_64
    REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
%else
    mova [rsp+gprsize*2], m7
    pxor m7, m7
    REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmaxsw m7, [rsp+gprsize*2]
    mova [rsp+gprsize*2], m7
    mova m7, [o(pixel_10bpc_max)]
    REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pminsw m7, [rsp+gprsize*2]
%endif
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r3 ], m3
    mova [r5 +strideq*0], m4
    mova [r5 +strideq*1], m5
    mova [r5 +strideq*2], m6
    mova [r5 +r3 ], m7
    ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity, 6

cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    mov [rsp+4*16+1*gprsize], r1
%endif
    lea t0, [o(.pass1_main)]
    jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
    call m(iadst_8x4_internal_16bpc).main_pass1
    call .round
    jmp m(idct_8x8_internal_16bpc).pack_and_transpose
.round:
%if ARCH_X86_64
    pcmpeqd m8, m8 ; -1
    REPX {psubd x, m8 }, m0, m6
    REPX {pxor x, m8 }, m1, m7, m3, m5
    REPX {psrad x, 1 }, m0, m1, m6, m7
    REPX {psubd x, m8 }, m1, m7
    mova m8, [o(pd_6144)]
    REPX {paddd x, m8 }, m2, m3, m4, m5
    REPX {psrad x, 13 }, m2, m3, m4, m5
%else
    mova [r3+2*16], m0

    pcmpeqd m0, m0 ; -1
    mova m6, [o(pd_6144)]
    REPX {pxor x, m0 }, m1, m7, m3, m5
    REPX {psrad x, 1 }, m1, m7
    REPX {psubd x, m0 }, m1, m7
    REPX {paddd x, m6 }, m2, m3, m4, m5
    REPX {psrad x, 13 }, m2, m3, m4, m5

    mova m0, [r3+2*16]
    psrld m6, 12 ; +1
    paddd m0, m6
    paddd m6, [r3+3*16]
    REPX {psrad x, 1 }, m0, m6
%endif
    ret

.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
    lea r3, [strideq*3]
%if ARCH_X86_64
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
%endif
    call .round3_and_write_8x8
    jmp m(idct_8x8_internal_16bpc).zero

    ; round (rounded right-shift by 5) before writing; odd registers are negated
    ; data in m0-7
    ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
    ; .round1 is for m0-7
    ; .round2 is for m0-6 & [rsp+gprsize*2]
    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
%if ARCH_X86_64
.round2_and_write_8x8:
    mova m7, [rsp+gprsize*2]
.round1_and_write_8x8:
    REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
    REPX {pmulhrsw x, m11}, m1, m3, m5, m7
    jmp m(idct_8x8_internal_16bpc).write_8x8
%else
.round1_and_write_8x8:
    mova [rsp+gprsize*2], m7
.round2_and_write_8x8:
%endif
.round3_and_write_8x8:
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova m7, [o(pw_m2048)]
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw m7, [rsp+gprsize*2]
    jmp m(idct_8x8_internal_16bpc).write_8x8

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity, 6

cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    mov [rsp+4*16+1*gprsize], r1
%endif
    lea t0, [o(.pass1_main)]
    jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
    call m(iadst_8x4_internal_16bpc).main_pass1
    call m(iadst_8x8_internal_16bpc).round
    ; invert registers
    packssdw m7, m6
    packssdw m5, m4
    packssdw m3, m2
    packssdw m1, m0
    mova m0, m7
    mova m2, m5
    mova m4, m3
    mova m6, m1
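    ; descriptive note: m0/m2/m4/m6 now hold the packed outputs in reversed
    ; order (the flip), ready for the shared transpose below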
    jmp m(idct_8x4_internal_16bpc).transpose4x8packed

.pass2:
    lea dstq, [dstq+strideq*8]
    sub dstq, strideq
    neg strideq
    jmp m(iadst_8x8_internal_16bpc).pass2

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova m0, [cq+0*32]
    mova m1, [cq+1*32]
    mova m2, [cq+2*32]
    mova m3, [cq+3*32]
    mova m4, [cq+4*32]
    mova m5, [cq+5*32]
    mova m6, [cq+6*32]
    mova m7, [cq+7*32]
    packssdw m0, [cq+0*32+16]
    packssdw m1, [cq+1*32+16]
    packssdw m2, [cq+2*32+16]
    packssdw m3, [cq+3*32+16]
    packssdw m4, [cq+4*32+16]
    packssdw m5, [cq+5*32+16]
    packssdw m6, [cq+6*32+16]
    packssdw m7, [cq+7*32+16]
    mova [rsp+gprsize+16*1], m6
    jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3

.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    lea r3, [strideq*3]
%if ARCH_X86_64
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
    mova m8, [o(pw_4096)]
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
%else
    mova [rsp+gprsize], m7
    mova m7, [o(pw_4096)]
    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
%endif
    jmp m(idct_8x8_internal_16bpc).zero

%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
%else
    INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
    mov r3d, 4
%if stack_size_padded > 0
    ; adjust to caller's stack allocation
    add rsp, (12+ARCH_X86_64)*16
%endif
    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
%endif
%endmacro

INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, v
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

%if ARCH_X86_64
DECLARE_REG_TMP 7
%endif

cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    PUSH r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
.pass1_full:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
%undef cmp
    mov r6d, 4
.zero_loop:
    dec r6d
    cmp eobb, byte [r5+r6]
    jl .zero_loop
    mov r5d, r6d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, [rsp+16*16+2*gprsize]
    ; setup stack pointer
    lea r3, [rsp+gprsize]
%endif
.loop_pass1:
    mova m0, [cq+0*64+r5]
    mova m1, [cq+1*64+r5]
    mova m2, [cq+2*64+r5]
    mova m3, [cq+3*64+r5]
    mova m4, [cq+4*64+r5]
    mova m5, [cq+5*64+r5]
    mova m6, [cq+6*64+r5]
    mova m7, [cq+7*64+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call t0

    mova [cq+0*64+r5], m0
    mova [cq+1*64+r5], m1
    mova [cq+2*64+r5], m2
    mova [cq+3*64+r5], m3
    sub r5d, 16
    jge .loop_pass1
%if WIN64
    POP r7
%elif ARCH_X86_32
    mov r1, [rsp+16*16+1*gprsize]
%endif
    jmp tx2q

.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif

N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 2219 ; some are still pre-loaded from the final loop iteration in pass=1 2220 2221 mova m1, m2 2222 mova m2, [cq+ 1*16] 2223 mova m3, [cq+ 9*16] 2224 mova m4, [cq+ 2*16] 2225 mova m5, [cq+10*16] 2226 mova m6, [cq+ 3*16] 2227 mova m7, [cq+11*16] 2228 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main 2229 mova [rsp+gprsize+3*16], m0 2230 mova [rsp+gprsize+4*16], m1 2231 mova [rsp+gprsize+5*16], m2 2232 mova [rsp+gprsize+6*16], m3 2233 mova [rsp+gprsize+7*16], m4 2234 mova [rsp+gprsize+8*16], m5 2235 mova [rsp+gprsize+9*16], m6 2236 ; m7 is already stored in [rsp+gprsize+0*16] 2237 mova m0, [cq+ 4*16] 2238 mova m1, [cq+12*16] 2239 mova m2, [cq+ 5*16] 2240 mova m3, [cq+13*16] 2241 mova m4, [cq+ 6*16] 2242 mova m5, [cq+14*16] 2243 mova m6, [cq+ 7*16] 2244 mova m7, [cq+15*16] 2245 call m_suffix(idct_16x8_internal_8bpc, _ssse3).main 2246 2247 ; out0-7 is in rsp+gprsize+3-10*mmsize 2248 ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize 2249 2250%if ARCH_X86_64 2251 mova m8, [o(pw_2048)] 2252 mova m10, [o(pixel_10bpc_max)] 2253 pxor m9, m9 2254 mov r6, dstq 2255%else 2256 mov [rsp+16*16+gprsize*1], dstq 2257%endif 2258 lea r3, [strideq*3] 2259 lea dstq, [dstq+strideq*8] 2260 call m(idct_8x8_internal_16bpc).round2_and_write_8x8 2261%if ARCH_X86_64 2262%define mzero m9 2263%else 2264%define mzero m7 2265 pxor m7, m7 2266%endif 2267 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 2268 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2269%undef mzero 2270 mova m0, [rsp+gprsize+ 3*16] 2271 mova m1, [rsp+gprsize+ 4*16] 2272 mova m2, [rsp+gprsize+ 5*16] 2273 mova m3, [rsp+gprsize+ 6*16] 2274 mova m4, [rsp+gprsize+ 7*16] 2275 mova m5, [rsp+gprsize+ 8*16] 2276 mova m6, [rsp+gprsize+ 9*16] 2277 mova m7, [rsp+gprsize+10*16] 2278%if ARCH_X86_64 2279 mov dstq, r6 2280%else 2281 mov dstq, [rsp+16*16+gprsize*1] 2282%endif 2283 call m(idct_8x8_internal_16bpc).round1_and_write_8x8 2284 RET 2285 2286INV_TXFM_8X16_FN adst, dct 2287INV_TXFM_8X16_FN adst, adst 2288INV_TXFM_8X16_FN adst, flipadst 2289INV_TXFM_8X16_FN adst, identity, v 2290 2291cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 2292%if WIN64 2293 PUSH r7 2294%elif ARCH_X86_32 2295 mov [rsp+16*16+gprsize*1], r1 2296 mov [rsp+16*16+gprsize*2], r6 2297%endif 2298 lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] 2299 jmp m(idct_8x16_internal_16bpc).pass1_full 2300 2301.pass2: 2302%if ARCH_X86_32 2303 lea r5, [o(itx8_start)] 2304%endif 2305 mova m4, [cq+ 9*16] 2306 mova m5, [cq+13*16] 2307 mova [rsp+gprsize+7*16], m0 2308 mova [rsp+gprsize+8*16], m1 2309 mova [rsp+gprsize+5*16], m4 2310 mova [rsp+gprsize+6*16], m5 2311 mova m0, m2 2312 mova m1, m3 2313 mova m2, [cq+ 1*16] 2314 mova m3, [cq+ 5*16] 2315 mova m4, [cq+ 2*16] 2316 mova m5, [cq+ 6*16] 2317 mova m6, [cq+11*16] 2318 mova m7, [cq+15*16] 2319 mova [rsp+gprsize+ 3*16], m4 2320 mova [rsp+gprsize+ 4*16], m5 2321 mova [rsp+gprsize+ 9*16], m6 2322 mova [rsp+gprsize+10*16], m7 2323 mova m4, [cq+10*16] 2324 mova m5, [cq+14*16] 2325 mova m6, [cq+ 3*16] 2326 mova m7, [cq+ 7*16] 2327 call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main 2328 call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end 2329 2330%if ARCH_X86_64 2331 mova m11, [o(pw_m2048)] 2332 mova m8, [o(pw_2048)] 2333 mova m10, [o(pixel_10bpc_max)] 2334 pxor m9, m9 2335 mov r6, dstq 2336%else 2337 mov [rsp+16*16+gprsize*1], dstq 2338%endif 2339 lea r3, [strideq*3] 2340 lea dstq, [dstq+strideq*8] 2341 call 
m(iadst_8x8_internal_16bpc).round2_and_write_8x8 2342%if ARCH_X86_64 2343%define mzero m9 2344%else 2345%define mzero m7 2346 pxor m7, m7 2347%endif 2348 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 2349 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2350%undef mzero 2351 mova m0, [rsp+gprsize+ 3*16] 2352 mova m1, [rsp+gprsize+ 4*16] 2353 mova m2, [rsp+gprsize+ 5*16] 2354 mova m3, [rsp+gprsize+ 6*16] 2355 mova m4, [rsp+gprsize+ 7*16] 2356 mova m5, [rsp+gprsize+ 8*16] 2357 mova m6, [rsp+gprsize+ 9*16] 2358 mova m7, [rsp+gprsize+10*16] 2359%if ARCH_X86_64 2360 mov dstq, r6 2361%else 2362 mov dstq, [rsp+16*16+gprsize*1] 2363%endif 2364 call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 2365 RET 2366 2367INV_TXFM_8X16_FN flipadst, dct 2368INV_TXFM_8X16_FN flipadst, adst 2369INV_TXFM_8X16_FN flipadst, flipadst 2370INV_TXFM_8X16_FN flipadst, identity, v 2371 2372cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 2373%if WIN64 2374 PUSH r7 2375%elif ARCH_X86_32 2376 mov [rsp+16*16+gprsize*1], r1 2377 mov [rsp+16*16+gprsize*2], r6 2378%endif 2379 lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] 2380 jmp m(idct_8x16_internal_16bpc).pass1_full 2381 2382.pass2: 2383 lea r3, [strideq*3] 2384 lea r3, [r3*5] 2385 add dstq, r3 2386 neg strideq 2387 jmp m(iadst_8x16_internal_16bpc).pass2 2388 2389INV_TXFM_8X16_FN identity, dct, h 2390INV_TXFM_8X16_FN identity, adst, h 2391INV_TXFM_8X16_FN identity, flipadst, h 2392INV_TXFM_8X16_FN identity, identity 2393 2394cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 2395%if WIN64 2396 PUSH r7 2397%elif ARCH_X86_32 2398 mov [rsp+16*16+gprsize*1], r1 2399 mov [rsp+16*16+gprsize*2], r6 2400%endif 2401 lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] 2402 jmp m(idct_8x16_internal_16bpc).pass1_full 2403 2404.pass2: 2405%if ARCH_X86_64 2406 mova m4, [o(pw_2048)] 2407 mova m5, [o(pixel_10bpc_max)] 2408 pxor m6, m6 2409 mova m7, [o(pw_1697x16)] 2410%endif 2411 mov r5d, 4 2412 lea r3, [strideq*3] 2413.pass2_loop: 2414 call .main 2415%if ARCH_X86_64 2416 call m(idct_8x4_internal_16bpc).round1_and_write_8x4 2417%else 2418 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 2419%endif 2420 REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28 2421 dec r5d 2422 jle .end 2423 add cq, 16 2424 lea dstq, [dstq+strideq*4] 2425 mova m0, [cq+ 0*16] 2426 mova m1, [cq+ 4*16] 2427 mova m2, [cq+ 8*16] 2428 mova m3, [cq+12*16] 2429 jmp .pass2_loop 2430.end: 2431 RET 2432.main: 2433 ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) 2434%if ARCH_X86_32 2435 mova m7, [o(pw_1697x16)] 2436 pmulhrsw m4, m7, m0 2437 pmulhrsw m5, m7, m1 2438 pmulhrsw m6, m7, m2 2439 pmulhrsw m7, m3 2440%else 2441 pmulhrsw m8, m7, m0 2442 pmulhrsw m9, m7, m1 2443 pmulhrsw m10, m7, m2 2444 pmulhrsw m11, m7, m3 2445%endif 2446 REPX {paddsw x, x}, m0, m1, m2, m3 2447%if ARCH_X86_64 2448 paddsw m0, m8 2449 paddsw m1, m9 2450 paddsw m2, m10 2451 paddsw m3, m11 2452%else 2453 paddsw m0, m4 2454 paddsw m1, m5 2455 paddsw m2, m6 2456 paddsw m3, m7 2457%endif 2458 ret 2459 2460%macro INV_TXFM_16X4_FN 2 ; type1, type2 2461%if ARCH_X86_64 2462 INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16 2463%else 2464 INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 2465%endif 2466%ifidn %1_%2, dct_dct 2467 imul r5d, [cq], 181 2468 mov [cq], eobd ; 0 2469 mov r3d, 4 2470.dconly: 2471 add r5d, 384 2472 sar r5d, 9 2473.dconly2: 2474 imul r5d, 2896 2475 add r5d, 34816 2476 movd m0, r5d 2477 pshuflw m0, m0, q1111 2478 
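    ; dc-only shortcut: when only the DC coefficient is present, the inverse
    ; transform reduces to adding one constant to every pixel of the block.
    ; r5d holds that constant after the scaling above; its high word is
    ; broadcast to all 8 word lanes (pshuflw above, punpcklqdq below) and the
    ; loop that follows adds it to each row with clamping. Roughly, per pixel
    ; (scalar sketch, illustrative names only):
    ;   dst[x] = clamp(dst[x] + dc_offset, 0, pixel_max)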
punpcklqdq m0, m0 2479 mova m3, [o(pixel_10bpc_max)] 2480 pxor m4, m4 2481.loop: 2482 mova m1, [dstq+ 0] 2483 mova m2, [dstq+16] 2484 REPX {paddw x, m0}, m1, m2 2485 REPX {pminsw x, m3}, m1, m2 2486 REPX {pmaxsw x, m4}, m1, m2 2487 mova [dstq+ 0], m1 2488 mova [dstq+16], m2 2489 add dstq, strideq 2490 dec r3d 2491 jg .loop 2492 RET 2493%endif 2494%endmacro 2495 2496INV_TXFM_16X4_FN dct, dct 2497INV_TXFM_16X4_FN dct, identity 2498INV_TXFM_16X4_FN dct, adst 2499INV_TXFM_16X4_FN dct, flipadst 2500 2501cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 2502%if ARCH_X86_64 2503 mova m11, [o(pd_2048)] 2504 mova m12, [o(clip_18b_min)] 2505 mova m13, [o(clip_18b_max)] 2506 mova m14, [o(pd_2896)] 2507%endif 2508 ; setup stack pointer 2509 lea r3, [rsp+gprsize] 2510 2511 mova m0, [cq+ 1*16] 2512 mova m1, [cq+ 3*16] 2513 mova m2, [cq+ 5*16] 2514 mova m3, [cq+ 7*16] 2515 mova m4, [cq+ 9*16] 2516 mova m5, [cq+11*16] 2517 mova m6, [cq+13*16] 2518 mova m7, [cq+15*16] 2519 call .main_oddhalf 2520 mova m0, [cq+ 0*16] 2521 mova m1, [cq+ 2*16] 2522 mova m2, [cq+ 4*16] 2523 mova m3, [cq+ 6*16] 2524 mova m4, [cq+ 8*16] 2525 mova m5, [cq+10*16] 2526 mova m6, [cq+12*16] 2527 mova m7, [cq+14*16] 2528 call m(idct_8x4_internal_16bpc).main_pass1 2529 call m(idct_8x4_internal_16bpc).round 2530 ; t0-7 is in m0-7 2531 2532 call .round 2533 2534%if ARCH_X86_64 2535.pack_transpose: 2536 ; transpose in two parts 2537 packssdw m0, m1 2538 packssdw m2, m3 2539 packssdw m4, m5 2540 packssdw m6, m7 2541 packssdw m8, m9 2542 packssdw m10, m11 2543 packssdw m12, m13 2544 packssdw m14, m15 2545.transpose: 2546 call m(idct_8x4_internal_16bpc).transpose4x8packed 2547 call .transpose4x8packed_hi 2548%else 2549 call m(idct_8x4_internal_16bpc).transpose4x8packed 2550 mova [r3+0*16], m0 2551 mova [r3+1*16], m1 2552 mova [r3+2*16], m2 2553 mova [r3+3*16], m3 2554 mova m0, [r3+ 8*16] 2555 mova m2, [r3+ 9*16] 2556 mova m4, [r3+10*16] 2557 mova m6, [r3+11*16] 2558 call m(idct_8x4_internal_16bpc).transpose4x8packed 2559%endif 2560 jmp tx2q 2561%if ARCH_X86_64 2562.transpose4x8packed_hi: 2563 punpcklwd m9, m10, m14 2564 punpckhwd m10, m14 2565 punpckhwd m14, m8, m12 2566 punpcklwd m8, m12 2567 2568 punpckhwd m11, m8, m9 2569 punpcklwd m8, m9 2570 punpckhwd m12, m14, m10 2571 punpcklwd m14, m10 2572 2573 punpcklwd m10, m11, m12 2574 punpckhwd m11, m12 2575 punpckhwd m9, m8, m14 2576 punpcklwd m8, m14 2577 ret 2578%endif 2579.main_oddhalf_fast: ; lower half zero 2580 pmulld m7, m0, [o(pd_4076)] 2581 pmulld m0, [o(pd_401)] 2582 pmulld m6, m1, [o(pd_m1189)] 2583 pmulld m1, [o(pd_3920)] 2584%if ARCH_X86_32 2585 mova m4, [o(pd_2048)] 2586 REPX {paddd x, m4}, m1, m6 2587 REPX {psrad x, 12}, m1, m6 2588 mova [r3+1*16], m1 2589%endif 2590 pmulld m5, m2, [o(pd_3612)] 2591 pmulld m2, [o(pd_1931)] 2592%if ARCH_X86_32 2593 pmulld m1, m3, [o(pd_m2598)] 2594%else 2595 pmulld m4, m3, [o(pd_m2598)] 2596%endif 2597 pmulld m3, [o(pd_3166)] 2598 jmp .main_oddhalf_fast2 2599.main_oddhalf: 2600%if ARCH_X86_64 2601 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a 2602 ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a 2603 ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a 2604 ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a 2605.main_oddhalf_fast2: 2606 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 2607 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 2608 psubd m8, m0, m4 ; t9 2609 paddd m0, m4 ; t8 2610 psubd m4, m6, m2 ; t10 2611 paddd m2, m6 ; t11 2612 psubd m6, m1, m5 ; t13 2613 paddd m5, m1 ; t12 2614 
psubd m1, m7, m3 ; t14 2615 paddd m7, m3 ; t15 2616 REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 2617 REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 2618 mova m15, [o(pd_3784)] 2619 mova m10, [o(pd_1567)] 2620 ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 2621 ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 2622 psubd m3, m1, m4 ; t10 2623 paddd m1, m4 ; t9 2624 psubd m4, m0, m2 ; t11a 2625 paddd m0, m2 ; t8a 2626 psubd m2, m8, m6 ; t13 2627 paddd m6, m8 ; t14 2628 psubd m8, m7, m5 ; t12a 2629 paddd m7, m5 ; t15a 2630 REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 2631 REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 2632 REPX {pmulld x, m14}, m2, m8, m3, m4 2633 paddd m2, m11 2634 paddd m8, m11 2635 paddd m5, m2, m3 ; t13a 2636 psubd m2, m3 ; t10a 2637 psubd m3, m8, m4 ; t11 2638 paddd m4, m8 ; t12 2639 REPX {psrad x, 12}, m5, m2, m3, m4 2640 mova [r3+0*16], m0 2641 mova [r3+1*16], m1 2642 mova [r3+2*16], m2 2643 mova [r3+3*16], m3 2644 mova [r3+4*16], m4 2645 mova [r3+5*16], m5 2646 mova [r3+6*16], m6 2647 mova [r3+7*16], m7 2648%else 2649 mova [r3+0*16], m2 2650 mova [r3+1*16], m3 2651 mova [r3+2*16], m4 2652 mova [r3+3*16], m5 2653 mova m4, [o(pd_2048)] 2654 2655 ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a 2656 ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a 2657 2658 mova m2, [r3+0*16] 2659 mova m3, [r3+1*16] 2660 mova [r3+0*16], m0 2661 mova [r3+1*16], m1 2662 mova m1, [r3+2*16] 2663 mova m5, [r3+3*16] 2664 mova [r3+2*16], m6 2665 mova [r3+3*16], m7 2666 2667 ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a 2668 ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a 2669 2670 mova m0, [r3+0*16] 2671 mova m6, [r3+2*16] 2672 mova m7, [r3+3*16] 2673.main_oddhalf_fast2: 2674 REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3 2675 REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3 2676 psubd m4, m0, m1 ; t9 2677 paddd m0, m1 ; t8 2678 mova m1, [r3+1*16] 2679 mova [r3+0*16], m4 2680 psubd m4, m6, m2 ; t10 2681 paddd m2, m6 ; t11 2682 psubd m6, m1, m5 ; t13 2683 paddd m5, m1 ; t12 2684 psubd m1, m7, m3 ; t14 2685 paddd m7, m3 ; t15 2686 mova m3, [o(clip_18b_min)] 2687 REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7 2688 pmaxsd m3, [r3+0*16] 2689 mova [r3+0*16], m3 2690 mova m3, [o(clip_18b_max)] 2691 REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7 2692 pminsd m3, [r3+0*16] 2693 mova [r3+0*16], m0 2694 mova [r3+1*16], m2 2695 mova [r3+2*16], m5 2696 mova [r3+3*16], m7 2697 mova m7, [o(pd_2048)] 2698 ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784 2699 ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4 2700 mova m0, [r3+0*16] 2701 mova m2, [r3+1*16] 2702 psubd m5, m1, m4 ; t10 2703 mova [r3+1*16], m5 2704 paddd m1, m4 ; t9 2705 psubd m4, m0, m2 ; t11a 2706 paddd m0, m2 ; t8a 2707 mova m5, [r3+2*16] 2708 mova m7, [r3+3*16] 2709 psubd m2, m3, m6 ; t13 2710 paddd m6, m3 ; t14 2711 paddd m3, m7, m5 ; t15a 2712 psubd m7, m5 ; t12a 2713 mova [r3+0*16], m3 2714 mova m3, [r3+1*16] 2715 mova m5, [o(clip_18b_min)] 2716 REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6 2717 pmaxsd m5, [r3+0*16] 2718 mova [r3+0*16], m5 2719 mova m5, [o(clip_18b_max)] 2720 REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6 2721 pminsd m5, [r3+0*16] 2722 mova [r3+0*16], m5 2723 mova m5, [o(pd_2896)] 2724 REPX {pmulld x, m5}, m2, m7, m3, m4 2725 mova m5, [o(pd_2048)] 2726 REPX {paddd x, m5}, m2, m7 2727 paddd m5, m2, m3 ; t13a 2728 psubd m2, m3 ; t10a 2729 psubd m3, m7, m4 ; t11 2730 paddd m4, m7 ; t12 2731 REPX {psrad x, 12}, m5, m2, m3, m4 2732 mova m7, [r3+0*16] 2733 mova [r3+11*16], m0 2734 mova [r3+10*16], m1 2735 
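    ; x86-32: with only 8 XMM registers, the idct16 odd-half results
    ; t8a..t15a are spilled to [r3+11*16] down to [r3+4*16] (reverse order;
    ; the x86-64 path stores them at [r3+0*16]..[r3+7*16] instead) so that
    ; .round below can reload them and fold them into the even-half idct8
    ; outputs.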
mova [r3+9*16], m2 2736 mova [r3+8*16], m3 2737 mova [r3+7*16], m4 2738 mova [r3+6*16], m5 2739 mova [r3+5*16], m6 2740 mova [r3+4*16], m7 2741%endif 2742 ret 2743.round: 2744%if ARCH_X86_64 2745 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 2746 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 2747 pcmpeqd m8, m8 2748 REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 2749 mova m8, [r3+1*16] 2750 mova m9, [r3+2*16] 2751 mova m10, [r3+3*16] 2752 mova m11, [r3+4*16] 2753 mova m12, [r3+5*16] 2754 mova m13, [r3+6*16] 2755 mova m14, [r3+7*16] 2756 psubd m15, m0, m14 ; out15 2757 paddd m0, m14 ; out0 2758 psubd m14, m1, m13 ; out14 2759 paddd m1, m13 ; out1 2760 psubd m13, m2, m12 ; out13 2761 paddd m2, m12 ; out2 2762 psubd m12, m3, m11 ; out12 2763 paddd m3, m11 ; out3 2764 psubd m11, m4, m10 ; out11 2765 paddd m4, m10 ; out4 2766 psubd m10, m5, m9 ; out10 2767 paddd m5, m9 ; out5 2768 psubd m9, m6, m8 ; out9 2769 paddd m6, m8 ; out6 2770 psubd m8, m7, [r3+0*16] ; out8 2771 paddd m7, [r3+0*16] ; out7 2772 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ 2773 m8, m9, m10, m11, m12, m13, m14, m15 2774 ; and out0-15 is now in m0-15 2775%else 2776 mova [r3+ 0*16], m0 2777 mova m0, [o(clip_18b_min)] 2778 REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 2779 pmaxsd m0, [r3+ 0*16] 2780 mova [r3+ 0*16], m7 2781 mova m7, [o(clip_18b_max)] 2782 REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 2783 pminsd m7, [r3+ 0*16] 2784 mova [r3+ 0*16], m0 2785 pcmpeqd m0, m0 2786 REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 2787 mova [r3+ 1*16], m1 2788 mova [r3+ 2*16], m2 2789 mova m1, [r3+ 0*16] 2790 psubd m1, m0 2791 mova [r3+ 0*16], m1 2792 mova m1, [r3+11*16] 2793 mova m2, [r3+10*16] 2794 psubd m0, m7, m1 2795 paddd m7, m1 2796 psubd m1, m6, m2 2797 paddd m6, m2 2798 REPX {psrad x, 1}, m0, m1, m6, m7 2799 packssdw m0, m1 ; out8-9 2800 packssdw m6, m7 ; out6-7 2801 mova [r3+11*16], m6 2802 mova m1, [r3+9*16] 2803 mova m7, [r3+8*16] 2804 psubd m2, m5, m1 2805 paddd m5, m1 2806 psubd m1, m4, m7 2807 paddd m4, m7 2808 REPX {psrad x, 1}, m2, m1, m4, m5 2809 packssdw m2, m1 ; out10-11 2810 packssdw m4, m5 ; out4-5 2811 mova m1, [r3+2*16] 2812 mova [r3+10*16], m4 2813 mova m6, [r3+7*16] 2814 mova m7, [r3+6*16] 2815 psubd m4, m3, m6 2816 paddd m3, m6 2817 psubd m6, m1, m7 2818 paddd m1, m7 2819 REPX {psrad x, 1}, m4, m6, m1, m3 2820 packssdw m4, m6 ; out12-13 2821 packssdw m1, m3 ; out2-3 2822 mova m3, [r3+1*16] 2823 mova [r3+9*16], m1 2824 mova m1, [r3+0*16] 2825 mova m5, [r3+5*16] 2826 mova m7, [r3+4*16] 2827 psubd m6, m3, m5 2828 paddd m3, m5 2829 psubd m5, m1, m7 2830 paddd m1, m7 2831 REPX {psrad x, 1}, m6, m5, m1, m3 2832 packssdw m6, m5 ; out14-15 2833 packssdw m1, m3 ; out0-1 2834 mova [r3+8*16], m1 2835%endif 2836 ret 2837 2838.pass2: 2839 lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)] 2840.pass2_loop: 2841 lea r3, [strideq*3] 2842%if ARCH_X86_32 2843 lea r5, [o(itx8_start)] 2844%endif 2845 call r4 2846 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 2847 REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 2848%if ARCH_X86_64 2849 mova m0, m8 2850 mova m1, m9 2851 mova m2, m10 2852 mova m3, m11 2853%else 2854 mova m0, [rsp+gprsize+0*16] 2855 mova m1, [rsp+gprsize+1*16] 2856 mova m2, [rsp+gprsize+2*16] 2857 mova m3, [rsp+gprsize+3*16] 2858%endif 2859 add dstq, 16 2860%if ARCH_X86_32 2861 lea r5, [o(itx8_start)] 2862%endif 2863 call r4 2864 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 2865 RET 2866 2867INV_TXFM_16X4_FN adst, dct 
2868INV_TXFM_16X4_FN adst, adst 2869INV_TXFM_16X4_FN adst, flipadst 2870INV_TXFM_16X4_FN adst, identity 2871 2872cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 2873 ; setup stack pointer 2874 lea r3, [rsp+gprsize] 2875 call .main 2876%if ARCH_X86_64 2877 jmp m(idct_16x4_internal_16bpc).pack_transpose 2878%else 2879 call m(idct_8x4_internal_16bpc).transpose4x8packed 2880 mova [rsp+gprsize+0*16], m0 2881 mova [rsp+gprsize+1*16], m1 2882 mova [rsp+gprsize+2*16], m2 2883 mova [rsp+gprsize+3*16], m3 2884 mova m0, [rsp+gprsize+ 8*16] 2885 mova m2, [rsp+gprsize+ 9*16] 2886 mova m4, [rsp+gprsize+10*16] 2887 mova m6, [rsp+gprsize+11*16] 2888 call m(idct_8x4_internal_16bpc).transpose4x8packed 2889 jmp tx2q 2890%endif 2891 2892.main: 2893%if ARCH_X86_64 2894 mova m11, [o(pd_2048)] 2895 mova m12, [o(clip_18b_min)] 2896 mova m13, [o(clip_18b_max)] 2897 mova m14, [o(pd_2896)] 2898%endif 2899 mova m0, [cq+ 2*16] 2900 mova m1, [cq+13*16] 2901 mova m2, [cq+ 6*16] 2902 mova m3, [cq+ 9*16] 2903 mova m4, [cq+10*16] 2904 mova m5, [cq+ 5*16] 2905 mova m6, [cq+14*16] 2906 mova m7, [cq+ 1*16] 2907 call .main_part1 2908 mova m0, [cq+ 0*16] 2909 mova m1, [cq+15*16] 2910 mova m2, [cq+ 4*16] 2911 mova m3, [cq+11*16] 2912 mova m4, [cq+ 8*16] 2913 mova m5, [cq+ 7*16] 2914 mova m6, [cq+12*16] 2915 mova m7, [cq+ 3*16] 2916 call .main_part2 2917.round: 2918%if ARCH_X86_64 2919 mova m15, [o(pd_6144)] 2920 psrld m14, 11 ; pd_1 2921 pcmpeqd m8, m8 ; -1 2922 psubd m13, m15, m14 ; pd_6143 2923 REPX {paddd x, m14}, m0, m2 2924 REPX {paddd x, m15}, m4, m6 2925 REPX {pxor x, m8 }, m1, m3, m5, m7 2926 REPX {psrad x, 1 }, m1, m3 2927 REPX {paddd x, m15}, m5, m7 2928 REPX {psubd x, m8 }, m1, m3 2929 paddd m8, m15, m9 2930 psubd m9, m13, m10 2931 paddd m10, m15, m11 2932 psubd m11, m13, m12 2933 paddd m12, m14, [r3+3*16] 2934 psubd m13, m14, [r3+2*16] 2935 psubd m15, m14, [r3+0*16] 2936 paddd m14, [r3+1*16] 2937 REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15 2938 REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 2939%else 2940 mova [r3+8*16], m1 2941 mova [r3+9*16], m3 2942 mova m3, [o(pd_6144)] 2943 pcmpeqd m1, m1 2944 REPX {pxor x, m1}, m5, m7 2945 REPX {paddd x, m3}, m4, m5, m6, m7 2946 REPX {psrad x, 13}, m4, m5, m6, m7 2947 packssdw m4, m5 2948 packssdw m6, m7 2949 mova [r3+10*16], m4 2950 mova [r3+11*16], m6 2951 mova m4, [r3+4*16] 2952 mova m5, [r3+5*16] 2953 mova m6, [r3+6*16] 2954 mova m7, [r3+7*16] 2955 REPX {pxor x, m1}, m5, m7 2956 REPX {psubd x, m1}, m4, m6 2957 REPX {psrad x, 1 }, m4, m5, m6, m7 2958 REPX {psubd x, m1}, m5, m7 2959 packssdw m4, m5 2960 packssdw m6, m7 2961 mova m5, [r3+8*16] 2962 mova m7, [r3+9*16] 2963 mova [r3+8*16], m4 2964 mova [r3+9*16], m6 2965 REPX {pxor x, m1}, m5, m7 2966 REPX {paddd x, m3}, m0, m5, m2, m7 2967 REPX {psrad x, 13}, m0, m5, m2, m7 2968 packssdw m0, m5 2969 packssdw m2, m7 2970 mova m4, [r3+0*16] 2971 mova m5, [r3+1*16] 2972 mova m6, [r3+2*16] 2973 mova m7, [r3+3*16] 2974 REPX {psubd x, m1}, m4, m6 2975 REPX {pxor x, m1}, m5, m7 2976 REPX {psrad x, 1 }, m4, m5, m6, m7 2977 REPX {psubd x, m1}, m5, m7 2978 packssdw m4, m5 2979 packssdw m6, m7 2980%endif 2981 ret 2982 2983.main_part2: 2984%if ARCH_X86_64 2985 ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091 2986 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703 2987 ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751 2988 ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380 2989 psubd m8, m0, m4 ; t8a 2990 paddd m0, m4 ; t0a 2991 psubd m4, m1, m5 ; t9a 2992 paddd m1, m5 ; t1a 2993 psubd m5, m2, m6 ; t12a 2994 paddd m2, m6 ; t4a 2995 
psubd m6, m3, m7 ; t13a 2996 paddd m7, m3 ; t5a 2997 REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7 2998 REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 2999 mova m15, [o(pd_4017)] 3000 mova m10, [o(pd_799)] 3001 ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15 3002 ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10 3003 psubd m3, m0, m2 ; t4 3004 paddd m0, m2 ; t0 3005 psubd m2, m1, m7 ; t5 3006 paddd m1, m7 ; t1 3007 psubd m7, m4, m6 ; t12a 3008 paddd m4, m6 ; t8a 3009 psubd m6, m8, m5 ; t13a 3010 paddd m5, m8 ; t9a 3011 REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5 3012 REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 3013 mova m15, [o(pd_3784)] 3014 mova m10, [o(pd_1567)] 3015 ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15 3016 ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15 3017 mova m10, [r3+0*16] ; t2 3018 mova m8, [r3+1*16] ; t3 3019 psubd m9, m0, m10 ; t2a 3020 paddd m0, m10 ; out0 3021 psubd m10, m1, m8 ; t3a 3022 paddd m1, m8 ; -out15 3023 mova [r3+0*16], m1 3024 mova m15, [r3+3*16] ; t7a 3025 mova m1, [r3+2*16] ; t6a 3026 psubd m8, m3, m15 ; t7 3027 paddd m15, m3 ; out12 3028 paddd m3, m2, m1 ; -out3 3029 psubd m2, m1 ; t6 3030 mova [r3+3*16], m15 3031 mova [r3+1*16], m2 3032 mova m1, [r3+7*16] ; t15 3033 mova m2, [r3+6*16] ; t14 3034 paddd m15, m7, m1 ; -out13 3035 psubd m7, m1 ; t15a 3036 psubd m11, m6, m2 ; t14a 3037 paddd m2, m6 ; out2 3038 mova [r3+2*16], m15 3039 mova m1, [r3+4*16] ; t10a 3040 mova m15, [r3+5*16] ; t11a 3041 psubd m6, m4, m1 ; t10 3042 paddd m1, m4 ; -out1 3043 psubd m4, m5, m15 ; t11 3044 paddd m5, m15 ; out14 3045 REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8 3046 pmaxsd m12, [r3+1*16] ; t6 3047 mova [r3+1*16], m5 3048 REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8 3049 REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8 3050 paddd m5, m11, m7 ; -out5 (unshifted) 3051 psubd m11, m7 ; out10 (unshifted) 3052 paddd m7, m9, m10 ; -out7 (unshifted) 3053 psubd m9, m10 ; out8 (unshifted) 3054 psubd m10, m6, m4 ; -out9 (unshifted) 3055 paddd m6, m4 ; out6 (unshifted) 3056 paddd m4, m12, m8 ; out4 (unshifted) 3057 psubd m12, m8 ; -out11 (unshifted) 3058%else 3059 mova [r3+8*16], m0 3060 mova [r3+9*16], m1 3061 mova [r3+10*16], m2 3062 mova [r3+11*16], m3 3063 mova m3, [o(pd_2048)] 3064 ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751 3065 ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380 3066 mova m0, [r3+8*16] 3067 mova m1, [r3+9*16] 3068 mova [r3+8*16], m4 3069 mova m4, [r3+10*16] 3070 mova [r3+9*16], m5 3071 mova [r3+10*16], m6 3072 mova m5, [r3+11*16] 3073 mova [r3+11*16], m7 3074 ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091 3075 ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703 3076 mova m2, [r3+8*16] 3077 mova m6, [r3+9*16] 3078 psubd m3, m0, m2 ; t8a 3079 paddd m0, m2 ; t0a 3080 mova [r3+8*16], m3 3081 psubd m2, m1, m6 ; t9a 3082 paddd m1, m6 ; t1a 3083 mova m3, [r3+10*16] 3084 psubd m6, m4, m3 ; t12a 3085 paddd m4, m3 ; t4a 3086 mova m3, [r3+11*16] 3087 psubd m7, m5, m3 ; t13a 3088 paddd m5, m3 ; t5a 3089 mova m3, [o(clip_18b_min)] 3090 REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5 3091 pmaxsd m3, [r3+8*16] 3092 mova [r3+8*16], m3 3093 mova m3, [o(clip_18b_max)] 3094 REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5 3095 pminsd m3, [r3+8*16] 3096 mova [r3+8*16], m3 3097 psubd m3, m0, m4 ; t4 3098 paddd m0, m4 ; t0 3099 psubd m4, m1, m5 ; t5 3100 paddd m1, m5 ; t1 3101 mova m5, [o(pd_2048)] 3102 mova [r3+9*16], m1 3103 mova [r3+10*16], m4 3104 mova [r3+11*16], m3 3105 mova m3, [r3+8*16] 3106 mova [r3+8*16], m0 3107 ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017 3108 
ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4 3109 psubd m5, m2, m7 ; t12a 3110 paddd m2, m7 ; t8a 3111 psubd m7, m3, m6 ; t13a 3112 paddd m6, m3 ; t9a 3113 mova m0, [r3+8*16] 3114 mova m1, [r3+9*16] 3115 mova m4, [r3+10*16] 3116 mova m3, [o(clip_18b_min)] 3117 REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6 3118 pmaxsd m3, [r3+11*16] 3119 mova [r3+8*16], m3 3120 mova m3, [o(clip_18b_max)] 3121 REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6 3122 pminsd m3, [r3+8*16] 3123 mova [r3+8*16], m0 3124 mova [r3+9*16], m1 3125 mova [r3+10*16], m2 3126 mova [r3+11*16], m6 3127 mova m0, [o(pd_2048)] 3128 ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784 3129 ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784 3130 mova m0, [r3+7*16] ; t7a 3131 mova m2, [r3+6*16] ; t6a 3132 psubd m1, m3, m0 ; t7 3133 paddd m0, m3 ; out12 3134 paddd m3, m4, m2 ; -out3 3135 psubd m4, m2 ; t6 3136 mova [r3+7*16], m3 3137 mova m3, [r3+3*16] ; t15 3138 mova m2, [r3+2*16] ; t14 3139 paddd m6, m5, m3 ; -out13 3140 psubd m5, m3 ; t15a 3141 psubd m3, m7, m2 ; t14a 3142 paddd m2, m7 ; out2 3143 mova [r3+6*16], m2 3144 mova m7, [r3+0*16] ; t10a 3145 mova m2, [r3+1*16] ; t11a 3146 mova [r3+0*16], m0 3147 mova [r3+1*16], m6 3148 mova m6, [r3+11*16] 3149 psubd m0, m6, m2 ; t11 3150 paddd m6, m2 ; out14 3151 mova [r3+2*16], m6 3152 mova m2, [r3+10*16] 3153 psubd m6, m2, m7 ; t10 3154 paddd m2, m7 ; -out1 3155 mova m7, [r3+5*16] ; t3 3156 mova [r3+5*16], m2 3157 mova [r3+10*16], m1 3158 mova m1, [r3+9*16] 3159 psubd m2, m1, m7 ; t3a 3160 paddd m1, m7 ; -out15 3161 mova [r3+3*16], m1 3162 mova m1, [r3+4*16] ; t2 3163 mova m7, [r3+8*16] 3164 psubd m7, m1 ; t2a 3165 paddd m1, [r3+8*16] ; out0 3166 mova [r3+4*16], m1 3167 mova m1, [o(clip_18b_min)] 3168 REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7 3169 pmaxsd m1, [r3+10*16] 3170 mova [r3+10*16], m1 3171 mova m1, [o(clip_18b_max)] 3172 REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7 3173 pminsd m1, [r3+10*16] 3174 mova [r3+10*16], m1 3175 mova m1, [o(pd_2896)] 3176 REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7 3177 pmulld m1, [r3+10*16] 3178 mova [r3+11*16], m3 3179 psubd m3, m4, m1 ; -out11 (unshifted) 3180 paddd m4, m1 ; out4 (unshifted) 3181 psubd m1, m6, m0 ; -out9 (unshifted) 3182 paddd m6, m0 ; out6 (unshifted) 3183 psubd m0, m7, m2 ; out8 (unshifted) 3184 paddd m7, m2 ; -out7 (unshifted) 3185 mova m2, [r3+11*16] 3186 mova [r3+11*16], m5 3187 paddd m5, m2 ; -out5 (unshifted) 3188 psubd m2, [r3+11*16] ; out10 (unshifted) 3189 ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted) 3190 ; r[-4,3] contain out0-3 and out12-15 3191%endif 3192 ret 3193.main_part1: 3194%if ARCH_X86_64 3195 ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973 3196 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290 3197 ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106 3198 ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601 3199 psubd m8, m0, m4 ; t10a 3200 paddd m0, m4 ; t2a 3201 psubd m4, m1, m5 ; t11a 3202 paddd m1, m5 ; t3a 3203 psubd m5, m2, m6 ; t14a 3204 paddd m2, m6 ; t6a 3205 psubd m6, m3, m7 ; t15a 3206 paddd m7, m3 ; t7a 3207 REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7 3208 REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 3209 mova m15, [o(pd_2276)] 3210 mova m10, [o(pd_3406)] 3211 ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15 3212 ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10 3213 psubd m3, m0, m2 ; t6 3214 paddd m0, m2 ; t2 3215 psubd m2, m1, m7 ; t7 3216 paddd m1, m7 ; t3 3217 psubd m7, m4, m6 ; t14a 3218 paddd m4, m6 ; t10a 3219 psubd m6, m8, m5 ; t15a 3220 paddd m5, m8 ; t11a 3221 REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, 
m1, m4, m5 3222 REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 3223 mova m15, [o(pd_1567)] 3224 mova m10, [o(pd_3784)] 3225 ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15 3226 ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15 3227 mova [r3+0*16], m0 3228 mova [r3+1*16], m1 3229 mova [r3+4*16], m4 3230 mova [r3+5*16], m5 3231 mova [r3+2*16], m2 3232 mova [r3+3*16], m3 3233 mova [r3+6*16], m6 3234 mova [r3+7*16], m7 3235%else 3236 mova [r3+4*16], m0 3237 mova [r3+5*16], m1 3238 mova [r3+6*16], m2 3239 mova [r3+7*16], m3 3240 mova m3, [o(pd_2048)] 3241 ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106 3242 ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601 3243 mova [r3+0*16], m4 3244 mova [r3+1*16], m5 3245 mova [r3+2*16], m6 3246 mova [r3+3*16], m7 3247 mova m0, [r3+4*16] 3248 mova m1, [r3+5*16] 3249 mova m2, [r3+6*16] 3250 mova m7, [r3+7*16] 3251 ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973 3252 ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290 3253 mova m4, [r3+0*16] 3254 mova m5, [r3+1*16] 3255 psubd m6, m0, m4 ; t10a 3256 paddd m0, m4 ; t2a 3257 mova [r3+4*16], m6 3258 mova m6, [r3+2*16] 3259 mova m3, [r3+3*16] 3260 psubd m4, m1, m5 ; t11a 3261 paddd m1, m5 ; t3a 3262 psubd m5, m2, m6 ; t14a 3263 paddd m2, m6 ; t6a 3264 psubd m6, m7, m3 ; t15a 3265 paddd m7, m3 ; t7a 3266 mova m3, [o(clip_18b_min)] 3267 REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7 3268 pmaxsd m3, [r3+4*16] 3269 mova [r3+4*16], m3 3270 mova m3, [o(clip_18b_max)] 3271 REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7 3272 pminsd m3, [r3+4*16] 3273 mova [r3+4*16], m3 3274 psubd m3, m0, m2 ; t6 3275 paddd m0, m2 ; t2 3276 psubd m2, m1, m7 ; t7 3277 paddd m1, m7 ; t3 3278 mova [r3+5*16], m1 3279 mova [r3+6*16], m3 3280 mova [r3+7*16], m2 3281 mova m1, [r3+4*16] 3282 mova [r3+4*16], m0 3283 mova m3, [o(pd_2048)] 3284 ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276 3285 ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2 3286 psubd m7, m4, m6 ; t14a 3287 paddd m4, m6 ; t10a 3288 psubd m6, m1, m5 ; t15a 3289 paddd m5, m1 ; t11a 3290 mova m1, [r3+5*16] 3291 mova m3, [r3+6*16] 3292 mova m2, [r3+7*16] 3293 mova m0, [o(clip_18b_min)] 3294 REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5 3295 pmaxsd m0, [r3+4*16] 3296 mova [r3+4*16], m0 3297 mova m0, [o(clip_18b_max)] 3298 REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5 3299 pminsd m0, [r3+4*16] 3300 mova [r3+4*16], m0 3301 mova [r3+5*16], m1 3302 mova [r3+0*16], m4 3303 mova [r3+1*16], m5 3304 mova m0, [o(pd_2048)] 3305 ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567 3306 ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567 3307 mova [r3+6*16], m2 3308 mova [r3+7*16], m3 3309 mova [r3+2*16], m6 3310 mova [r3+3*16], m7 3311%endif 3312 ret 3313 3314.pass2: 3315 lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)] 3316 jmp m(idct_16x4_internal_16bpc).pass2_loop 3317 3318INV_TXFM_16X4_FN flipadst, dct 3319INV_TXFM_16X4_FN flipadst, adst 3320INV_TXFM_16X4_FN flipadst, flipadst 3321INV_TXFM_16X4_FN flipadst, identity 3322 3323cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3324 lea r3, [rsp+gprsize] 3325 call m(iadst_16x4_internal_16bpc).main 3326%if ARCH_X86_64 3327 packssdw m1, m0 3328 packssdw m3, m2 3329 packssdw m5, m4 3330 packssdw m7, m6 3331 packssdw m9, m8 3332 packssdw m11, m10 3333 packssdw m13, m12 3334 packssdw m15, m14 3335 mova m0, m15 3336 mova m2, m13 3337 mova m4, m11 3338 mova m6, m9 3339 mova m8, m7 3340 mova m10, m5 3341 mova m12, m3 3342 mova m14, m1 3343 jmp m(idct_16x4_internal_16bpc).transpose 3344%else 3345 mova [rsp+gprsize+4*16], m0 3346 mova [rsp+gprsize+5*16], m2 3347 mova [rsp+gprsize+6*16], m4 
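    ; the flip is done purely by how the adst results are read back below:
    ; reversed order plus a dword swap (pshufd q1032) per packed register
    ; before the shared transpose, i.e. roughly (illustrative names)
    ;   flipadst_out[i] = adst_out[15 - i], i = 0..15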
3348 mova [rsp+gprsize+7*16], m6 3349 pshufd m6, [rsp+gprsize+ 8*16], q1032 3350 pshufd m4, [rsp+gprsize+ 9*16], q1032 3351 pshufd m2, [rsp+gprsize+10*16], q1032 3352 pshufd m0, [rsp+gprsize+11*16], q1032 3353 call m(idct_8x4_internal_16bpc).transpose4x8packed 3354 mova [rsp+gprsize+0*16], m0 3355 mova [rsp+gprsize+1*16], m1 3356 mova [rsp+gprsize+2*16], m2 3357 mova [rsp+gprsize+3*16], m3 3358 pshufd m6, [rsp+gprsize+ 4*16], q1032 3359 pshufd m4, [rsp+gprsize+ 5*16], q1032 3360 pshufd m2, [rsp+gprsize+ 6*16], q1032 3361 pshufd m0, [rsp+gprsize+ 7*16], q1032 3362 call m(idct_8x4_internal_16bpc).transpose4x8packed 3363 jmp tx2q 3364%endif 3365 3366.pass2: 3367 lea r3, [strideq*3] 3368 lea dstq, [dstq+r3] 3369 neg strideq 3370 lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)] 3371 jmp m(idct_16x4_internal_16bpc).pass2_loop 3372 3373INV_TXFM_16X4_FN identity, dct 3374INV_TXFM_16X4_FN identity, adst 3375INV_TXFM_16X4_FN identity, flipadst 3376INV_TXFM_16X4_FN identity, identity 3377 3378cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3379%if ARCH_X86_64 3380 mova m15, [o(pd_11586)] 3381 pmulld m0, m15, [cq+ 0*16] 3382 pmulld m1, m15, [cq+ 1*16] 3383 pmulld m2, m15, [cq+ 2*16] 3384 pmulld m3, m15, [cq+ 3*16] 3385 pmulld m4, m15, [cq+ 4*16] 3386 pmulld m5, m15, [cq+ 5*16] 3387 pmulld m6, m15, [cq+ 6*16] 3388 pmulld m7, m15, [cq+ 7*16] 3389 pmulld m8, m15, [cq+ 8*16] 3390 pmulld m9, m15, [cq+ 9*16] 3391 pmulld m10, m15, [cq+10*16] 3392 pmulld m11, m15, [cq+11*16] 3393 pmulld m12, m15, [cq+12*16] 3394 pmulld m13, m15, [cq+13*16] 3395 pmulld m14, m15, [cq+14*16] 3396 pmulld m15, [cq+15*16] 3397 mova [cq+ 0*16], m15 3398 mova m15, [o(pd_6144)] 3399 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3400 m8, m9, m10, m11, m12, m13, m14 3401 paddd m15, [cq+ 0*16] 3402 REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ 3403 m8, m9, m10, m11, m12, m13, m14, m15 3404 jmp m(idct_16x4_internal_16bpc).pack_transpose 3405%else 3406 add cq, 8*16 3407 mov r5d, 2 3408.loop_pass1: 3409 mova m7, [o(pd_11586)] 3410 pmulld m0, m7, [cq+0*16] 3411 pmulld m1, m7, [cq+1*16] 3412 pmulld m2, m7, [cq+2*16] 3413 pmulld m3, m7, [cq+3*16] 3414 pmulld m4, m7, [cq+4*16] 3415 pmulld m5, m7, [cq+5*16] 3416 pmulld m6, m7, [cq+6*16] 3417 pmulld m7, [cq+7*16] 3418 mova [cq+7*16], m7 3419 mova m7, [o(pd_6144)] 3420 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 3421 paddd m7, [cq+7*16] 3422 REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 3423 packssdw m0, m1 3424 packssdw m2, m3 3425 packssdw m4, m5 3426 packssdw m6, m7 3427 call m(idct_8x4_internal_16bpc).transpose4x8packed 3428 dec r5d 3429 jz .end_pass1 3430 mova [rsp+gprsize+0*16], m0 3431 mova [rsp+gprsize+1*16], m1 3432 mova [rsp+gprsize+2*16], m2 3433 mova [rsp+gprsize+3*16], m3 3434 sub cq, 8*16 3435 jmp .loop_pass1 3436.end_pass1: 3437 jmp tx2q 3438%endif 3439 3440.pass2: 3441%if ARCH_X86_64 3442 mova m12, [o(pw_1697x8)] 3443%endif 3444 lea r4, [o(.main)] 3445 jmp m(idct_16x4_internal_16bpc).pass2_loop 3446.main: 3447%if ARCH_X86_64 3448 pmulhrsw m4, m0, m12 3449 pmulhrsw m5, m1, m12 3450 pmulhrsw m6, m2, m12 3451 pmulhrsw m7, m3, m12 3452%else 3453 mova m7, [o(pw_1697x8)] 3454 pmulhrsw m4, m0, m7 3455 pmulhrsw m5, m1, m7 3456 pmulhrsw m6, m2, m7 3457 pmulhrsw m7, m3 3458%endif 3459 paddsw m0, m4 3460 paddsw m1, m5 3461 paddsw m2, m6 3462 paddsw m3, m7 3463 ret 3464 3465%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset 3466%if ARCH_X86_64 3467 INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16 3468%else 3469 INV_TXFM_FN %1, %2, %3, 
16x8, 8, 0-13*16 3470%endif 3471%ifidn %1_%2, dct_dct 3472 imul r5d, [cq], 181 3473 mov [cq], eobd ; 0 3474 mov r3d, 8 3475 add r5d, 128 3476 sar r5d, 8 3477 imul r5d, 181 3478%if ARCH_X86_32 3479 add rsp, 1*16 3480%endif 3481 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly 3482%endif 3483%endmacro 3484 3485INV_TXFM_16X8_FN dct, dct 3486INV_TXFM_16X8_FN dct, identity, 6 3487INV_TXFM_16X8_FN dct, adst 3488INV_TXFM_16X8_FN dct, flipadst 3489 3490cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3491%if ARCH_X86_64 3492 DECLARE_REG_TMP 6, 4, 6 3493%else 3494 mov [rsp+gprsize+12*16], r1 3495 DECLARE_REG_TMP 1, 4, 3 3496%endif 3497 lea t0, [o(.main)] 3498.loop_main: 3499%undef cmp 3500%if ARCH_X86_64 3501 xor r5d, r5d 3502 cmp eobd, 10 3503 setge r5b 3504%else 3505 mov r5d, 1 3506 cmp eobd, 10 3507 sbb r5d, 0 3508%endif 3509 shl r5d, 4 3510 3511 lea r3, [rsp+gprsize] 3512.loop_pass1: 3513 call t0 3514%if ARCH_X86_64 3515 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 3516 mova [cq+4*32+r5], m8 3517 mova [cq+5*32+r5], m9 3518 mova [cq+6*32+r5], m10 3519 mova [cq+7*32+r5], m11 3520%else 3521 call m(idct_8x4_internal_16bpc).transpose4x8packed 3522 mova [cq+4*32+r5], m0 3523 mova [cq+5*32+r5], m1 3524 mova [cq+6*32+r5], m2 3525 mova [cq+7*32+r5], m3 3526 mova m0, [rsp+gprsize+ 8*16] 3527 mova m2, [rsp+gprsize+ 9*16] 3528 mova m4, [rsp+gprsize+10*16] 3529 mova m6, [rsp+gprsize+11*16] 3530%endif 3531 call m(idct_8x4_internal_16bpc).transpose4x8packed 3532 pxor m7, m7 3533 REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15 3534 test r5d, r5d 3535 jz .end 3536 mova [cq+0*32+r5], m0 3537 mova [cq+1*32+r5], m1 3538 mova [cq+2*32+r5], m2 3539 mova [cq+3*32+r5], m3 3540 xor r5d, r5d 3541 jmp .loop_pass1 3542.end: 3543 3544 jmp tx2q 3545.main: 3546%if ARCH_X86_64 3547 mova m11, [o(pd_2048)] 3548 mova m12, [o(clip_18b_min)] 3549 mova m13, [o(clip_18b_max)] 3550 mova m14, [o(pd_2896)] 3551%endif 3552 mova m0, [cq+ 1*32+r5] 3553 mova m1, [cq+ 3*32+r5] 3554 mova m2, [cq+ 5*32+r5] 3555 mova m3, [cq+ 7*32+r5] 3556 mova m4, [cq+ 9*32+r5] 3557 mova m5, [cq+11*32+r5] 3558 mova m6, [cq+13*32+r5] 3559 mova m7, [cq+15*32+r5] 3560 call m(idct_8x4_internal_16bpc).rect2_mul 3561 call m(idct_16x4_internal_16bpc).main_oddhalf 3562 3563 mova m0, [cq+ 0*32+r5] 3564 mova m1, [cq+ 2*32+r5] 3565 mova m2, [cq+ 4*32+r5] 3566 mova m3, [cq+ 6*32+r5] 3567 mova m4, [cq+ 8*32+r5] 3568 mova m5, [cq+10*32+r5] 3569 mova m6, [cq+12*32+r5] 3570 mova m7, [cq+14*32+r5] 3571 call m(idct_8x4_internal_16bpc).rect2_mul 3572 call m(idct_8x4_internal_16bpc).main_pass1 3573 call m(idct_8x4_internal_16bpc).round 3574 call m(idct_16x4_internal_16bpc).round 3575%if ARCH_X86_64 3576 packssdw m0, m1 3577 packssdw m2, m3 3578 packssdw m4, m5 3579 packssdw m6, m7 3580 packssdw m8, m9 3581 packssdw m10, m11 3582 packssdw m12, m13 3583 packssdw m14, m15 3584%endif 3585 ret 3586 3587.pass2: 3588%if ARCH_X86_32 3589 mov strideq, [rsp+gprsize+12*16] 3590%endif 3591 mov r4d, 2 3592.pass2_main: 3593%if ARCH_X86_64 3594 mova m8, [o(pw_2048)] 3595 pxor m9, m9 3596 mova m10, [o(pixel_10bpc_max)] 3597%endif 3598 lea r3, [strideq*3] 3599 jmp .loop_pass2_entry 3600.loop_pass2: 3601 mova m0, [cq+0*32+ 0] 3602 mova m1, [cq+1*32+ 0] 3603 mova m2, [cq+2*32+ 0] 3604 mova m3, [cq+3*32+ 0] 3605.loop_pass2_entry: 3606 mova m4, [cq+0*32+16] 3607 mova m5, [cq+1*32+16] 3608 mova m6, [cq+2*32+16] 3609 mova m7, [cq+3*32+16] 3610%if ARCH_X86_32 3611 lea r5, [o(itx8_start)] 3612%endif 3613 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main 3614 call 
m(idct_8x8_internal_16bpc).round2_and_write_8x8 3615%if ARCH_X86_64 3616%define mzero m9 3617%else 3618%define mzero m7 3619 pxor m7, m7 3620%endif 3621 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 3622 add dstq, 16 3623 add cq, 4*32 3624 dec r4d 3625 jg .loop_pass2 3626 RET 3627 3628INV_TXFM_16X8_FN adst, dct 3629INV_TXFM_16X8_FN adst, adst 3630INV_TXFM_16X8_FN adst, flipadst 3631INV_TXFM_16X8_FN adst, identity, 6 3632 3633cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3634%if ARCH_X86_32 3635 mov [rsp+gprsize+12*16], r1 3636%endif 3637 lea t0, [o(.main)] 3638 jmp m(idct_16x8_internal_16bpc).loop_main 3639 3640.main: 3641%if ARCH_X86_64 3642 mova m11, [o(pd_2048)] 3643 mova m12, [o(clip_18b_min)] 3644 mova m13, [o(clip_18b_max)] 3645 mova m14, [o(pd_2896)] 3646%endif 3647 mova m0, [cq+ 2*32+r5] 3648 mova m1, [cq+13*32+r5] 3649 mova m2, [cq+ 6*32+r5] 3650 mova m3, [cq+ 9*32+r5] 3651 mova m4, [cq+10*32+r5] 3652 mova m5, [cq+ 5*32+r5] 3653 mova m6, [cq+14*32+r5] 3654 mova m7, [cq+ 1*32+r5] 3655 call m(idct_8x4_internal_16bpc).rect2_mul 3656 call m(iadst_16x4_internal_16bpc).main_part1 3657 mova m0, [cq+ 0*32+r5] 3658 mova m1, [cq+15*32+r5] 3659 mova m2, [cq+ 4*32+r5] 3660 mova m3, [cq+11*32+r5] 3661 mova m4, [cq+ 8*32+r5] 3662 mova m5, [cq+ 7*32+r5] 3663 mova m6, [cq+12*32+r5] 3664 mova m7, [cq+ 3*32+r5] 3665%if ARCH_X86_32 3666 add r3, 8*16 3667%endif 3668 call m(idct_8x4_internal_16bpc).rect2_mul 3669%if ARCH_X86_32 3670 sub r3, 8*16 3671%endif 3672 call m(iadst_16x4_internal_16bpc).main_part2 3673 call m(iadst_16x4_internal_16bpc).round 3674%if ARCH_X86_64 3675 packssdw m0, m1 3676 packssdw m2, m3 3677 packssdw m4, m5 3678 packssdw m6, m7 3679 packssdw m8, m9 3680 packssdw m10, m11 3681 packssdw m12, m13 3682 packssdw m14, m15 3683%endif 3684 ret 3685 3686.pass2: 3687%if ARCH_X86_32 3688 mov strideq, [rsp+gprsize+12*16] 3689%endif 3690 mov r4d, 2 3691%if ARCH_X86_64 3692 mova m8, [o(pw_2048)] 3693 pxor m9, m9 3694 mova m10, [o(pixel_10bpc_max)] 3695 mova m11, [o(pw_m2048)] 3696%endif 3697 lea r3, [strideq*3] 3698 jmp .loop_pass2_entry 3699.loop_pass2: 3700 mova m0, [cq+0*32+ 0] 3701 mova m1, [cq+1*32+ 0] 3702 mova m2, [cq+2*32+ 0] 3703 mova m3, [cq+3*32+ 0] 3704.loop_pass2_entry: 3705 mova m4, [cq+0*32+16] 3706 mova m5, [cq+1*32+16] 3707 mova m6, [cq+2*32+16] 3708 mova m7, [cq+3*32+16] 3709%if ARCH_X86_32 3710 lea r5, [o(itx8_start)] 3711%endif 3712 call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main 3713 call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end 3714 call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 3715%if ARCH_X86_64 3716%define mzero m9 3717%else 3718%define mzero m7 3719 pxor m7, m7 3720%endif 3721 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 3722 add dstq, 16 3723 add cq, 4*32 3724 dec r4d 3725 jg .loop_pass2 3726 RET 3727 3728INV_TXFM_16X8_FN flipadst, dct 3729INV_TXFM_16X8_FN flipadst, adst 3730INV_TXFM_16X8_FN flipadst, flipadst 3731INV_TXFM_16X8_FN flipadst, identity, 6 3732 3733cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3734%if ARCH_X86_32 3735 mov [rsp+gprsize+12*16], r1 3736%endif 3737 lea t0, [o(.main)] 3738 jmp m(idct_16x8_internal_16bpc).loop_main 3739.main: 3740 call m(iadst_16x8_internal_16bpc).main 3741%if ARCH_X86_64 3742 pshufd m1, m0, q1032 3743 pshufd m3, m2, q1032 3744 pshufd m5, m4, q1032 3745 pshufd m7, m6, q1032 3746 pshufd m0, m14, q1032 3747 pshufd m2, m12, q1032 3748 pshufd m4, m10, q1032 3749 pshufd m6, m8, q1032 3750 mova m14, m1 3751 mova m12, m3 3752 mova m10, m5 
3753 mova m8, m7 3754%else 3755 pshufd m1, m0, q1032 3756 pshufd m3, m2, q1032 3757 pshufd m5, m4, q1032 3758 pshufd m7, m6, q1032 3759 pshufd m0, [r3+11*16], q1032 3760 pshufd m2, [r3+10*16], q1032 3761 pshufd m4, [r3+9*16], q1032 3762 pshufd m6, [r3+8*16], q1032 3763 mova [r3+8*16], m7 3764 mova [r3+9*16], m5 3765 mova [r3+10*16], m3 3766 mova [r3+11*16], m1 3767%endif 3768 ret 3769 3770.pass2: 3771%if ARCH_X86_32 3772 mov strideq, [rsp+gprsize+12*16] 3773%endif 3774 lea dstq, [dstq+strideq*8] 3775 neg strideq 3776 add dstq, strideq 3777%if ARCH_X86_32 3778 mov [rsp+gprsize+12*16], strideq 3779%endif 3780 jmp m(iadst_16x8_internal_16bpc).pass2 3781 3782INV_TXFM_16X8_FN identity, dct, -54 3783INV_TXFM_16X8_FN identity, adst, -54 3784INV_TXFM_16X8_FN identity, flipadst, -54 3785INV_TXFM_16X8_FN identity, identity 3786 3787cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3788%if ARCH_X86_32 3789 mov [rsp+gprsize+12*16], r1 3790%endif 3791 lea t0, [o(.main)] 3792 jmp m(idct_16x8_internal_16bpc).loop_main 3793.main: 3794%if ARCH_X86_64 3795 mova m15, [o(pd_2896)] 3796 pmulld m0, m15, [cq+ 0*32+r5] 3797 pmulld m1, m15, [cq+ 1*32+r5] 3798 pmulld m2, m15, [cq+ 2*32+r5] 3799 pmulld m3, m15, [cq+ 3*32+r5] 3800 pmulld m4, m15, [cq+ 4*32+r5] 3801 pmulld m5, m15, [cq+ 5*32+r5] 3802 pmulld m6, m15, [cq+ 6*32+r5] 3803 pmulld m7, m15, [cq+ 7*32+r5] 3804 pmulld m8, m15, [cq+ 8*32+r5] 3805 pmulld m9, m15, [cq+ 9*32+r5] 3806 pmulld m10, m15, [cq+10*32+r5] 3807 pmulld m11, m15, [cq+11*32+r5] 3808 pmulld m12, m15, [cq+12*32+r5] 3809 pmulld m13, m15, [cq+13*32+r5] 3810 pmulld m14, m15, [cq+14*32+r5] 3811 pmulld m15, [cq+15*32+r5] 3812 mova [r3], m15 3813 mova m15, [o(pd_2048)] 3814 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3815 m8, m9, m10, m11, m12, m13, m14 3816 paddd m15, [r3] 3817 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ 3818 m8, m9, m10, m11, m12, m13, m14, m15 3819 mova [r3], m15 3820 mova m15, [o(pd_11586)] 3821 REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3822 m8, m9, m10, m11, m12, m13, m14 3823 pmulld m15, [r3] 3824 mova [r3], m15 3825 mova m15, [o(pd_6144)] 3826 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3827 m8, m9, m10, m11, m12, m13, m14 3828 paddd m15, [r3] 3829 REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ 3830 m8, m9, m10, m11, m12, m13, m14, m15 3831 packssdw m0, m1 3832 packssdw m2, m3 3833 packssdw m4, m5 3834 packssdw m6, m7 3835 packssdw m8, m9 3836 packssdw m10, m11 3837 packssdw m12, m13 3838 packssdw m14, m15 3839%else 3840 mova m0, [cq+ 0*32+r5] 3841 mova m1, [cq+ 1*32+r5] 3842 mova m2, [cq+ 2*32+r5] 3843 mova m3, [cq+ 3*32+r5] 3844 mova m4, [cq+ 4*32+r5] 3845 mova m5, [cq+ 5*32+r5] 3846 mova m6, [cq+ 6*32+r5] 3847 mova m7, [cq+ 7*32+r5] 3848 call m(idct_8x4_internal_16bpc).rect2_mul 3849 mova [r3], m7 3850 mova m7, [o(pd_11586)] 3851 REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 3852 pmulld m7, [r3] 3853 mova [r3], m7 3854 mova m7, [o(pd_6144)] 3855 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 3856 paddd m7, [r3] 3857 REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 3858 packssdw m0, m1 3859 packssdw m2, m3 3860 packssdw m4, m5 3861 packssdw m6, m7 3862 mova [r3+ 8*16], m0 3863 mova [r3+ 9*16], m2 3864 mova [r3+10*16], m4 3865 mova [r3+11*16], m6 3866 mova m0, [cq+ 8*32+r5] 3867 mova m1, [cq+ 9*32+r5] 3868 mova m2, [cq+10*32+r5] 3869 mova m3, [cq+11*32+r5] 3870 mova m4, [cq+12*32+r5] 3871 mova m5, [cq+13*32+r5] 3872 mova m6, [cq+14*32+r5] 3873 mova m7, [cq+15*32+r5] 3874 call 
m(idct_8x4_internal_16bpc).rect2_mul 3875 mova [r3], m7 3876 mova m7, [o(pd_11586)] 3877 REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 3878 pmulld m7, [r3] 3879 mova [r3], m7 3880 mova m7, [o(pd_6144)] 3881 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 3882 paddd m7, [r3] 3883 REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 3884 packssdw m0, m1 3885 packssdw m2, m3 3886 packssdw m4, m5 3887 packssdw m6, m7 3888%endif 3889 ret 3890.pass2: 3891%if ARCH_X86_32 3892 mov strideq, [rsp+gprsize+12*16] 3893%endif 3894 mov r4d, 2 3895%if ARCH_X86_64 3896 mova m8, [o(pw_4096)] 3897 pxor m9, m9 3898 mova m10, [o(pixel_10bpc_max)] 3899%endif 3900 lea r3, [strideq*3] 3901 jmp .loop_pass2_entry 3902.loop_pass2: 3903 mova m0, [cq+0*32+ 0] 3904 mova m1, [cq+1*32+ 0] 3905 mova m2, [cq+2*32+ 0] 3906 mova m3, [cq+3*32+ 0] 3907.loop_pass2_entry: 3908 mova m4, [cq+0*32+16] 3909 mova m5, [cq+1*32+16] 3910 mova m6, [cq+2*32+16] 3911 mova m7, [cq+3*32+16] 3912%if ARCH_X86_64 3913 call m(idct_8x8_internal_16bpc).round1_and_write_8x8 3914%else 3915 mova [rsp+gprsize], m7 3916 mova m7, [o(pw_4096)] 3917 call m(idct_8x8_internal_16bpc).round4_and_write_8x8 3918%endif 3919%if ARCH_X86_64 3920%define mzero m9 3921%else 3922%define mzero m7 3923 pxor m7, m7 3924%endif 3925 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 3926 add dstq, 16 3927 add cq, 4*32 3928 dec r4d 3929 jg .loop_pass2 3930 RET 3931 3932%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix 3933%if ARCH_X86_64 3934 INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16 3935%else 3936 INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16 3937%endif 3938%ifidn %1_%2, dct_dct 3939 imul r5d, [cq], 181 3940 mov [cq], eobd ; 0 3941 mov r3d, 16 3942 add r5d, 640 3943 sar r5d, 10 3944 add rsp, (5+ARCH_X86_64*3+WIN64)*16 3945 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 3946%endif 3947%endmacro 3948 3949INV_TXFM_16X16_FN dct, dct 3950INV_TXFM_16X16_FN dct, identity, v 3951INV_TXFM_16X16_FN dct, adst 3952INV_TXFM_16X16_FN dct, flipadst 3953 3954cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 3955%if ARCH_X86_64 3956 DECLARE_REG_TMP 6, 7 3957%if WIN64 3958 mov [rsp+16*16+gprsize], r7 3959%endif 3960%elif ARCH_X86_32 3961 DECLARE_REG_TMP 1, 6 3962 mov [rsp+16*16+gprsize*1], r1 3963 mov [rsp+16*16+gprsize*2], r6 3964%endif 3965 lea t0, [o(.main)] 3966.pass1_full: 3967%undef cmp 3968 mov t1d, 4 3969.zero_loop: 3970 dec t1d 3971 cmp eobb, byte [r5+t1] 3972 jb .zero_loop 3973 mov r5d, t1d 3974 shl r5d, 4 3975%if ARCH_X86_32 3976 ; restore pic-ptr 3977 mov r6, [rsp+16*16+2*gprsize] 3978%endif 3979 ; setup stack pointer 3980 lea r3, [rsp+gprsize] 3981.loop_pass1: 3982 call t0 3983%if ARCH_X86_64 3984 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 3985 mova [cq+4*64+r5], m8 3986 mova [cq+5*64+r5], m9 3987 mova [cq+6*64+r5], m10 3988 mova [cq+7*64+r5], m11 3989%else 3990 call m(idct_8x4_internal_16bpc).transpose4x8packed 3991 mova [cq+4*64+r5], m0 3992 mova [cq+5*64+r5], m1 3993 mova [cq+6*64+r5], m2 3994 mova [cq+7*64+r5], m3 3995 mova m0, [rsp+gprsize+ 8*16] 3996 mova m2, [rsp+gprsize+ 9*16] 3997 mova m4, [rsp+gprsize+10*16] 3998 mova m6, [rsp+gprsize+11*16] 3999%endif 4000 call m(idct_8x4_internal_16bpc).transpose4x8packed 4001 mova [cq+0*64+r5], m0 4002 mova [cq+1*64+r5], m1 4003 mova [cq+2*64+r5], m2 4004 mova [cq+3*64+r5], m3 4005 pxor m0, m0 4006 REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15 4007 sub r5d, 16 4008 jge .loop_pass1 4009 4010%if ARCH_X86_32 4011 ; restore pic-ptr 4012 mov r1, 
[rsp+16*16+1*gprsize] 4013%endif 4014 jmp tx2q 4015.main: 4016%if ARCH_X86_64 4017 mova m11, [o(pd_2048)] 4018 mova m12, [o(clip_18b_min)] 4019 mova m13, [o(clip_18b_max)] 4020 mova m14, [o(pd_2896)] 4021%endif 4022 4023 mova m0, [cq+ 1*64+r5] 4024 mova m1, [cq+ 3*64+r5] 4025 mova m2, [cq+ 5*64+r5] 4026 mova m3, [cq+ 7*64+r5] 4027 mova m4, [cq+ 9*64+r5] 4028 mova m5, [cq+11*64+r5] 4029 mova m6, [cq+13*64+r5] 4030 mova m7, [cq+15*64+r5] 4031 call m(idct_16x4_internal_16bpc).main_oddhalf 4032 4033 mova m0, [cq+ 0*64+r5] 4034 mova m1, [cq+ 2*64+r5] 4035 mova m2, [cq+ 4*64+r5] 4036 mova m3, [cq+ 6*64+r5] 4037 mova m4, [cq+ 8*64+r5] 4038 mova m5, [cq+10*64+r5] 4039 mova m6, [cq+12*64+r5] 4040 mova m7, [cq+14*64+r5] 4041 call m(idct_8x4_internal_16bpc).main_pass1 4042 call m(idct_8x4_internal_16bpc).round 4043 call .round 4044%if ARCH_X86_64 4045 packssdw m0, m1 4046 packssdw m2, m3 4047 packssdw m4, m5 4048 packssdw m6, m7 4049 packssdw m8, m9 4050 packssdw m10, m11 4051 packssdw m12, m13 4052 packssdw m14, m15 4053%endif 4054 ret 4055.round: 4056%if ARCH_X86_64 4057 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 4058 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 4059 psrld m8, m11, 10 ; 2 4060 REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 4061 mova m8, [r3+1*16] 4062 mova m9, [r3+2*16] 4063 mova m10, [r3+3*16] 4064 mova m11, [r3+4*16] 4065 mova m12, [r3+5*16] 4066 mova m13, [r3+6*16] 4067 mova m14, [r3+7*16] 4068 psubd m15, m0, m14 ; out15 4069 paddd m0, m14 ; out0 4070 psubd m14, m1, m13 ; out14 4071 paddd m1, m13 ; out1 4072 psubd m13, m2, m12 ; out13 4073 paddd m2, m12 ; out2 4074 psubd m12, m3, m11 ; out12 4075 paddd m3, m11 ; out3 4076 psubd m11, m4, m10 ; out11 4077 paddd m4, m10 ; out4 4078 psubd m10, m5, m9 ; out10 4079 paddd m5, m9 ; out5 4080 psubd m9, m6, m8 ; out9 4081 paddd m6, m8 ; out6 4082 psubd m8, m7, [r3+0*16] ; out8 4083 paddd m7, [r3+0*16] ; out7 4084 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ 4085 m8, m9, m10, m11, m12, m13, m14, m15 4086 ; and out0-15 is now in m0-15 4087%else 4088 mova [r3+ 0*16], m0 4089 mova m0, [o(clip_18b_min)] 4090 REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 4091 pmaxsd m0, [r3+ 0*16] 4092 mova [r3+ 0*16], m7 4093 mova m7, [o(clip_18b_max)] 4094 REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 4095 pminsd m7, [r3+ 0*16] 4096 mova [r3+ 0*16], m0 4097 mova m0, [o(pd_2)] 4098 REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7 4099 paddd m0, [r3+ 0*16] 4100 mova [r3+ 0*16], m0 4101 mova [r3+ 1*16], m1 4102 mova [r3+ 2*16], m2 4103 mova m1, [r3+11*16] 4104 mova m2, [r3+10*16] 4105 psubd m0, m7, m1 4106 paddd m7, m1 4107 psubd m1, m6, m2 4108 paddd m6, m2 4109 REPX {psrad x, 2}, m0, m1, m6, m7 4110 packssdw m0, m1 ; out8-9 4111 packssdw m6, m7 ; out6-7 4112 mova [r3+11*16], m6 4113 mova m1, [r3+9*16] 4114 mova m7, [r3+8*16] 4115 psubd m2, m5, m1 4116 paddd m5, m1 4117 psubd m1, m4, m7 4118 paddd m4, m7 4119 REPX {psrad x, 2}, m2, m1, m4, m5 4120 packssdw m2, m1 ; out10-11 4121 packssdw m4, m5 ; out4-5 4122 mova m1, [r3+2*16] 4123 mova [r3+10*16], m4 4124 mova m6, [r3+7*16] 4125 mova m7, [r3+6*16] 4126 psubd m4, m3, m6 4127 paddd m3, m6 4128 psubd m6, m1, m7 4129 paddd m1, m7 4130 REPX {psrad x, 2}, m4, m6, m1, m3 4131 packssdw m4, m6 ; out12-13 4132 packssdw m1, m3 ; out2-3 4133 mova m3, [r3+1*16] 4134 mova [r3+9*16], m1 4135 mova m1, [r3+0*16] 4136 mova m5, [r3+5*16] 4137 mova m7, [r3+4*16] 4138 psubd m6, m3, m5 4139 paddd m3, m5 4140 psubd m5, m1, m7 4141 paddd m1, m7 4142 REPX {psrad x, 2}, m6, m5, m1, m3 4143 packssdw m6, 
m5 ; out14-15 4144 packssdw m1, m3 ; out0-1 4145 mova [r3+8*16], m1 4146%endif 4147 ret 4148 4149.pass2: 4150%if ARCH_X86_64 4151 mova m8, [o(pw_2048)] 4152 pxor m9, m9 4153 mova m10, [o(pixel_10bpc_max)] 4154 mov r7, dstq 4155%else 4156 mov [rsp+2*gprsize+16*16], dstq 4157%endif 4158 lea r3, [strideq*3] 4159 mov r4d, 2 4160.loop_pass2: 4161%if ARCH_X86_32 4162 lea r5, [o(itx8_start)] 4163%endif 4164 mova m0, [cq+0*64+ 0] 4165 mova m1, [cq+2*64+ 0] 4166 mova m2, [cq+0*64+16] 4167 mova m3, [cq+2*64+16] 4168 mova m4, [cq+0*64+32] 4169 mova m5, [cq+2*64+32] 4170 mova m6, [cq+0*64+48] 4171 mova m7, [cq+2*64+48] 4172 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main 4173 mova [rsp+gprsize+3*16], m0 4174 mova [rsp+gprsize+4*16], m1 4175 mova [rsp+gprsize+5*16], m2 4176 mova [rsp+gprsize+6*16], m3 4177 mova [rsp+gprsize+7*16], m4 4178 mova [rsp+gprsize+8*16], m5 4179 mova [rsp+gprsize+9*16], m6 4180 ; m7 is already stored in [rsp+gprsize+0*16] 4181 mova m0, [cq+1*64+ 0] 4182 mova m1, [cq+3*64+ 0] 4183 mova m2, [cq+1*64+16] 4184 mova m3, [cq+3*64+16] 4185 mova m4, [cq+1*64+32] 4186 mova m5, [cq+3*64+32] 4187 mova m6, [cq+1*64+48] 4188 mova m7, [cq+3*64+48] 4189 call m_suffix(idct_16x8_internal_8bpc, _ssse3).main 4190 4191 ; out0-7 is in rsp+gprsize+3-10*mmsize 4192 ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize 4193 4194%if ARCH_X86_64 4195 lea dstq, [r7+strideq*8] 4196%else 4197 mov dstq, [rsp+2*gprsize+16*16] 4198 lea dstq, [dstq+strideq*8] 4199%endif 4200 call m(idct_8x8_internal_16bpc).round2_and_write_8x8 4201%if ARCH_X86_64 4202 mov dstq, r7 4203%else 4204 mov dstq, [rsp+2*gprsize+16*16] 4205%endif 4206 mova m0, [rsp+gprsize+ 3*16] 4207 mova m1, [rsp+gprsize+ 4*16] 4208 mova m2, [rsp+gprsize+ 5*16] 4209 mova m3, [rsp+gprsize+ 6*16] 4210 mova m4, [rsp+gprsize+ 7*16] 4211 mova m5, [rsp+gprsize+ 8*16] 4212 mova m6, [rsp+gprsize+ 9*16] 4213 mova m7, [rsp+gprsize+10*16] 4214 call m(idct_8x8_internal_16bpc).round1_and_write_8x8 4215%if ARCH_X86_64 4216 add r7, 16 4217%define mzero m9 4218%else 4219 add dword [rsp+2*gprsize+16*16], 16 4220%define mzero m7 4221 pxor m7, m7 4222%endif 4223 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 4224 add cq, 64*4 4225 REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 4226%undef mzero 4227 dec r4d 4228 jg .loop_pass2 4229%if WIN64 4230 mov r7, [rsp+16*16+gprsize] 4231%endif 4232 RET 4233 4234INV_TXFM_16X16_FN adst, dct 4235INV_TXFM_16X16_FN adst, adst 4236INV_TXFM_16X16_FN adst, flipadst 4237 4238cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 4239%if WIN64 4240 mov [rsp+16*16+gprsize], r7 4241%elif ARCH_X86_32 4242 mov [rsp+16*16+gprsize*1], r1 4243 mov [rsp+16*16+gprsize*2], r6 4244%endif 4245 lea t0, [o(.main)] 4246 jmp m(idct_16x16_internal_16bpc).pass1_full 4247 4248.main: 4249%if ARCH_X86_64 4250 mova m11, [o(pd_2048)] 4251 mova m12, [o(clip_18b_min)] 4252 mova m13, [o(clip_18b_max)] 4253 mova m14, [o(pd_2896)] 4254%endif 4255 mova m0, [cq+ 2*64+r5] 4256 mova m1, [cq+13*64+r5] 4257 mova m2, [cq+ 6*64+r5] 4258 mova m3, [cq+ 9*64+r5] 4259 mova m4, [cq+10*64+r5] 4260 mova m5, [cq+ 5*64+r5] 4261 mova m6, [cq+14*64+r5] 4262 mova m7, [cq+ 1*64+r5] 4263 call m(iadst_16x4_internal_16bpc).main_part1 4264 mova m0, [cq+ 0*64+r5] 4265 mova m1, [cq+15*64+r5] 4266 mova m2, [cq+ 4*64+r5] 4267 mova m3, [cq+11*64+r5] 4268 mova m4, [cq+ 8*64+r5] 4269 mova m5, [cq+ 7*64+r5] 4270 mova m6, [cq+12*64+r5] 4271 mova m7, [cq+ 3*64+r5] 4272 call m(iadst_16x4_internal_16bpc).main_part2 4273 call .round 4274%if 
ARCH_X86_64 4275 packssdw m0, m1 4276 packssdw m2, m3 4277 packssdw m4, m5 4278 packssdw m6, m7 4279 packssdw m8, m9 4280 packssdw m10, m11 4281 packssdw m12, m13 4282 packssdw m14, m15 4283%endif 4284 ret 4285.round: 4286%if ARCH_X86_64 4287 pcmpeqd m8, m8 ; -1 4288 mova m15, [o(pd_10240)] 4289 psrld m14, 10 ; +2 4290 psubd m13, m14, m8 ; +3 4291 REPX {pxor x, m8 }, m1, m3, m5, m7 4292 REPX {paddd x, m14}, m0, m2 4293 REPX {paddd x, m13}, m1, m3 4294 REPX {paddd x, m15}, m4, m5, m6, m7 4295 paddd m13, m15, m8 ; +10239 4296 paddd m8, m15, m9 4297 psubd m9, m13, m10 4298 paddd m10, m15, m11 4299 psubd m11, m13, m12 4300 paddd m12, m14, [r3+3*16] 4301 psubd m13, m14, [r3+2*16] 4302 psubd m15, m14, [r3+0*16] 4303 paddd m14, [r3+1*16] 4304 REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 4305 REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 4306%else 4307 mova [r3+8*16], m1 4308 mova [r3+9*16], m3 4309 mova m3, [o(pd_10240)] 4310 pcmpeqd m1, m1 4311 REPX {pxor x, m1}, m5, m7 4312 REPX {paddd x, m3}, m4, m5, m6, m7 4313 REPX {psrad x, 14}, m4, m5, m6, m7 4314 packssdw m4, m5 4315 packssdw m6, m7 4316 mova [r3+10*16], m4 4317 mova [r3+11*16], m6 4318 mova m4, [r3+4*16] 4319 mova m5, [r3+5*16] 4320 mova m6, [r3+6*16] 4321 mova m7, [r3+7*16] 4322 mova m3, [o(pd_2)] 4323 REPX {pxor x, m1}, m5, m7 4324 REPX {paddd x, m3}, m4, m6 4325 psubd m3, m1 4326 REPX {paddd x, m3}, m5, m7 4327 REPX {psrad x, 2 }, m4, m5, m6, m7 4328 packssdw m4, m5 4329 packssdw m6, m7 4330 mova m5, [r3+8*16] 4331 mova m7, [r3+9*16] 4332 mova [r3+8*16], m4 4333 mova [r3+9*16], m6 4334 mova m3, [o(pd_10240)] 4335 REPX {pxor x, m1}, m5, m7 4336 REPX {paddd x, m3}, m0, m5, m2, m7 4337 REPX {psrad x, 14}, m0, m5, m2, m7 4338 packssdw m0, m5 4339 packssdw m2, m7 4340 mova m4, [r3+0*16] 4341 mova m5, [r3+1*16] 4342 mova m6, [r3+2*16] 4343 mova m7, [r3+3*16] 4344 mova m3, [o(pd_2)] 4345 REPX {pxor x, m1}, m5, m7 4346 REPX {paddd x, m3}, m4, m6 4347 psubd m3, m1 4348 REPX {paddd x, m3}, m5, m7 4349 REPX {psrad x, 2 }, m4, m5, m6, m7 4350 packssdw m4, m5 4351 packssdw m6, m7 4352%endif 4353 ret 4354.pass2: 4355%if ARCH_X86_64 4356 mova m8, [o(pw_2048)] 4357 mova m11, [o(pw_m2048)] 4358 pxor m9, m9 4359 mova m10, [o(pixel_10bpc_max)] 4360 mov r7, dstq 4361%else 4362 mov [rsp+2*gprsize+16*16], dstq 4363%endif 4364 lea r3, [strideq*3] 4365 mov r4d, 2 4366.loop_pass2: 4367%if ARCH_X86_32 4368 lea r5, [o(itx8_start)] 4369%endif 4370 mova m0, [cq+0*64+32] 4371 mova m1, [cq+1*64+32] 4372 mova m2, [cq+2*64+16] 4373 mova m3, [cq+3*64+16] 4374 mova m4, [cq+0*64+ 0] 4375 mova m5, [cq+1*64+ 0] 4376 mova m6, [cq+2*64+48] 4377 mova m7, [cq+3*64+48] 4378 mova [rsp+gprsize+3*16], m0 4379 mova [rsp+gprsize+4*16], m1 4380 mova [rsp+gprsize+5*16], m2 4381 mova [rsp+gprsize+6*16], m3 4382 mova [rsp+gprsize+7*16], m4 4383 mova [rsp+gprsize+8*16], m5 4384 mova [rsp+gprsize+9*16], m6 4385 mova [rsp+gprsize+10*16], m7 4386 mova m0, [cq+2*64+ 0] 4387 mova m1, [cq+3*64+ 0] 4388 mova m2, [cq+0*64+16] 4389 mova m3, [cq+1*64+16] 4390 mova m4, [cq+2*64+32] 4391 mova m5, [cq+3*64+32] 4392 mova m6, [cq+0*64+48] 4393 mova m7, [cq+1*64+48] 4394 call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main 4395 call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end 4396 4397 ; out0-7 is in rsp+gprsize+3-10*mmsize 4398 ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize 4399 4400%if ARCH_X86_64 4401 lea dstq, [r7+strideq*8] 4402%else 4403 mov dstq, [rsp+2*gprsize+16*16] 4404 lea dstq, [dstq+strideq*8] 4405%endif 4406 call 
m(iadst_8x8_internal_16bpc).round2_and_write_8x8 4407%if ARCH_X86_64 4408 mov dstq, r7 4409%else 4410 mov dstq, [rsp+2*gprsize+16*16] 4411%endif 4412 mova m0, [rsp+gprsize+ 3*16] 4413 mova m1, [rsp+gprsize+ 4*16] 4414 mova m2, [rsp+gprsize+ 5*16] 4415 mova m3, [rsp+gprsize+ 6*16] 4416 mova m4, [rsp+gprsize+ 7*16] 4417 mova m5, [rsp+gprsize+ 8*16] 4418 mova m6, [rsp+gprsize+ 9*16] 4419 mova m7, [rsp+gprsize+10*16] 4420 call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 4421%if ARCH_X86_64 4422 add r7, 16 4423%define mzero m9 4424%else 4425 add dword [rsp+2*gprsize+16*16], 16 4426%define mzero m7 4427 pxor m7, m7 4428%endif 4429 REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 4430 add cq, 64*4 4431 REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 4432%undef mzero 4433 dec r4d 4434 jg .loop_pass2 4435%if WIN64 4436 mov r7, [rsp+16*16+gprsize] 4437%endif 4438 RET 4439 4440INV_TXFM_16X16_FN flipadst, dct 4441INV_TXFM_16X16_FN flipadst, adst 4442INV_TXFM_16X16_FN flipadst, flipadst 4443 4444cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 4445%if WIN64 4446 mov [rsp+16*16+gprsize], r7 4447%elif ARCH_X86_32 4448 mov [rsp+16*16+gprsize*1], r1 4449 mov [rsp+16*16+gprsize*2], r6 4450%endif 4451 lea t0, [o(.main)] 4452 jmp m(idct_16x16_internal_16bpc).pass1_full 4453 4454.main: 4455 call m(iadst_16x16_internal_16bpc).main 4456%if ARCH_X86_64 4457 mova m1, m0 4458 mova m3, m2 4459 mova m5, m4 4460 mova m7, m6 4461 pshufd m0, m14, q1032 4462 pshufd m2, m12, q1032 4463 pshufd m4, m10, q1032 4464 pshufd m6, m8, q1032 4465 pshufd m8, m7, q1032 4466 pshufd m10, m5, q1032 4467 pshufd m12, m3, q1032 4468 pshufd m14, m1, q1032 4469%else 4470 pshufd m1, m0, q1032 4471 pshufd m3, m2, q1032 4472 pshufd m5, m4, q1032 4473 pshufd m7, m6, q1032 4474 pshufd m0, [r3+11*16], q1032 4475 pshufd m2, [r3+10*16], q1032 4476 pshufd m4, [r3+9*16], q1032 4477 pshufd m6, [r3+8*16], q1032 4478 mova [r3+11*16], m1 4479 mova [r3+10*16], m3 4480 mova [r3+ 9*16], m5 4481 mova [r3+ 8*16], m7 4482%endif 4483 ret 4484 4485.pass2: 4486 lea r3, [strideq*3] 4487 lea r3, [r3*5] 4488 add dstq, r3 4489 neg strideq 4490 jmp m(iadst_16x16_internal_16bpc).pass2 4491 4492INV_TXFM_16X16_FN identity, dct, h 4493INV_TXFM_16X16_FN identity, identity 4494 4495cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 4496%if WIN64 4497 mov [rsp+16*16+gprsize], r7 4498%elif ARCH_X86_32 4499 mov [rsp+16*16+gprsize*1], r1 4500 mov [rsp+16*16+gprsize*2], r6 4501%endif 4502 lea t0, [o(.main)] 4503 jmp m(idct_16x16_internal_16bpc).pass1_full 4504 4505.main: 4506%if ARCH_X86_64 4507 mova m15, [o(pd_11586)] 4508 pmulld m0, m15, [cq+ 0*64+r5] 4509 pmulld m1, m15, [cq+ 1*64+r5] 4510 pmulld m2, m15, [cq+ 2*64+r5] 4511 pmulld m3, m15, [cq+ 3*64+r5] 4512 pmulld m4, m15, [cq+ 4*64+r5] 4513 pmulld m5, m15, [cq+ 5*64+r5] 4514 pmulld m6, m15, [cq+ 6*64+r5] 4515 pmulld m7, m15, [cq+ 7*64+r5] 4516 pmulld m8, m15, [cq+ 8*64+r5] 4517 pmulld m9, m15, [cq+ 9*64+r5] 4518 pmulld m10, m15, [cq+10*64+r5] 4519 pmulld m11, m15, [cq+11*64+r5] 4520 pmulld m12, m15, [cq+12*64+r5] 4521 pmulld m13, m15, [cq+13*64+r5] 4522 pmulld m14, m15, [cq+14*64+r5] 4523 pmulld m15, [cq+15*64+r5] 4524 mova [r3], m15 4525 mova m15, [o(pd_10240)] 4526 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 4527 m8, m9, m10, m11, m12, m13, m14 4528 paddd m15, [r3] 4529 REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \ 4530 m8, m9, m10, m11, m12, m13, m14, m15 4531 packssdw m0, m1 4532 packssdw m2, m3 4533 packssdw m4, m5 4534 packssdw 
m6, m7 4535 packssdw m8, m9 4536 packssdw m10, m11 4537 packssdw m12, m13 4538 packssdw m14, m15 4539%else 4540 mova m7, [o(pd_11586)] 4541 pmulld m0, m7, [cq+ 0*64+r5] 4542 pmulld m1, m7, [cq+ 1*64+r5] 4543 pmulld m2, m7, [cq+ 2*64+r5] 4544 pmulld m3, m7, [cq+ 3*64+r5] 4545 pmulld m4, m7, [cq+ 4*64+r5] 4546 pmulld m5, m7, [cq+ 5*64+r5] 4547 pmulld m6, m7, [cq+ 6*64+r5] 4548 pmulld m7, [cq+ 7*64+r5] 4549 mova [r3], m7 4550 mova m7, [o(pd_10240)] 4551 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 4552 paddd m7, [r3] 4553 REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 4554 packssdw m0, m1 4555 packssdw m2, m3 4556 packssdw m4, m5 4557 packssdw m6, m7 4558 mova [r3+8*16], m0 4559 mova [r3+9*16], m2 4560 mova [r3+10*16], m4 4561 mova [r3+11*16], m6 4562 mova m7, [o(pd_11586)] 4563 pmulld m0, m7, [cq+ 8*64+r5] 4564 pmulld m1, m7, [cq+ 9*64+r5] 4565 pmulld m2, m7, [cq+10*64+r5] 4566 pmulld m3, m7, [cq+11*64+r5] 4567 pmulld m4, m7, [cq+12*64+r5] 4568 pmulld m5, m7, [cq+13*64+r5] 4569 pmulld m6, m7, [cq+14*64+r5] 4570 pmulld m7, [cq+15*64+r5] 4571 mova [r3], m7 4572 mova m7, [o(pd_10240)] 4573 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 4574 paddd m7, [r3] 4575 REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 4576 packssdw m0, m1 4577 packssdw m2, m3 4578 packssdw m4, m5 4579 packssdw m6, m7 4580%endif 4581 ret 4582 4583.pass2: 4584%if ARCH_X86_64 4585 mova m4, [o(pw_2048)] 4586 mova m5, [o(pixel_10bpc_max)] 4587 pxor m6, m6 4588 mova m7, [o(pw_1697x16)] 4589 mov r7, dstq 4590%else 4591 mov [rsp+2*gprsize+16*16], dstq 4592%endif 4593 mov r5d, 4 4594 lea r3, [strideq*3] 4595.pass2_loop: 4596 mova m0, [cq+0*64+0] 4597 mova m1, [cq+1*64+0] 4598 mova m2, [cq+2*64+0] 4599 mova m3, [cq+3*64+0] 4600 call m(iidentity_8x16_internal_16bpc).main 4601%if ARCH_X86_64 4602 call m(idct_8x4_internal_16bpc).round1_and_write_8x4 4603%else 4604 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 4605%endif 4606 REPX {mova [cq+x*16], m6}, 0, 4, 8, 12 4607 add cq, 16 4608 lea dstq, [dstq+strideq*4] 4609 dec r5w 4610 jg .pass2_loop 4611 add cq, 64*3 4612 btc r5d, 16 4613 jc .end 4614%if ARCH_X86_64 4615 lea dstq, [r7+16] 4616%else 4617 mov dstq, [rsp+2*gprsize+16*16] 4618 add dstq, 16 4619%endif 4620 add r5d, 4 4621 jmp .pass2_loop 4622.end: 4623%if WIN64 4624 mov r7, [rsp+16*16+gprsize] 4625%endif 4626 RET 4627 4628cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob 4629%if ARCH_X86_32 4630 LEA r6, $$ 4631%endif 4632 mova m5, [o(pw_5)] 4633 mova m7, [o(pixel_10bpc_max)] 4634 pxor m6, m6 4635 mov r5d, eobd 4636 add eobb, 21 4637 cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192 4638 lea r4, [strideq*3] 4639.loop: 4640 mova m0, [cq+128*0] 4641 packssdw m0, [cq+128*1] 4642 mova m1, [cq+128*2] 4643 packssdw m1, [cq+128*3] 4644 mova m2, [cq+128*4] 4645 packssdw m2, [cq+128*5] 4646 mova m3, [cq+128*6] 4647 packssdw m3, [cq+128*7] 4648 REPX {paddsw x, m5}, m0, m1, m2, m3 4649 REPX {psraw x, 3 }, m0, m1, m2, m3 4650 call .main_zero 4651 add cq, 16 4652 lea dstq, [dstq+strideq*4] 4653 btc eobd, 16 4654 jnc .loop 4655 sub eobd, 64 4656 jge .loop 4657 RET 4658ALIGN function_align 4659.main_zero: 4660 REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 4661.main: 4662 punpckhwd m4, m0, m1 4663 punpcklwd m0, m1 4664 punpckhwd m1, m2, m3 4665 punpcklwd m2, m3 4666 punpckhwd m3, m0, m4 4667 punpcklwd m0, m4 4668 punpckhwd m4, m2, m1 4669 punpcklwd m2, m1 4670 punpckhqdq m1, m0, m2 4671 punpcklqdq m0, m2 4672 punpcklqdq m2, m3, m4 4673 punpckhqdq m3, m4 4674 paddw m0, [dstq+strideq*0] 4675 paddw m1, 
[dstq+strideq*1] 4676 paddw m2, [dstq+strideq*2] 4677 paddw m3, [dstq+r4 ] 4678 REPX {pmaxsw x, m6}, m0, m1, m2, m3 4679 REPX {pminsw x, m7}, m0, m1, m2, m3 4680 mova [dstq+strideq*0], m0 4681 mova [dstq+strideq*1], m1 4682 mova [dstq+strideq*2], m2 4683 mova [dstq+r4 ], m3 4684 ret 4685 4686cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob 4687%if ARCH_X86_32 4688 LEA r6, $$ 4689%endif 4690 mova m5, [o(pw_4096)] 4691 mova m7, [o(pixel_10bpc_max)] 4692 pxor m6, m6 4693 mov r4d, eobd 4694 add eobb, 21 4695 cmovc eobd, r4d 4696 lea r4, [strideq*3] 4697 mov r5, dstq 4698.loop: 4699 mova m0, [cq+32*0] 4700 packssdw m0, [cq+32*1] 4701 mova m1, [cq+32*2] 4702 packssdw m1, [cq+32*3] 4703 mova m2, [cq+32*4] 4704 packssdw m2, [cq+32*5] 4705 mova m3, [cq+32*6] 4706 packssdw m3, [cq+32*7] 4707 REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 4708 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 4709 call m(inv_txfm_add_identity_identity_8x32_16bpc).main 4710 lea dstq, [dstq+strideq*4] 4711 add cq, 16 4712 btc eobd, 16 4713 jnc .loop 4714 add cq, 32*8-32 4715 add r5, 16 4716 mov dstq, r5 4717 sub eobd, 64 4718 jge .loop 4719 RET 4720 4721cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob 4722%if ARCH_X86_32 4723 LEA r6, $$ 4724%else 4725 mova m8, [o(pw_2896x8)] 4726 mova m9, [o(pw_1697x16)] 4727 mova m11, [o(pw_8192)] 4728%endif 4729 mova m7, [o(pixel_10bpc_max)] 4730 lea r4, [strideq*3] 4731 pxor m6, m6 4732%if ARCH_X86_64 4733 paddw m10, m11, m11 ; pw_16384 4734%endif 4735 mov r5, dstq 4736 call .main 4737 sub eobd, 36 4738 jl .ret 4739 add cq, 128*8-32 4740 lea dstq, [r5+16] 4741 call .main 4742 sub cq, 128*8 4743 lea dstq, [r5+strideq*8] 4744 mov r5, dstq 4745 call .main 4746 sub eobd, 107 ; eob < 143 4747 jl .ret 4748 add cq, 128*8-32 4749 lea dstq, [r5+16] 4750 call .main 4751 sub cq, 128*8 4752 lea dstq, [r5+strideq*8] 4753 mov r5, dstq 4754 call .main 4755 sub eobd, 128 ; eob < 271 4756 jl .ret 4757 add cq, 128*8-32 4758 lea dstq, [r5+16] 4759 call .main 4760 sub cq, 128*8 4761 lea dstq, [r5+strideq*8] 4762 mov r5, dstq 4763 call .main 4764 sub eobd, 128 ; eob < 399 4765 jl .ret 4766 add cq, 128*8-32 4767 lea dstq, [r5+16] 4768 call .main 4769.ret: 4770 RET 4771ALIGN function_align 4772.main: 4773 mova m0, [cq+128*0] 4774 packssdw m0, [cq+128*1] 4775 mova m1, [cq+128*2] 4776 packssdw m1, [cq+128*3] 4777 mova m2, [cq+128*4] 4778 packssdw m2, [cq+128*5] 4779 mova m3, [cq+128*6] 4780 packssdw m3, [cq+128*7] 4781%if ARCH_X86_64 4782 REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 4783 pmulhrsw m4, m9, m0 4784 pmulhrsw m5, m9, m1 4785 REPX {pmulhrsw x, m10}, m4, m5 4786%else 4787 mova m6, [o(pw_2896x8)] 4788 REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 4789 mova m5, [o(pw_1697x16)] 4790 pmulhrsw m4, m5, m0 4791 pmulhrsw m5, m1 4792 mova m6, [o(pw_16384)] 4793 REPX {pmulhrsw x, m6 }, m4, m5 4794%endif 4795 paddsw m0, m4 4796 paddsw m1, m5 4797%if ARCH_X86_64 4798 pmulhrsw m4, m9, m2 4799 pmulhrsw m5, m9, m3 4800 REPX {pmulhrsw x, m10}, m4, m5 4801%else 4802 mova m5, [o(pw_1697x16)] 4803 pmulhrsw m4, m5, m2 4804 pmulhrsw m5, m3 4805 REPX {pmulhrsw x, m6 }, m4, m5 4806%endif 4807 paddsw m2, m4 4808 paddsw m3, m5 4809%if ARCH_X86_64 4810 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 4811%else 4812 psrlw m6, 1 ; pw_8192 4813 REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 4814 pxor m6, m6 4815%endif 4816 call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero 4817 lea dstq, [dstq+strideq*4] 4818 add cq, 16 4819 btc eobd, 16 4820 jnc .main 4821 ret 4822 4823cglobal 
inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob 4824%if ARCH_X86_32 4825 LEA r6, $$ 4826%else 4827 mova m8, [o(pw_2896x8)] 4828 mova m9, [o(pw_1697x16)] 4829 mova m10, [o(pw_2048)] 4830%endif 4831 mova m7, [o(pixel_10bpc_max)] 4832 lea r4, [strideq*3] 4833 pxor m6, m6 4834 mov r5, dstq 4835 call .main 4836 sub eobd, 36 4837 jl .ret 4838 call .main 4839 add cq, 64*8-64 4840 lea dstq, [r5+16*1] 4841 call .main 4842 sub eobd, 107 ; eob < 143 4843 jl .ret 4844 call .main 4845 add cq, 64*8-64 4846 lea dstq, [r5+16*2] 4847 call .main 4848 sub eobd, 128 ; eob < 271 4849 jl .ret 4850 call .main 4851 add cq, 64*8-64 4852 lea dstq, [r5+16*3] 4853 call .main 4854 sub eobd, 128 ; eob < 399 4855 jl .ret 4856 call .main 4857.ret: 4858 RET 4859ALIGN function_align 4860.main: 4861 mova m0, [cq+64*0] 4862 packssdw m0, [cq+64*1] 4863 mova m1, [cq+64*2] 4864 packssdw m1, [cq+64*3] 4865 mova m2, [cq+64*4] 4866 packssdw m2, [cq+64*5] 4867 mova m3, [cq+64*6] 4868 packssdw m3, [cq+64*7] 4869%if ARCH_X86_64 4870 REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 4871%else 4872 mova m6, [o(pw_2896x8)] 4873 REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 4874%endif 4875 REPX {paddsw x, x }, m0, m1, m2, m3 4876%if ARCH_X86_64 4877 pmulhrsw m4, m9, m0 4878 pmulhrsw m5, m9, m1 4879%else 4880 mova m6, [o(pw_1697x16)] 4881 pmulhrsw m4, m6, m0 4882 pmulhrsw m5, m6, m1 4883%endif 4884 REPX {paddsw x, x }, m0, m1 4885 paddsw m0, m4 4886 paddsw m1, m5 4887%if ARCH_X86_64 4888 pmulhrsw m4, m9, m2 4889 pmulhrsw m5, m9, m3 4890%else 4891 pmulhrsw m4, m6, m2 4892 pmulhrsw m6, m3 4893%endif 4894 REPX {paddsw x, x }, m2, m3 4895 paddsw m2, m4 4896%if ARCH_X86_64 4897 paddsw m3, m5 4898 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 4899%else 4900 paddsw m3, m6 4901 mova m6, [o(pw_2048)] 4902 REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 4903 pxor m6, m6 4904%endif 4905 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 4906 call m(inv_txfm_add_identity_identity_8x32_16bpc).main 4907 lea dstq, [dstq+strideq*4] 4908 add cq, 16 4909 btc eobd, 16 4910 jnc .main 4911 ret 4912 4913cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob 4914%undef cmp 4915%if ARCH_X86_32 4916 LEA r6, $$ 4917%endif 4918 mova m5, [o(pw_8192)] 4919 mova m7, [o(pixel_10bpc_max)] 4920 pxor m6, m6 4921 lea r4, [strideq*3] 4922 mov r5, dstq 4923 call .main ; 0 4924 cmp eobd, 36 4925 jl .ret 4926 add cq, 128*8-32 ; 0 1 4927 lea dstq, [r5+16] ; 1 4928 call .main 4929 call .main2 4930 cmp eobd, 136 4931 jl .ret 4932 add cq, 128*16-64 ; 0 1 2 4933 lea dstq, [r5+16*2] ; 1 2 4934 call .main ; 2 4935 call .main2 4936 call .main2 4937 cmp eobd, 300 4938 jl .ret 4939 add cq, 128*24-96 ; 0 1 2 3 4940 add r5, 16*3 ; 1 2 3 4941 mov dstq, r5 ; 2 3 4942 call .main ; 3 4943 call .main2 4944 call .main2 4945 call .main2 4946 cmp eobd, 535 4947 jl .ret 4948 add cq, 128*24-96 ; 0 1 2 3 4949 lea dstq, [r5+strideq*8] ; 1 2 3 4 4950 mov r5, dstq ; 2 3 4 4951 call .main ; 3 4 4952 call .main2 4953 call .main2 4954 cmp eobd, 755 4955 jl .ret 4956 add cq, 128*16-64 ; 0 1 2 3 4957 lea dstq, [r5+strideq*8] ; 1 2 3 4 4958 mov r5, dstq ; 2 3 4 5 4959 call .main ; 3 4 5 4960 call .main2 4961 cmp eobd, 911 4962 jl .ret 4963 add cq, 128*8-32 ; 0 1 2 3 4964 lea dstq, [r5+strideq*8] ; 1 2 3 4 4965 call .main ; 2 3 4 5 4966.ret: ; 3 4 5 6 4967 RET 4968ALIGN function_align 4969.main2: 4970 sub cq, 128*8 4971 sub dstq, 16 4972.main: 4973 mova m0, [cq+128*0] 4974 packssdw m0, [cq+128*1] 4975 mova m1, [cq+128*2] 4976 packssdw m1, [cq+128*3] 4977 mova m2, [cq+128*4] 4978 packssdw m2, 
[cq+128*5] 4979 mova m3, [cq+128*6] 4980 packssdw m3, [cq+128*7] 4981 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 4982 call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero 4983 lea dstq, [dstq+strideq*4] 4984 add cq, 16 4985 btc eobd, 16 4986 jnc .main 4987 ret 4988 4989cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \ 4990 dst, stride, c, eob 4991%if ARCH_X86_32 4992 LEA r6, $$ 4993%define base $$ 4994 DECLARE_REG_TMP 0, 4 4995%else 4996 lea r6, [tbl_Nx32_odd_offset] 4997%define base tbl_Nx32_odd_offset 4998 DECLARE_REG_TMP 4, 7 4999%if WIN64 5000 mov [rsp+gprsize*1+35*16], r7 5001%endif 5002%endif 5003%define o2(x) r6-base+x 5004 test eobd, eobd 5005 jz .dconly 5006 5007%if ARCH_X86_32 5008 mov [rsp+gprsize*1+35*16], r0 5009%endif 5010%undef cmp 5011 ; remove entirely-zero iterations 5012 mov r5d, 7*2 5013 cmp eobw, word [o2(tbl_8x32_2d)+r5] 5014 jge .end_zero_loop 5015 pxor m0, m0 5016.zero_loop: 5017 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 5018 movzx t1d, t0b 5019 shr t0d, 8 5020 mova [rsp+ 3*16+r5*8], m0 5021 mova [rsp+11*16+r5*8], m0 5022 mova [rsp+ 3*16+t0*8], m0 5023 mova [rsp+ 3*16+t1*8], m0 5024 sub r5d, 2 5025 cmp eobw, word [o2(tbl_8x32_2d)+r5] 5026 jl .zero_loop 5027.end_zero_loop: 5028 ; actual first pass after skipping all-zero data 5029 mov [rsp+gprsize*0+35*16], eobd 5030 mov r3, rsp 5031.loop_pass1: 5032%if ARCH_X86_64 5033 mova m11, [o(pd_2048)] 5034 mova m12, [o(clip_18b_min)] 5035 mova m13, [o(clip_18b_max)] 5036 mova m14, [o(pd_2896)] 5037%endif 5038 mova m0, [cq+0*128+r5*8] 5039 mova m1, [cq+1*128+r5*8] 5040 mova m2, [cq+2*128+r5*8] 5041 mova m3, [cq+3*128+r5*8] 5042 mova m4, [cq+4*128+r5*8] 5043 mova m5, [cq+5*128+r5*8] 5044 mova m6, [cq+6*128+r5*8] 5045 mova m7, [cq+7*128+r5*8] 5046 call m(idct_8x4_internal_16bpc).main_pass1 5047 mova m1, [o(pd_2)] 5048 REPX {paddd x, m1}, m0, m6, m5, m3 5049 call m(idct_8x4_internal_16bpc).round 5050 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 5051 packssdw m0, m1 5052 packssdw m2, m3 5053 packssdw m4, m5 5054 packssdw m6, m7 5055 call m(idct_8x4_internal_16bpc).transpose4x8packed 5056 5057 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 5058 movzx t1d, t0b 5059 shr t0d, 8 5060 mova [r3+ 3*16+r5*8], m0 5061 mova [r3+11*16+r5*8], m2 5062 mova [r3+ 3*16+t1*8], m1 5063 mova [r3+ 3*16+t0*8], m3 5064 pxor m7, m7 5065 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7 5066 sub r5d, 2 5067 jge .loop_pass1 5068 5069 ; pass 2 code starts here 5070 ; m0 is already loaded from last iteration of first pass 5071%if ARCH_X86_32 5072 mov r0, [rsp+gprsize*1+35*16] 5073%endif 5074 mov eobd, [rsp+gprsize*0+35*16] 5075 cmp eobd, 43 5076 jl .load_veryfast 5077 cmp eobd, 107 5078 jl .load_fast 5079 ; load normal 5080 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] 5081 jmp .run 5082.load_fast: 5083 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 5084 jmp .run 5085.load_veryfast: 5086 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 5087 ; fall-through 5088.run: 5089 call .pass2 5090%if WIN64 5091 mov r7, [rsp+gprsize*1+35*16] 5092%endif 5093 RET 5094 5095.pass2: 5096%if ARCH_X86_32 5097 lea r5, [o(itx8_start)] 5098%endif 5099 mova m1, [rsp+gprsize+16* 4] 5100 mova m2, [rsp+gprsize+16* 5] 5101 mova m3, [rsp+gprsize+16* 6] 5102 mova m4, [rsp+gprsize+16* 7] 5103 mova m5, [rsp+gprsize+16* 8] 5104 mova m6, [rsp+gprsize+16* 9] 5105 mova m7, [rsp+gprsize+16*10] 5106 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main 5107 mova [rsp+gprsize+ 3*16], m0 5108 mova [rsp+gprsize+ 4*16], m1 
    mova   [rsp+gprsize+ 5*16], m2
    mova   [rsp+gprsize+ 6*16], m3
    mova   [rsp+gprsize+ 7*16], m4
    mova   [rsp+gprsize+ 8*16], m5
    mova   [rsp+gprsize+ 9*16], m6
    mova   m0, [rsp+gprsize+11*16]
    mova   m1, [rsp+gprsize+12*16]
    mova   m2, [rsp+gprsize+13*16]
    mova   m3, [rsp+gprsize+14*16]
    mova   m4, [rsp+gprsize+15*16]
    mova   m5, [rsp+gprsize+16*16]
    mova   m6, [rsp+gprsize+17*16]
    mova   m7, [rsp+gprsize+18*16]
    call   m_suffix(idct_16x8_internal_8bpc, _ssse3).main
    mova   m7, [rsp+gprsize+ 0*16]
    mova   [rsp+gprsize+11*16], m0
    mova   [rsp+gprsize+12*16], m1
    mova   [rsp+gprsize+13*16], m2
    mova   [rsp+gprsize+14*16], m3
    mova   [rsp+gprsize+15*16], m4
    mova   [rsp+gprsize+16*16], m5
    mova   [rsp+gprsize+17*16], m6
    mova   [rsp+gprsize+18*16], m7
    call   r4
%if ARCH_X86_64
    mova   m8, [o(pw_2048)]
    pxor   m9, m9
    mova   m10, [o(pixel_10bpc_max)]
%endif
    lea    r3, [strideq*3]
    call   m(idct_8x8_internal_16bpc).round1_and_write_8x8
    lea    dstq, [dstq+strideq*8]
    mova   m0, [rsp+gprsize+11*16]
    mova   m1, [rsp+gprsize+12*16]
    mova   m2, [rsp+gprsize+13*16]
    mova   m3, [rsp+gprsize+14*16]
    mova   m4, [rsp+gprsize+15*16]
    mova   m5, [rsp+gprsize+16*16]
    mova   m6, [rsp+gprsize+17*16]
    mova   m7, [rsp+gprsize+18*16]
    call   m(idct_8x8_internal_16bpc).round1_and_write_8x8
    lea    dstq, [dstq+strideq*8]
    mova   m0, [rsp+gprsize+19*16]
    mova   m1, [rsp+gprsize+20*16]
    mova   m2, [rsp+gprsize+21*16]
    mova   m3, [rsp+gprsize+22*16]
    mova   m4, [rsp+gprsize+23*16]
    mova   m5, [rsp+gprsize+24*16]
    mova   m6, [rsp+gprsize+25*16]
    mova   m7, [rsp+gprsize+26*16]
    call   m(idct_8x8_internal_16bpc).round1_and_write_8x8
    lea    dstq, [dstq+strideq*8]
    mova   m0, [rsp+gprsize+27*16]
    mova   m1, [rsp+gprsize+28*16]
    mova   m2, [rsp+gprsize+29*16]
    mova   m3, [rsp+gprsize+30*16]
    mova   m4, [rsp+gprsize+31*16]
    mova   m5, [rsp+gprsize+32*16]
    mova   m6, [rsp+gprsize+33*16]
    mova   m7, [rsp+gprsize+34*16]
    call   m(idct_8x8_internal_16bpc).round1_and_write_8x8
    ret
.dconly:
    imul   r5d, [cq], 181
    mov    [cq], eobd ; 0
    mov    r3d, 8
    add    r5d, 640
    sar    r5d, 10
    add    rsp, (31+2*ARCH_X86_64)*16
    jmp    m(inv_txfm_add_dct_dct_8x8_16bpc).end2

cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
                                          dst, stride, c, eob
    LEA    r6, base
    test   eobd, eobd
    jz     .dconly

%if ARCH_X86_32
    mov    [rsp+gprsize*1+76*16], r0
%elif WIN64
    mov    [rsp+gprsize*1+76*16], r7
%endif
%undef cmp
    ; remove entirely-zero iterations
    mov    r5d, 7*2
    cmp    eobw, word [o2(tbl_16x32_2d)+r5]
    jge    .end_zero_loop
    pxor   m0, m0
.zero_loop:
    movzx  t0d, word [o2(tbl_Nx32_odd_offset)+r5]
    movzx  t1d, t0b
    shr    t0d, 8
    mova   [rsp+12*16+r5*8], m0
    mova   [rsp+20*16+r5*8], m0
    mova   [rsp+12*16+t0*8], m0
    mova   [rsp+12*16+t1*8], m0
    mova   [rsp+44*16+r5*8], m0
    mova   [rsp+52*16+r5*8], m0
    mova   [rsp+44*16+t0*8], m0
    mova   [rsp+44*16+t1*8], m0
    sub    r5d, 2
    cmp    eobw, word [o2(tbl_16x32_2d)+r5]
    jl     .zero_loop
.end_zero_loop:
    ; actual first pass after skipping all-zero data
    mov    [rsp+gprsize*0+76*16], eobd
    mov    r3, rsp
.loop_pass1:
%if ARCH_X86_64
    mova   m11, [o(pd_2048)]
    mova   m12, [o(clip_18b_min)]
    mova   m13, [o(clip_18b_max)]
    mova   m14, [o(pd_2896)]
%endif
    mova   m0, [cq+ 1*128+r5*8]
    mova   m1, [cq+ 3*128+r5*8]
    mova   m2, [cq+ 5*128+r5*8]
5226 mova m3, [cq+ 7*128+r5*8] 5227 mova m4, [cq+ 9*128+r5*8] 5228 mova m5, [cq+11*128+r5*8] 5229 mova m6, [cq+13*128+r5*8] 5230 mova m7, [cq+15*128+r5*8] 5231 call m(idct_8x4_internal_16bpc).rect2_mul 5232 call m(idct_16x4_internal_16bpc).main_oddhalf 5233 5234 mova m0, [cq+ 0*128+r5*8] 5235 mova m1, [cq+ 2*128+r5*8] 5236 mova m2, [cq+ 4*128+r5*8] 5237 mova m3, [cq+ 6*128+r5*8] 5238 mova m4, [cq+ 8*128+r5*8] 5239 mova m5, [cq+10*128+r5*8] 5240 mova m6, [cq+12*128+r5*8] 5241 mova m7, [cq+14*128+r5*8] 5242 call m(idct_8x4_internal_16bpc).rect2_mul 5243 call m(idct_8x4_internal_16bpc).main_pass1 5244 call m(idct_8x4_internal_16bpc).round 5245 call m(idct_16x4_internal_16bpc).round 5246%if ARCH_X86_64 5247 packssdw m0, m1 5248 packssdw m2, m3 5249 packssdw m4, m5 5250 packssdw m6, m7 5251 packssdw m8, m9 5252 packssdw m10, m11 5253 packssdw m12, m13 5254 packssdw m14, m15 5255%endif 5256 call m(idct_8x4_internal_16bpc).transpose4x8packed 5257 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 5258 movzx t1d, t0b 5259 shr t0d, 8 5260%if ARCH_X86_64 5261 mova [rsp+12*16+r5*8], m0 5262 mova [rsp+20*16+r5*8], m2 5263 mova [rsp+12*16+t1*8], m1 5264 mova [rsp+12*16+t0*8], m3 5265 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 5266 mova [rsp+44*16+r5*8], m8 5267 mova [rsp+52*16+r5*8], m10 5268 mova [rsp+44*16+t1*8], m9 5269 mova [rsp+44*16+t0*8], m11 5270%else 5271 mova [rsp+44*16+r5*8], m0 5272 mova [rsp+52*16+r5*8], m2 5273 mova [rsp+44*16+t1*8], m1 5274 mova [rsp+44*16+t0*8], m3 5275 mova m0, [r3+ 8*16] 5276 mova m2, [r3+ 9*16] 5277 mova m4, [r3+10*16] 5278 mova m6, [r3+11*16] 5279 call m(idct_8x4_internal_16bpc).transpose4x8packed 5280 mova [rsp+12*16+r5*8], m0 5281 mova [rsp+20*16+r5*8], m2 5282 mova [rsp+12*16+t1*8], m1 5283 mova [rsp+12*16+t0*8], m3 5284%endif 5285 pxor m7, m7 5286 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 5287 sub r5d, 2 5288 jge .loop_pass1 5289 5290 ; pass=2 5291 add rsp, 9*16 5292%if ARCH_X86_64 5293 mov r6, dstq 5294%else 5295 mov dstq, [rsp+gprsize*1+67*16] 5296%endif 5297 mov eobd, [rsp+gprsize*0+67*16] 5298 cmp eobd, 44 5299 jl .load_veryfast 5300 cmp eobd, 151 5301 jl .load_fast 5302 ; load normal 5303 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] 5304 jmp .run 5305.load_fast: 5306 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 5307 jmp .run 5308.load_veryfast: 5309 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 5310 ; fall-through 5311.run: 5312%if ARCH_X86_64 5313 lea r2, [dstq+32] 5314 mov r7, -4 5315%else 5316 lea r2, [rsp+67*16] 5317 mov dword [r2+0*gprsize], 2 5318%endif 5319 jmp .loop_pass2_entry 5320.loop_pass2: 5321 mova m0, [rsp+16* 3] 5322.loop_pass2_entry: 5323%if ARCH_X86_32 5324 mov dstq, [r2+1*gprsize] 5325%endif 5326 call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2 5327 add rsp, 32*16 5328%if ARCH_X86_64 5329 add r7, 2 5330 lea dstq, [r2+r7*8] 5331 jl .loop_pass2 5332%if WIN64 5333 mov r7, [rsp+gprsize*1+3*16] 5334%endif 5335%else 5336 add dword [r2+1*gprsize], 16 5337 dec dword [r2+0*gprsize] 5338 jg .loop_pass2 5339%endif 5340%assign stack_size (stack_size-73*16) 5341%if STACK_ALIGNMENT >= 16 5342%assign stack_size_padded (stack_size_padded-73*16) 5343%assign stack_offset (stack_offset-73*16) 5344%else 5345%xdefine rstkm [rsp + stack_size] 5346%endif 5347 RET 5348.dconly: 5349 imul r5d, [cq], 181 5350 mov [cq], eobd ; 0 5351 mov r3d, 32 5352 add r5d, 128 5353 sar r5d, 8 5354 imul r5d, 181 5355 add rsp, (65+4*ARCH_X86_64)*16 5356 jmp 
m(inv_txfm_add_dct_dct_16x4_16bpc).dconly 5357 5358cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ 5359 dst, stride, c, eob 5360%if ARCH_X86_32 5361 LEA r6, $$ 5362%endif 5363 test eobd, eobd 5364 jz .dconly 5365 5366 ; remove entirely-zero iterations 5367%undef cmp 5368%if ARCH_X86_64 5369 xor r5d, r5d 5370 cmp eobd, 10 5371 setge r5b 5372%else 5373 mov r5d, 1 5374 cmp eobd, 10 5375 sbb r5d, 0 5376%endif 5377 add r5d, r5d 5378 5379 ; actual first pass after skipping all-zero data 5380.loop_pass1: 5381 mova m0, [cq+32* 1+r5*8] 5382 mova m1, [cq+32* 7+r5*8] 5383 mova m2, [cq+32* 9+r5*8] 5384 mova m3, [cq+32*15+r5*8] 5385 mova m4, [cq+32*17+r5*8] 5386 mova m5, [cq+32*23+r5*8] 5387 mova m6, [cq+32*25+r5*8] 5388 mova m7, [cq+32*31+r5*8] 5389%if ARCH_X86_64 5390 mova m11, [o(pd_2048)] 5391 mova m12, [o(clip_18b_min)] 5392 mova m13, [o(clip_18b_max)] 5393 mova m14, [o(pd_2896)] 5394%endif 5395 mov r3, rsp 5396 call .main_oddhalf_part1 5397 mova m0, [cq+32* 3+r5*8] 5398 mova m1, [cq+32* 5+r5*8] 5399 mova m2, [cq+32*11+r5*8] 5400 mova m3, [cq+32*13+r5*8] 5401 mova m4, [cq+32*19+r5*8] 5402 mova m5, [cq+32*21+r5*8] 5403 mova m6, [cq+32*27+r5*8] 5404 mova m7, [cq+32*29+r5*8] 5405 call .main_oddhalf_part2 5406 mova m0, [cq+32* 2+r5*8] 5407 mova m1, [cq+32* 6+r5*8] 5408 mova m2, [cq+32*10+r5*8] 5409 mova m3, [cq+32*14+r5*8] 5410 mova m4, [cq+32*18+r5*8] 5411 mova m5, [cq+32*22+r5*8] 5412 mova m6, [cq+32*26+r5*8] 5413 mova m7, [cq+32*30+r5*8] 5414 add r3, 16*(16+4*ARCH_X86_32) 5415 call m(idct_16x4_internal_16bpc).main_oddhalf 5416 mova m0, [cq+32* 0+r5*8] 5417 mova m1, [cq+32* 4+r5*8] 5418 mova m2, [cq+32* 8+r5*8] 5419 mova m3, [cq+32*12+r5*8] 5420 mova m4, [cq+32*16+r5*8] 5421 mova m5, [cq+32*20+r5*8] 5422 mova m6, [cq+32*24+r5*8] 5423 mova m7, [cq+32*28+r5*8] 5424 call m(idct_8x4_internal_16bpc).main_pass1 5425 call m(idct_8x4_internal_16bpc).round 5426 sub r3, 16*(16+4*ARCH_X86_32) 5427 call .round_dct32 5428%if ARCH_X86_64 5429 call m(idct_8x4_internal_16bpc).transpose4x8packed 5430 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 5431 mova [cq+32* 8+r5*8], m8 5432 mova [cq+32* 9+r5*8], m9 5433 mova [cq+32*10+r5*8], m10 5434 mova [cq+32*11+r5*8], m11 5435 mova m8, [r3+16* 9] ; 8 9 5436 mova m10, [r3+16*11] ; 10 11 5437 mova m12, [r3+16*13] ; 12 13 5438 mova m14, [r3+16*15] ; 14 15 5439 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 5440 mova [cq+32* 4+r5*8], m8 5441 mova [cq+32* 5+r5*8], m9 5442 mova [cq+32* 6+r5*8], m10 5443 mova [cq+32* 7+r5*8], m11 5444 mova m8, [r3+16* 8] ; 24 25 5445 mova m10, [r3+16*10] ; 26 27 5446 mova m12, [r3+16*12] ; 28 29 5447 mova m14, [r3+16*14] ; 30 31 5448 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 5449 mova [cq+32*12+r5*8], m8 5450 mova [cq+32*13+r5*8], m9 5451 mova [cq+32*14+r5*8], m10 5452 mova [cq+32*15+r5*8], m11 5453%else 5454 sub r3, 8*16 5455 mova m0, [r3+ 8*16] 5456 mova m2, [r3+10*16] 5457 mova m4, [r3+12*16] 5458 mova m6, [r3+14*16] 5459 packssdw m0, [r3+ 9*16] 5460 packssdw m2, [r3+11*16] 5461 packssdw m4, [r3+13*16] 5462 packssdw m6, [r3+15*16] 5463 call m(idct_8x4_internal_16bpc).transpose4x8packed 5464 mova [cq+32* 4+r5*8], m0 5465 mova [cq+32* 5+r5*8], m1 5466 mova [cq+32* 6+r5*8], m2 5467 mova [cq+32* 7+r5*8], m3 5468 mova m0, [r3+16*16] 5469 mova m2, [r3+18*16] 5470 mova m4, [r3+20*16] 5471 mova m6, [r3+22*16] 5472 packssdw m0, [r3+17*16] 5473 packssdw m2, [r3+19*16] 5474 packssdw m4, [r3+21*16] 5475 packssdw m6, [r3+23*16] 5476 call m(idct_8x4_internal_16bpc).transpose4x8packed 5477 mova 
[cq+32* 8+r5*8], m0 5478 mova [cq+32* 9+r5*8], m1 5479 mova [cq+32*10+r5*8], m2 5480 mova [cq+32*11+r5*8], m3 5481 mova m0, [r3+31*16] 5482 mova m2, [r3+29*16] 5483 mova m4, [r3+27*16] 5484 mova m6, [r3+25*16] 5485 packssdw m0, [r3+30*16] 5486 packssdw m2, [r3+28*16] 5487 packssdw m4, [r3+26*16] 5488 packssdw m6, [r3+24*16] 5489 call m(idct_8x4_internal_16bpc).transpose4x8packed 5490 mova [cq+32*12+r5*8], m0 5491 mova [cq+32*13+r5*8], m1 5492 mova [cq+32*14+r5*8], m2 5493 mova [cq+32*15+r5*8], m3 5494 mova m0, [r3+ 0*16] 5495 mova m2, [r3+ 2*16] 5496 mova m4, [r3+ 4*16] 5497 mova m6, [r3+ 6*16] 5498 packssdw m0, [r3+ 1*16] 5499 packssdw m2, [r3+ 3*16] 5500 packssdw m4, [r3+ 5*16] 5501 packssdw m6, [r3+ 7*16] 5502 call m(idct_8x4_internal_16bpc).transpose4x8packed 5503%endif 5504 pxor m7, m7 5505 ; clear lower half of [cq] 5506 REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \ 5507 24, 25, 26, 27, 28, 29, 30, 31 5508 test r5d, r5d 5509 jz .end_pass1 5510 mova [cq+32* 0+r5*8], m0 5511 mova [cq+32* 1+r5*8], m1 5512 mova [cq+32* 2+r5*8], m2 5513 mova [cq+32* 3+r5*8], m3 5514 sub r5d, 2 5515 jmp .loop_pass1 5516.end_pass1: 5517 5518 ; pass=2, we need to call this otherwise the stack pointer has 5519 ; the wrong offset in the 8-bit code 5520 mov r4d, 4 5521 call m(idct_16x8_internal_16bpc).pass2_main 5522 RET 5523 5524.main_oddhalf_part1_fast: ; lower half zero 5525 pmulld m7, m0, [o(pd_4091)] 5526 pmulld m0, [o(pd_201)] 5527 pmulld m4, m3, [o(pd_m2751)] 5528%if ARCH_X86_32 5529 pmulld m3, [o(pd_3035)] 5530 mova m5, [o(pd_2048)] 5531 REPX {paddd x, m5}, m0, m7 5532 REPX {psrad x, 12}, m0, m7 5533 mova [r3+3*16], m7 5534 mova m7, m3 5535 mova m3, m5 5536%else 5537 pmulld m3, [o(pd_3035)] 5538%endif 5539 pmulld m6, m1, [o(pd_m1380)] 5540 pmulld m1, [o(pd_3857)] 5541 pmulld m5, m2, [o(pd_3703)] 5542 pmulld m2, [o(pd_1751)] 5543 jmp .main_oddhalf_part1_fast2 5544.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 5545%if ARCH_X86_64 5546 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a 5547 ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a 5548 ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a 5549 ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a 5550.main_oddhalf_part1_fast2: 5551 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 5552 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 5553 psubd m8, m0, m4 ; t17 5554 paddd m0, m4 ; t16 5555 psubd m4, m6, m2 ; t18 5556 paddd m6, m2 ; t19 5557 psubd m2, m1, m5 ; t29 5558 paddd m1, m5 ; t28 5559 psubd m5, m7, m3 ; t30 5560 paddd m7, m3 ; t31 5561 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 5562 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 5563 mova m15, [o(pd_4017)] 5564 mova m10, [o(pd_799)] 5565 ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a 5566 ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a 5567 psubd m3, m0, m6 ; t19a 5568 paddd m0, m6 ; t16a 5569 psubd m6, m7, m1 ; t28a 5570 paddd m7, m1 ; t31a 5571 psubd m1, m5, m4 ; t18 5572 paddd m5, m4 ; t17 5573 psubd m4, m8, m2 ; t29 5574 paddd m8, m2 ; t30 5575 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 5576 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 5577 mova m15, [o(pd_3784)] 5578 mova m10, [o(pd_1567)] 5579 ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a 5580 ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 5581 mova [r3+16*0], m0 5582 mova [r3+16*1], m5 5583 mova [r3+16*2], m4 5584 mova [r3+16*3], m6 5585 mova [r3+16*4], m3 5586 mova [r3+16*5], m1 5587 mova [r3+16*6], m8 5588 mova 
[r3+16*7], m7 5589%else 5590 mova [r3+0*16], m2 5591 mova [r3+1*16], m3 5592 mova [r3+2*16], m4 5593 mova [r3+3*16], m5 5594 mova m3, [o(pd_2048)] 5595 ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a 5596 ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a 5597 mova m4, [r3+2*16] 5598 mova m5, [r3+3*16] 5599 mova [r3+2*16], m6 5600 mova [r3+3*16], m7 5601 mova m2, [r3+0*16] 5602 mova m7, [r3+1*16] 5603 mova [r3+0*16], m0 5604 mova [r3+1*16], m1 5605 ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a 5606 ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a 5607 mova m0, [r3+0*16] 5608 mova m1, [r3+1*16] 5609 mova m6, [r3+2*16] 5610.main_oddhalf_part1_fast2: 5611 REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7 5612 REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7 5613 psubd m3, m0, m4 ; t17 5614 mova [r3+0*16], m3 5615 mova m3, [r3+3*16] 5616 paddd m0, m4 ; t16 5617 psubd m4, m6, m2 ; t18 5618 paddd m6, m2 ; t19 5619 psubd m2, m1, m5 ; t29 5620 paddd m1, m5 ; t28 5621 psubd m5, m3, m7 ; t30 5622 paddd m7, m3 ; t31 5623 mova m3, [o(clip_18b_min)] 5624 REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 5625 pmaxsd m3, [r3+0*16] 5626 mova [r3+0*16], m3 5627 mova m3, [o(clip_18b_max)] 5628 REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 5629 pminsd m3, [r3+0*16] 5630 mova [r3+0*16], m0 5631 mova [r3+1*16], m1 5632 mova [r3+2*16], m6 5633 mova [r3+3*16], m7 5634 mova m0, [o(pd_2048)] 5635 ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a 5636 ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a 5637 psubd m1, m5, m4 ; t18 5638 paddd m5, m4 ; t17 5639 psubd m4, m3, m2 ; t29 5640 paddd m3, m2 ; t30 5641 mova m0, [r3+0*16] 5642 mova m2, [r3+1*16] 5643 mova m6, [r3+2*16] 5644 mova m7, [r3+3*16] 5645 mova [r3+0*16], m3 5646 psubd m3, m0, m6 ; t19a 5647 paddd m0, m6 ; t16a 5648 psubd m6, m7, m2 ; t28a 5649 paddd m7, m2 ; t31a 5650 mova m2, [o(clip_18b_min)] 5651 REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 5652 pmaxsd m2, [r3+0*16] 5653 mova [r3+0*16], m2 5654 mova m2, [o(clip_18b_max)] 5655 REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 5656 pminsd m2, [r3+0*16] 5657 mova [r3+16*0], m0 5658 mova [r3+16*1], m5 5659 mova [r3+16*6], m2 5660 mova [r3+16*7], m7 5661 mova m7, [o(pd_2048)] 5662 ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a 5663 ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28 5664 mova [r3+16*2], m4 5665 mova [r3+16*3], m6 5666 mova [r3+16*4], m3 5667 mova [r3+16*5], m1 5668%endif 5669 ret 5670.main_oddhalf_part2_fast: ; lower half zero 5671 pmulld m7, m0, [o(pd_m601)] 5672 pmulld m0, [o(pd_4052)] 5673 pmulld m4, m3, [o(pd_3290)] 5674%if ARCH_X86_32 5675 pmulld m3, [o(pd_2440)] 5676 mova m5, [o(pd_2048)] 5677 REPX {paddd x, m5}, m0, m7 5678 REPX {psrad x, 12}, m0, m7 5679 mova [r3+11*16], m7 5680 mova m7, m3 5681 mova m3, m5 5682%else 5683 pmulld m3, [o(pd_2440)] 5684%endif 5685 pmulld m6, m1, [o(pd_3973)] 5686 pmulld m1, [o(pd_995)] 5687 pmulld m5, m2, [o(pd_m2106)] 5688 pmulld m2, [o(pd_3513)] 5689 jmp .main_oddhalf_part2_fast2 5690.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 5691%if ARCH_X86_64 5692 ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a 5693 ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a 5694 ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a 5695 ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a 5696.main_oddhalf_part2_fast2: 5697 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 5698 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 5699 psubd m8, m0, m4 ; t25 5700 paddd m0, m4 ; t24 5701 
psubd m4, m6, m2 ; t26 5702 paddd m6, m2 ; t27 5703 psubd m2, m1, m5 ; t21 5704 paddd m1, m5 ; t20 5705 psubd m5, m7, m3 ; t22 5706 paddd m7, m3 ; t23 5707 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 5708 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 5709 mova m15, [o(pd_2276)] 5710 mova m10, [o(pd_3406)] 5711 ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a 5712 ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a 5713 psubd m3, m0, m6 ; t27a 5714 paddd m0, m6 ; t24a 5715 psubd m6, m7, m1 ; t20a 5716 paddd m7, m1 ; t23a 5717 psubd m1, m5, m4 ; t21 5718 paddd m5, m4 ; t22 5719 psubd m4, m8, m2 ; t26 5720 paddd m8, m2 ; t25 5721 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 5722 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 5723 mova m15, [o(pd_3784)] 5724 mova m10, [o(pd_1567)] 5725 ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a 5726 ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 5727 mova m9, [r3+16*0] ; t16a 5728 mova m10, [r3+16*1] ; t17 5729 psubd m2, m9, m7 ; t23 5730 paddd m9, m7 ; t16 5731 psubd m7, m10, m5 ; t22a 5732 paddd m10, m5 ; t17a 5733 REPX {pmaxsd x, m12}, m9, m10, m2, m7 5734 REPX {pminsd x, m13}, m9, m10, m2, m7 5735 mova [r3+16*0], m9 5736 mova [r3+16*1], m10 5737 mova m9, [r3+16*2] ; t18a 5738 mova m10, [r3+16*3] ; t19 5739 psubd m5, m9, m1 ; t21 5740 paddd m9, m1 ; t18 5741 psubd m1, m10, m6 ; t20a 5742 paddd m10, m6 ; t19a 5743 REPX {pmaxsd x, m12}, m9, m10, m5, m1 5744 REPX {pminsd x, m13}, m9, m10, m5, m1 5745 mova [r3+16*2], m9 5746 mova [r3+16*3], m10 5747 mova m9, [r3+16*4] ; t28 5748 mova m10, [r3+16*5] ; t29a 5749 psubd m6, m9, m3 ; t27a 5750 paddd m9, m3 ; t28a 5751 psubd m3, m10, m4 ; t26 5752 paddd m10, m4 ; t29 5753 REPX {pmaxsd x, m12}, m9, m10, m6, m3 5754 REPX {pminsd x, m13}, m9, m10, m6, m3 5755 REPX {pmulld x, m14}, m6, m3, m1, m5 5756 paddd m6, m11 5757 paddd m3, m11 5758 psubd m4, m6, m1 ; t20 5759 paddd m6, m1 ; t27 5760 psubd m1, m3, m5 ; t21a 5761 paddd m3, m5 ; t26a 5762 REPX {psrad x, 12 }, m4, m1, m3, m6 5763 mova [r3+16*4], m4 5764 mova [r3+16*5], m1 5765 mova m4, [r3+16*6] ; t30 5766 mova m1, [r3+16*7] ; t31a 5767 psubd m5, m4, m8 ; t25a 5768 paddd m4, m8 ; t30a 5769 psubd m8, m1, m0 ; t24 5770 paddd m1, m0 ; t31 5771 REPX {pmaxsd x, m12}, m8, m5, m4, m1 5772 REPX {pminsd x, m13}, m8, m5, m4, m1 5773 REPX {pmulld x, m14}, m5, m8, m7, m2 5774 paddd m5, m11 5775 paddd m8, m11 5776 psubd m0, m5, m7 ; t22 5777 paddd m5, m7 ; t25 5778 psubd m7, m8, m2 ; t23a 5779 paddd m2, m8 ; t24a 5780 REPX {psrad x, 12 }, m0, m7, m2, m5 5781 mova [r3+16*6], m0 5782 mova [r3+16*7], m7 5783 mova [r3+16*8], m2 5784 mova [r3+16*9], m5 5785 mova [r3+16*10], m3 5786 mova [r3+16*11], m6 5787 mova [r3+16*12], m9 5788 mova [r3+16*13], m10 5789 mova [r3+16*14], m4 5790 mova [r3+16*15], m1 5791%else 5792 mova [r3+ 8*16], m2 5793 mova [r3+ 9*16], m3 5794 mova [r3+10*16], m4 5795 mova [r3+11*16], m5 5796 mova m3, [o(pd_2048)] 5797 ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a 5798 ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a 5799 mova m2, [r3+ 8*16] 5800 mova m4, [r3+10*16] 5801 mova m5, [r3+11*16] 5802 mova [r3+ 8*16], m0 5803 mova [r3+10*16], m6 5804 mova [r3+11*16], m7 5805 mova m7, [r3+ 9*16] 5806 mova [r3+ 9*16], m1 5807 ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a 5808 ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a 5809 mova m0, [r3+ 8*16] 5810 mova m1, [r3+ 9*16] 5811 mova m6, [r3+10*16] 5812.main_oddhalf_part2_fast2: 5813 REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6 5814 REPX 
{psrad x, 12}, m1, m2, m7, m4, m5, m6 5815 psubd m3, m0, m4 ; t25 5816 mova [r3+ 8*16], m3 5817 mova m3, [r3+11*16] 5818 paddd m0, m4 ; t24 5819 psubd m4, m6, m2 ; t26 5820 paddd m6, m2 ; t27 5821 psubd m2, m1, m5 ; t21 5822 paddd m1, m5 ; t20 5823 psubd m5, m3, m7 ; t22 5824 paddd m7, m3 ; t23 5825 mova m3, [o(clip_18b_min)] 5826 REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 5827 pmaxsd m3, [r3+ 8*16] 5828 mova [r3+ 8*16], m3 5829 mova m3, [o(clip_18b_max)] 5830 REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 5831 pminsd m3, [r3+ 8*16] 5832 mova [r3+ 8*16], m0 5833 mova [r3+ 9*16], m1 5834 mova [r3+10*16], m6 5835 mova [r3+11*16], m7 5836 mova m7, [o(pd_2048)] 5837 ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a 5838 ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a 5839 psubd m1, m5, m4 ; t21 5840 paddd m5, m4 ; t22 5841 psubd m4, m3, m2 ; t26 5842 paddd m3, m2 ; t25 5843 mova m0, [r3+ 8*16] 5844 mova m2, [r3+ 9*16] 5845 mova m6, [r3+10*16] 5846 mova m7, [r3+11*16] 5847 mova [r3+ 8*16], m3 5848 psubd m3, m0, m6 ; t27a 5849 paddd m0, m6 ; t24a 5850 psubd m6, m7, m2 ; t20a 5851 paddd m7, m2 ; t23a 5852 mova m2, [o(clip_18b_min)] 5853 REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 5854 pmaxsd m2, [r3+ 8*16] 5855 mova [r3+ 8*16], m2 5856 mova m2, [o(clip_18b_max)] 5857 REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 5858 pminsd m2, [r3+ 8*16] 5859 mova [r3+ 8*16], m0 5860 mova [r3+ 9*16], m2 5861 mova [r3+14*16], m5 5862 mova [r3+15*16], m7 5863 mova m0, [o(pd_2048)] 5864 ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a 5865 ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20 5866 mova [r3+10*16], m3 5867 mova m0, [o(clip_18b_min)] 5868 mova m2, [o(clip_18b_max)] 5869 mova m5, [r3+16*2] ; t18a 5870 mova m7, [r3+16*3] ; t19 5871 psubd m3, m5, m1 ; t21 5872 paddd m5, m1 ; t18 5873 psubd m1, m7, m6 ; t20a 5874 paddd m7, m6 ; t19a 5875 REPX {pmaxsd x, m0}, m5, m7, m3, m1 5876 REPX {pminsd x, m2}, m5, m7, m3, m1 5877 mova [r3+16*2], m5 5878 mova [r3+16*3], m7 5879 mova [r3+11*16], m3 5880 mova m3, [r3+10*16] 5881 mova m5, [r3+16*4] ; t28 5882 mova m7, [r3+16*5] ; t29a 5883 psubd m6, m5, m3 ; t27a 5884 paddd m5, m3 ; t28a 5885 psubd m3, m7, m4 ; t26 5886 paddd m7, m4 ; t29 5887 REPX {pmaxsd x, m0}, m5, m7, m6, m3 5888 REPX {pminsd x, m2}, m5, m7, m6, m3 5889 mova [r3+16*12], m5 5890 mova [r3+16*13], m7 5891 mova m5, [o(pd_2048)] 5892 mova m7, [o(pd_2896)] 5893 mova m4, [r3+11*16] 5894 REPX {pmulld x, m7}, m6, m3, m1, m4 5895 paddd m6, m5 5896 paddd m3, m5 5897 psubd m5, m6, m1 ; t20 5898 paddd m6, m1 ; t27 5899 psubd m1, m3, m4 ; t21a 5900 paddd m3, m4 ; t26a 5901 REPX {psrad x, 12}, m5, m1, m3, m6 5902 mova [r3+16*4], m5 5903 mova [r3+16*5], m1 5904 mova [r3+16*10], m3 5905 mova [r3+16*11], m6 5906 5907 mova m5, [r3+14*16] 5908 mova m6, [r3+15*16] 5909 mova m3, [r3+16*0] ; t16a 5910 mova m4, [r3+16*1] ; t17 5911 psubd m1, m3, m6 ; t23 5912 paddd m3, m6 ; t16 5913 psubd m6, m4, m5 ; t22a 5914 paddd m4, m5 ; t17a 5915 REPX {pmaxsd x, m0}, m3, m4, m1, m6 5916 REPX {pminsd x, m2}, m3, m4, m1, m6 5917 mova [r3+16*0], m3 5918 mova [r3+16*1], m4 5919 mova m5, [r3+ 8*16] 5920 mova m3, [r3+ 9*16] 5921 mova [r3+ 8*16], m1 5922 mova [r3+ 9*16], m6 5923 mova m4, [r3+16*6] ; t30 5924 mova m1, [r3+16*7] ; t31a 5925 psubd m6, m1, m5 ; t24 5926 paddd m1, m5 ; t31 5927 psubd m5, m4, m3 ; t25a 5928 paddd m4, m3 ; t30a 5929 REPX {pmaxsd x, m0}, m6, m5, m4, m1 5930 REPX {pminsd x, m2}, m6, m5, m4, m1 5931 mova [r3+16*14], m4 5932 mova [r3+16*15], m1 5933 mova m4, [o(pd_2048)] 5934 mova 
m1, [r3+ 9*16] 5935 mova m2, [r3+ 8*16] 5936 REPX {pmulld x, m7}, m5, m6, m1, m2 5937 paddd m5, m4 5938 paddd m6, m4 5939 psubd m0, m5, m1 ; t22 5940 paddd m5, m1 ; t25 5941 psubd m1, m6, m2 ; t23a 5942 paddd m2, m6 ; t24a 5943 REPX {psrad x, 12}, m0, m1, m2, m5 5944 mova [r3+16*6], m0 5945 mova [r3+16*7], m1 5946 mova [r3+16*8], m2 5947 mova [r3+16*9], m5 5948%endif 5949 ret 5950 5951 ; final sumsub for idct16 as well as idct32, plus final downshift 5952%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx 5953 mova m%4, [r3+16*(23-%1)] 5954 pmaxsd m%1, m12 5955 pminsd m%1, m13 5956 psubd m%3, m%1, m%4 ; idct16 out15 - n 5957 paddd m%1, m%4 ; idct16 out0 + n 5958 pmaxsd m%1, m12 5959 pmaxsd m%3, m12 5960 pminsd m%1, m13 5961 pminsd m%3, m13 5962 paddd m%1, m11 5963 paddd m%3, m11 5964 mova m%5, [r3+16*( 0+%1)] 5965 mova m%2, [r3+16*(15-%1)] 5966 psubd m%4, m%1, m%2 ; out31 - n 5967 paddd m%1, m%2 ; out0 + n 5968 paddd m%2, m%3, m%5 ; out15 - n 5969 psubd m%3, m%5 ; out16 + n 5970 REPX {psrad x, %6}, m%1, m%3, m%2, m%4 5971%endmacro 5972 5973.round_dct32: 5974%if ARCH_X86_64 5975 psrld m11, 10 ; pd_2 5976 IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31 5977 mova [r3+ 0*16], m6 5978 mova [r3+23*16], m7 5979 IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30 5980 packssdw m0, m1 ; 0 1 5981 packssdw m14, m15 ; 14 15 5982 packssdw m8, m6 ; 16 17 5983 packssdw m7, m9 ; 30 31 5984 mova [r3+16*15], m14 5985 mova [r3+16*14], m7 5986 IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29 5987 IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28 5988 packssdw m2, m3 ; 2 3 5989 packssdw m14, m15 ; 12 13 5990 packssdw m10, m1 ; 18 19 5991 packssdw m9, m7 ; 28 29 5992 mova [r3+16*13], m14 5993 mova [r3+16*12], m9 5994 IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27 5995 IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26 5996 packssdw m4, m5 ; 4 5 5997 packssdw m14, m15 ; 10 11 5998 packssdw m1, m3 ; 20 21 5999 packssdw m9, m7 ; 26 27 6000 mova [r3+16*11], m14 6001 mova [r3+16*10], m9 6002 mova m6, [r3+ 0*16] 6003 mova m7, [r3+23*16] 6004 IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25 6005 IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24 6006 packssdw m6, m7 ; 6 7 6007 packssdw m11, m15 ; 8 9 6008 packssdw m14, m3 ; 22 23 6009 packssdw m9, m5 ; 24 25 6010 mova [r3+16*9], m11 6011 mova [r3+16*8], m9 6012 mova m12, m1 6013 ret 6014%else 6015 mova [r3+16*16], m0 6016 mova [r3+17*16], m1 6017 mova [r3+18*16], m2 6018 mova [r3+19*16], m3 6019 mova [r3+20*16], m4 6020 mova [r3+21*16], m5 6021 mova [r3+22*16], m6 6022 mova [r3+23*16], m7 6023 mova m1, [o(pd_2)] 6024 mova m2, [o(clip_18b_min)] 6025 mova m3, [o(clip_18b_max)] 6026 6027 mov r4, 15*16 6028.loop_dct32_end: 6029 mova m0, [r3+16*16] 6030 mova m6, [r3+16*24] 6031 pmaxsd m0, m2 6032 pminsd m0, m3 6033 psubd m5, m0, m6 ; idct16 out15 - n 6034 paddd m0, m6 ; idct16 out0 + n 6035 pmaxsd m0, m2 6036 pmaxsd m5, m2 6037 pminsd m0, m3 6038 pminsd m5, m3 6039 paddd m0, m1 6040 paddd m5, m1 6041 mova m7, [r3] 6042 mova m4, [r3+r4] 6043 psubd m6, m0, m4 ; out31 - n 6044 paddd m0, m4 ; out0 + n 6045 paddd m4, m5, m7 ; out15 - n 6046 psubd m5, m7 ; out16 + n 6047 REPX {psrad x, 2}, m0, m5, m4, m6 6048 mova [r3], m0 6049 mova [r3+r4], m4 6050 mova [r3+16*16], m5 6051 mova [r3+24*16], m6 6052 add r3, 16 6053 sub r4, 32 6054 jg .loop_dct32_end 6055 ret 6056%endif 6057 6058.dconly: 6059 imul r5d, [cq], 181 6060 mov [cq], eobd ; 0 6061 mov r3d, 8 6062.dconly1: 6063 add r5d, 640 6064 sar r5d, 10 6065.dconly2: 6066 imul r5d, 2896 6067 add r5d, 34816 6068 movd m0, r5d 6069 pshuflw m0, m0, q1111 6070 punpcklqdq m0, m0 6071 mova m6, 
[o(pixel_10bpc_max)] 6072 pxor m5, m5 6073.dconly_loop: 6074 mova m1, [dstq+16*0] 6075 mova m2, [dstq+16*1] 6076 mova m3, [dstq+16*2] 6077 mova m4, [dstq+16*3] 6078 REPX {paddw x, m0}, m1, m2, m3, m4 6079 REPX {pminsw x, m6}, m1, m2, m3, m4 6080 REPX {pmaxsw x, m5}, m1, m2, m3, m4 6081 mova [dstq+16*0], m1 6082 mova [dstq+16*1], m2 6083 mova [dstq+16*2], m3 6084 mova [dstq+16*3], m4 6085 add dstq, strideq 6086 dec r3d 6087 jg .dconly_loop 6088 RET 6089 6090cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ 6091 dst, stride, c, eob 6092 LEA r6, base 6093 test eobd, eobd 6094 jz .dconly 6095 6096 ; remove entirely-zero iterations 6097%undef cmp 6098 mov r5d, 8 6099.zero_loop: 6100 sub r5d, 2 6101 cmp eobw, word [o2(tbl_32x16_2d)+r5] 6102 jl .zero_loop 6103 6104 ; actual first pass after skipping all-zero data 6105.loop_pass1: 6106%if ARCH_X86_64 6107 mova m11, [o(pd_2048)] 6108 mova m12, [o(clip_18b_min)] 6109 mova m13, [o(clip_18b_max)] 6110 mova m14, [o(pd_2896)] 6111%endif 6112 mova m0, [cq+64* 1+r5*8] 6113 mova m1, [cq+64* 7+r5*8] 6114 mova m2, [cq+64* 9+r5*8] 6115 mova m3, [cq+64*15+r5*8] 6116 mova m4, [cq+64*17+r5*8] 6117 mova m5, [cq+64*23+r5*8] 6118 mova m6, [cq+64*25+r5*8] 6119 mova m7, [cq+64*31+r5*8] 6120 mov r3, rsp 6121 call m(idct_8x4_internal_16bpc).rect2_mul 6122 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 6123 6124 mova m0, [cq+64* 3+r5*8] 6125 mova m1, [cq+64* 5+r5*8] 6126 mova m2, [cq+64*11+r5*8] 6127 mova m3, [cq+64*13+r5*8] 6128 mova m4, [cq+64*19+r5*8] 6129 mova m5, [cq+64*21+r5*8] 6130 mova m6, [cq+64*27+r5*8] 6131 mova m7, [cq+64*29+r5*8] 6132%if ARCH_X86_32 6133 add r3, 16*8 6134%endif 6135 call m(idct_8x4_internal_16bpc).rect2_mul 6136%if ARCH_X86_32 6137 sub r3, 16*8 6138%endif 6139 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 6140 add r3, 16*(16+4*ARCH_X86_32) 6141 6142 mova m0, [cq+64* 2+r5*8] 6143 mova m1, [cq+64* 6+r5*8] 6144 mova m2, [cq+64*10+r5*8] 6145 mova m3, [cq+64*14+r5*8] 6146 mova m4, [cq+64*18+r5*8] 6147 mova m5, [cq+64*22+r5*8] 6148 mova m6, [cq+64*26+r5*8] 6149 mova m7, [cq+64*30+r5*8] 6150 call m(idct_8x4_internal_16bpc).rect2_mul 6151 call m(idct_16x4_internal_16bpc).main_oddhalf 6152 6153 mova m0, [cq+64* 0+r5*8] 6154 mova m1, [cq+64* 4+r5*8] 6155 mova m2, [cq+64* 8+r5*8] 6156 mova m3, [cq+64*12+r5*8] 6157 mova m4, [cq+64*16+r5*8] 6158 mova m5, [cq+64*20+r5*8] 6159 mova m6, [cq+64*24+r5*8] 6160 mova m7, [cq+64*28+r5*8] 6161 call m(idct_8x4_internal_16bpc).rect2_mul 6162 call m(idct_8x4_internal_16bpc).main_pass1 6163 call m(idct_8x4_internal_16bpc).round 6164 sub r3, 16*(16+4*ARCH_X86_32) 6165 call .round_dct32 6166 6167%if ARCH_X86_64 6168 call m(idct_8x4_internal_16bpc).transpose4x8packed 6169 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6170 mova [cq+64* 8+r5*8], m8 6171 mova [cq+64* 9+r5*8], m9 6172 mova [cq+64*10+r5*8], m10 6173 mova [cq+64*11+r5*8], m11 6174 mova m8, [r3+16* 9] ; 8 9 6175 mova m10, [r3+16*11] ; 10 11 6176 mova m12, [r3+16*13] ; 12 13 6177 mova m14, [r3+16*15] ; 14 15 6178 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6179 mova [cq+64* 4+r5*8], m8 6180 mova [cq+64* 5+r5*8], m9 6181 mova [cq+64* 6+r5*8], m10 6182 mova [cq+64* 7+r5*8], m11 6183 mova m8, [r3+16* 8] ; 24 25 6184 mova m10, [r3+16*10] ; 26 27 6185 mova m12, [r3+16*12] ; 28 29 6186 mova m14, [r3+16*14] ; 30 31 6187 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6188 mova [cq+64*12+r5*8], m8 6189 mova [cq+64*13+r5*8], m9 6190 mova [cq+64*14+r5*8], m10 6191 mova [cq+64*15+r5*8], m11 
6192%else 6193 sub r3, 8*16 6194 mova m0, [r3+ 8*16] 6195 mova m2, [r3+10*16] 6196 mova m4, [r3+12*16] 6197 mova m6, [r3+14*16] 6198 packssdw m0, [r3+ 9*16] 6199 packssdw m2, [r3+11*16] 6200 packssdw m4, [r3+13*16] 6201 packssdw m6, [r3+15*16] 6202 call m(idct_8x4_internal_16bpc).transpose4x8packed 6203 mova [cq+64* 4+r5*8], m0 6204 mova [cq+64* 5+r5*8], m1 6205 mova [cq+64* 6+r5*8], m2 6206 mova [cq+64* 7+r5*8], m3 6207 mova m0, [r3+16*16] 6208 mova m2, [r3+18*16] 6209 mova m4, [r3+20*16] 6210 mova m6, [r3+22*16] 6211 packssdw m0, [r3+17*16] 6212 packssdw m2, [r3+19*16] 6213 packssdw m4, [r3+21*16] 6214 packssdw m6, [r3+23*16] 6215 call m(idct_8x4_internal_16bpc).transpose4x8packed 6216 mova [cq+64* 8+r5*8], m0 6217 mova [cq+64* 9+r5*8], m1 6218 mova [cq+64*10+r5*8], m2 6219 mova [cq+64*11+r5*8], m3 6220 mova m0, [r3+31*16] 6221 mova m2, [r3+29*16] 6222 mova m4, [r3+27*16] 6223 mova m6, [r3+25*16] 6224 packssdw m0, [r3+30*16] 6225 packssdw m2, [r3+28*16] 6226 packssdw m4, [r3+26*16] 6227 packssdw m6, [r3+24*16] 6228 call m(idct_8x4_internal_16bpc).transpose4x8packed 6229 mova [cq+64*12+r5*8], m0 6230 mova [cq+64*13+r5*8], m1 6231 mova [cq+64*14+r5*8], m2 6232 mova [cq+64*15+r5*8], m3 6233 mova m0, [r3+ 0*16] 6234 mova m2, [r3+ 2*16] 6235 mova m4, [r3+ 4*16] 6236 mova m6, [r3+ 6*16] 6237 packssdw m0, [r3+ 1*16] 6238 packssdw m2, [r3+ 3*16] 6239 packssdw m4, [r3+ 5*16] 6240 packssdw m6, [r3+ 7*16] 6241 call m(idct_8x4_internal_16bpc).transpose4x8packed 6242%endif 6243 mova [cq+64* 0+r5*8], m0 6244 mova [cq+64* 1+r5*8], m1 6245 mova [cq+64* 2+r5*8], m2 6246 mova [cq+64* 3+r5*8], m3 6247 pxor m0, m0 6248 REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ 6249 24, 25, 26, 27, 28, 29, 30, 31 6250 sub r5d, 2 6251 jge .loop_pass1 6252 6253 ; pass=2, we need to call this otherwise the stack pointer has 6254 ; the wrong offset in the 8-bit code 6255 call .pass2 6256 RET 6257 6258.pass2: 6259%if ARCH_X86_64 6260 mova m8, [o(pw_2048)] 6261 pxor m9, m9 6262 mova m10, [o(pixel_10bpc_max)] 6263%if WIN64 6264 mov [rsp+16*16+gprsize], r7 6265%endif 6266 mov r7, dstq 6267%else 6268 mov [rsp+2*gprsize+16*16], dstq 6269%endif 6270 lea r3, [strideq*3] 6271 mov r4d, 4 6272 jmp m(idct_16x16_internal_16bpc).loop_pass2 6273 6274.round_dct32: 6275%if ARCH_X86_64 6276 psrld m11, 11 ; pd_1 6277 IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31 6278 mova [r3+ 0*16], m6 6279 mova [r3+23*16], m7 6280 IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30 6281 packssdw m0, m1 ; 0 1 6282 packssdw m14, m15 ; 14 15 6283 packssdw m8, m6 ; 16 17 6284 packssdw m7, m9 ; 30 31 6285 mova [r3+16*15], m14 6286 mova [r3+16*14], m7 6287 IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29 6288 IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28 6289 packssdw m2, m3 ; 2 3 6290 packssdw m14, m15 ; 12 13 6291 packssdw m10, m1 ; 18 19 6292 packssdw m9, m7 ; 28 29 6293 mova [r3+16*13], m14 6294 mova [r3+16*12], m9 6295 IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27 6296 IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26 6297 packssdw m4, m5 ; 4 5 6298 packssdw m14, m15 ; 10 11 6299 packssdw m1, m3 ; 20 21 6300 packssdw m9, m7 ; 26 27 6301 mova [r3+16*11], m14 6302 mova [r3+16*10], m9 6303 mova m6, [r3+ 0*16] 6304 mova m7, [r3+23*16] 6305 IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25 6306 IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24 6307 packssdw m6, m7 ; 6 7 6308 packssdw m11, m15 ; 8 9 6309 packssdw m14, m3 ; 22 23 6310 packssdw m9, m5 ; 24 25 6311 mova [r3+16*9], m11 6312 mova [r3+16*8], m9 6313 mova m12, m1 6314 ret 6315%else 6316 mova [r3+16*16], m0 6317 mova [r3+17*16], m1 
6318 mova [r3+18*16], m2 6319 mova [r3+19*16], m3 6320 mova [r3+20*16], m4 6321 mova [r3+21*16], m5 6322 mova [r3+22*16], m6 6323 mova [r3+23*16], m7 6324 pcmpeqd m1, m1 ; -1 6325 mova m2, [o(clip_18b_min)] 6326 mova m3, [o(clip_18b_max)] 6327 6328 mov r4, 15*16 6329.loop_dct32_end: 6330 mova m0, [r3+16*16] 6331 mova m6, [r3+16*24] 6332 psubd m5, m0, m6 ; idct16 out15 - n 6333 paddd m0, m6 ; idct16 out0 + n 6334 pmaxsd m0, m2 6335 pmaxsd m5, m2 6336 pminsd m0, m3 6337 pminsd m5, m3 6338 psubd m0, m1 6339 psubd m5, m1 6340 mova m7, [r3] 6341 mova m4, [r3+r4] 6342 psubd m6, m0, m4 ; out31 - n 6343 paddd m0, m4 ; out0 + n 6344 paddd m4, m5, m7 ; out15 - n 6345 psubd m5, m7 ; out16 + n 6346 REPX {psrad x, 1}, m0, m5, m4, m6 6347 mova [r3], m0 6348 mova [r3+r4], m4 6349 mova [r3+16*16], m5 6350 mova [r3+24*16], m6 6351 add r3, 16 6352 sub r4, 32 6353 jg .loop_dct32_end 6354 ret 6355%endif 6356 6357.dconly: 6358 imul r5d, [cq], 181 6359 mov [cq], eobd ; 0 6360 mov r3d, 16 6361 add r5d, 128 6362 sar r5d, 8 6363 imul r5d, 181 6364 add r5d, 384 6365 sar r5d, 9 6366 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 6367 6368cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ 6369 dst, stride, c, eob 6370 LEA r6, base 6371 test eobd, eobd 6372 jz .dconly 6373 6374 ; remove entirely-zero iterations 6375%if ARCH_X86_32 6376 mov [rsp+5*32*16+1*gprsize], dstq 6377%elif WIN64 6378 mov [rsp+5*32*16+1*gprsize], r7 6379%endif 6380%undef cmp 6381 mov r5d, 14 6382 cmp eobw, word [o2(tbl_32x32_2d)+r5] 6383 jge .end_zero_loop 6384 pxor m0, m0 6385.zero_loop: 6386 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 6387 movzx t1d, t0b 6388 shr t0d, 8 6389 mova [rsp+32*16+r5*8+0*32*16], m0 6390 mova [rsp+40*16+r5*8+0*32*16], m0 6391 mova [rsp+32*16+t0*8+0*32*16], m0 6392 mova [rsp+32*16+t1*8+0*32*16], m0 6393 mova [rsp+32*16+r5*8+1*32*16], m0 6394 mova [rsp+40*16+r5*8+1*32*16], m0 6395 mova [rsp+32*16+t0*8+1*32*16], m0 6396 mova [rsp+32*16+t1*8+1*32*16], m0 6397 mova [rsp+32*16+r5*8+2*32*16], m0 6398 mova [rsp+40*16+r5*8+2*32*16], m0 6399 mova [rsp+32*16+t0*8+2*32*16], m0 6400 mova [rsp+32*16+t1*8+2*32*16], m0 6401 mova [rsp+32*16+r5*8+3*32*16], m0 6402 mova [rsp+40*16+r5*8+3*32*16], m0 6403 mova [rsp+32*16+t0*8+3*32*16], m0 6404 mova [rsp+32*16+t1*8+3*32*16], m0 6405 sub r5d, 2 6406 cmp eobw, word [o2(tbl_32x32_2d)+r5] 6407 jl .zero_loop 6408.end_zero_loop: 6409 6410 ; actual first pass after skipping all-zero data 6411 mov [rsp+gprsize*0+5*32*16], eobd 6412.loop_pass1: 6413 mova m0, [cq+128* 1+r5*8] 6414 mova m1, [cq+128* 7+r5*8] 6415 mova m2, [cq+128* 9+r5*8] 6416 mova m3, [cq+128*15+r5*8] 6417 mova m4, [cq+128*17+r5*8] 6418 mova m5, [cq+128*23+r5*8] 6419 mova m6, [cq+128*25+r5*8] 6420 mova m7, [cq+128*31+r5*8] 6421%if ARCH_X86_64 6422 mova m11, [o(pd_2048)] 6423 mova m12, [o(clip_18b_min)] 6424 mova m13, [o(clip_18b_max)] 6425 mova m14, [o(pd_2896)] 6426%endif 6427 mov r3, rsp 6428 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 6429 mova m0, [cq+128* 3+r5*8] 6430 mova m1, [cq+128* 5+r5*8] 6431 mova m2, [cq+128*11+r5*8] 6432 mova m3, [cq+128*13+r5*8] 6433 mova m4, [cq+128*19+r5*8] 6434 mova m5, [cq+128*21+r5*8] 6435 mova m6, [cq+128*27+r5*8] 6436 mova m7, [cq+128*29+r5*8] 6437 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 6438 mova m0, [cq+128* 2+r5*8] 6439 mova m1, [cq+128* 6+r5*8] 6440 mova m2, [cq+128*10+r5*8] 6441 mova m3, [cq+128*14+r5*8] 6442 mova m4, [cq+128*18+r5*8] 6443 mova m5, [cq+128*22+r5*8] 6444 mova m6, [cq+128*26+r5*8] 6445 mova m7, [cq+128*30+r5*8] 6446 add r3, 
16*(16+4*ARCH_X86_32) 6447 call m(idct_16x4_internal_16bpc).main_oddhalf 6448 mova m0, [cq+128* 0+r5*8] 6449 mova m1, [cq+128* 4+r5*8] 6450 mova m2, [cq+128* 8+r5*8] 6451 mova m3, [cq+128*12+r5*8] 6452 mova m4, [cq+128*16+r5*8] 6453 mova m5, [cq+128*20+r5*8] 6454 mova m6, [cq+128*24+r5*8] 6455 mova m7, [cq+128*28+r5*8] 6456 call m(idct_8x4_internal_16bpc).main_pass1 6457 call m(idct_8x4_internal_16bpc).round 6458 sub r3, 16*(16+4*ARCH_X86_32) 6459 call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32 6460 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 6461 movzx t1d, t0b 6462 shr t0d, 8 6463%if ARCH_X86_64 6464 call m(idct_8x4_internal_16bpc).transpose4x8packed 6465 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6466 mova [rsp+32*16+r5*8+2*32*16], m8 6467 mova [rsp+40*16+r5*8+2*32*16], m10 6468 mova [rsp+32*16+t1*8+2*32*16], m9 6469 mova [rsp+32*16+t0*8+2*32*16], m11 6470 mova m8, [r3+16* 9] ; 8 9 6471 mova m10, [r3+16*11] ; 10 11 6472 mova m12, [r3+16*13] ; 12 13 6473 mova m14, [r3+16*15] ; 14 15 6474 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6475 mova [rsp+32*16+r5*8+1*32*16], m8 6476 mova [rsp+40*16+r5*8+1*32*16], m10 6477 mova [rsp+32*16+t1*8+1*32*16], m9 6478 mova [rsp+32*16+t0*8+1*32*16], m11 6479 mova m8, [r3+16* 8] ; 24 25 6480 mova m10, [r3+16*10] ; 26 27 6481 mova m12, [r3+16*12] ; 28 29 6482 mova m14, [r3+16*14] ; 30 31 6483 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6484 mova [rsp+32*16+r5*8+3*32*16], m8 6485 mova [rsp+40*16+r5*8+3*32*16], m10 6486 mova [rsp+32*16+t1*8+3*32*16], m9 6487 mova [rsp+32*16+t0*8+3*32*16], m11 6488%else 6489 sub r3, 8*16 6490 mova m0, [r3+ 8*16] 6491 mova m2, [r3+10*16] 6492 mova m4, [r3+12*16] 6493 mova m6, [r3+14*16] 6494 packssdw m0, [r3+ 9*16] 6495 packssdw m2, [r3+11*16] 6496 packssdw m4, [r3+13*16] 6497 packssdw m6, [r3+15*16] 6498 call m(idct_8x4_internal_16bpc).transpose4x8packed 6499 mova [rsp+32*16+r5*8+1*32*16], m0 6500 mova [rsp+40*16+r5*8+1*32*16], m2 6501 mova [rsp+32*16+t1*8+1*32*16], m1 6502 mova [rsp+32*16+t0*8+1*32*16], m3 6503 mova m0, [r3+16*16] 6504 mova m2, [r3+18*16] 6505 mova m4, [r3+20*16] 6506 mova m6, [r3+22*16] 6507 packssdw m0, [r3+17*16] 6508 packssdw m2, [r3+19*16] 6509 packssdw m4, [r3+21*16] 6510 packssdw m6, [r3+23*16] 6511 call m(idct_8x4_internal_16bpc).transpose4x8packed 6512 mova [rsp+32*16+r5*8+2*32*16], m0 6513 mova [rsp+40*16+r5*8+2*32*16], m2 6514 mova [rsp+32*16+t1*8+2*32*16], m1 6515 mova [rsp+32*16+t0*8+2*32*16], m3 6516 mova m0, [r3+31*16] 6517 mova m2, [r3+29*16] 6518 mova m4, [r3+27*16] 6519 mova m6, [r3+25*16] 6520 packssdw m0, [r3+30*16] 6521 packssdw m2, [r3+28*16] 6522 packssdw m4, [r3+26*16] 6523 packssdw m6, [r3+24*16] 6524 call m(idct_8x4_internal_16bpc).transpose4x8packed 6525 mova [rsp+32*16+r5*8+3*32*16], m0 6526 mova [rsp+40*16+r5*8+3*32*16], m2 6527 mova [rsp+32*16+t1*8+3*32*16], m1 6528 mova [rsp+32*16+t0*8+3*32*16], m3 6529 mova m0, [r3+ 0*16] 6530 mova m2, [r3+ 2*16] 6531 mova m4, [r3+ 4*16] 6532 mova m6, [r3+ 6*16] 6533 packssdw m0, [r3+ 1*16] 6534 packssdw m2, [r3+ 3*16] 6535 packssdw m4, [r3+ 5*16] 6536 packssdw m6, [r3+ 7*16] 6537 call m(idct_8x4_internal_16bpc).transpose4x8packed 6538%endif 6539 pxor m7, m7 6540 ; clear lower half of [cq] 6541 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ 6542 8, 9, 10, 11, 12, 13, 14, 15, \ 6543 16, 17, 18, 19, 20, 21, 22, 23, \ 6544 24, 25, 26, 27, 28, 29, 30, 31 6545 mova [rsp+32*16+r5*8+0*32*16], m0 6546 mova [rsp+40*16+r5*8+0*32*16], m2 6547 mova [rsp+32*16+t1*8+0*32*16], m1 6548 mova 
[rsp+32*16+t0*8+0*32*16], m3 6549 sub r5d, 2 6550 jge .loop_pass1 6551 6552 ; pass=2 code starts here 6553 mov eobd, [rsp+gprsize*0+5*32*16] 6554 add rsp, 29*16 6555 cmp eobd, 36 6556 jl .load_veryfast 6557 cmp eobd, 136 6558 jl .load_fast 6559 ; load normal 6560 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] 6561 jmp .run 6562.load_fast: 6563 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 6564 jmp .run 6565.load_veryfast: 6566 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 6567 ; fall-through 6568.run: 6569%if ARCH_X86_64 6570 lea r2, [dstq+64] 6571 mov r7, -8 6572%else 6573 lea r2, [rsp+(4*32+3)*16] 6574 mov dword [r2+0*gprsize], 4 6575%endif 6576 jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry 6577 6578.dconly: 6579 imul r5d, [cq], 181 6580 mov [cq], eobd ; 0 6581 mov r3d, 32 6582 add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 6583 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1 6584 6585cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ 6586 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \ 6587 dst, stride, c, eob 6588 LEA r6, base 6589 test eobd, eobd 6590 jz .dconly 6591 6592%if ARCH_X86_32 6593 DECLARE_REG_TMP 4, 1, 2, 0 6594 mov [rsp+gprsize*1+(64*2+12)*16], r0 6595 mov [rsp+gprsize*2+(64*2+12)*16], r1 6596 mov [rsp+gprsize*3+(64*2+12)*16], r2 6597%else 6598 DECLARE_REG_TMP 8, 9, 4, 7 6599 mov [rsp+gprsize*1+(64*2+12)*16], r9 6600%if WIN64 6601 mov [rsp+gprsize*2+(64*2+12)*16], r7 6602 mov [rsp+gprsize*3+(64*2+12)*16], r8 6603%endif 6604%endif 6605%undef cmp 6606 ; remove entirely-zero iterations 6607 mov r5d, 7*2 6608 cmp eobw, word [o2(tbl_16x32_2d)+r5] 6609 jge .end_zero_loop 6610 pxor m0, m0 6611.zero_loop: 6612 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] 6613 movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] 6614 movzx t0d, t1b 6615 movzx t2d, t3b 6616 shr t1d, 8 6617 shr t3d, 8 6618 mova [rsp+12*16+t0*8], m0 6619 mova [rsp+12*16+t1*8], m0 6620 mova [rsp+12*16+t2*8], m0 6621 mova [rsp+12*16+t3*8], m0 6622 mova [rsp+76*16+t0*8], m0 6623 mova [rsp+76*16+t1*8], m0 6624 mova [rsp+76*16+t2*8], m0 6625 mova [rsp+76*16+t3*8], m0 6626 sub r5d, 2 6627 cmp eobw, word [o2(tbl_16x32_2d)+r5] 6628 jl .zero_loop 6629.end_zero_loop: 6630 ; actual first pass after skipping all-zero data 6631 mov [rsp+gprsize*0+(64*2+12)*16], eobd 6632 mov r3, rsp 6633%if ARCH_X86_32 6634 DECLARE_REG_TMP 4, 1, 6, 0 6635 mov r2, [rsp+gprsize*3+(64*2+12)*16] 6636 mov [rsp+gprsize*3+(64*2+12)*16], r6 6637%endif 6638.loop_pass1: 6639%if ARCH_X86_64 6640 mova m11, [o(pd_2048)] 6641 mova m12, [o(clip_18b_min)] 6642 mova m13, [o(clip_18b_max)] 6643 mova m14, [o(pd_2896)] 6644%endif 6645 mova m0, [cq+ 1*128+r5*8] 6646 mova m1, [cq+ 3*128+r5*8] 6647 mova m2, [cq+ 5*128+r5*8] 6648 mova m3, [cq+ 7*128+r5*8] 6649 mova m4, [cq+ 9*128+r5*8] 6650 mova m5, [cq+11*128+r5*8] 6651 mova m6, [cq+13*128+r5*8] 6652 mova m7, [cq+15*128+r5*8] 6653 call m(idct_16x4_internal_16bpc).main_oddhalf 6654 6655 mova m0, [cq+ 0*128+r5*8] 6656 mova m1, [cq+ 2*128+r5*8] 6657 mova m2, [cq+ 4*128+r5*8] 6658 mova m3, [cq+ 6*128+r5*8] 6659 mova m4, [cq+ 8*128+r5*8] 6660 mova m5, [cq+10*128+r5*8] 6661 mova m6, [cq+12*128+r5*8] 6662 mova m7, [cq+14*128+r5*8] 6663 call m(idct_8x4_internal_16bpc).main_pass1 6664 call m(idct_8x4_internal_16bpc).round 6665 call m(idct_16x16_internal_16bpc).round 6666%if ARCH_X86_64 6667 packssdw m0, m1 6668 packssdw m2, m3 6669 packssdw m4, m5 6670 packssdw m6, m7 6671 packssdw m8, m9 6672 packssdw m10, m11 6673 packssdw m12, m13 6674 packssdw m14, m15 6675%endif 6676 call 
m(idct_8x4_internal_16bpc).transpose4x8packed 6677 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] 6678 movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] 6679 movzx t0d, t1b 6680 movzx t2d, t3b 6681 shr t1d, 8 6682 shr t3d, 8 6683%if ARCH_X86_64 6684 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6685 mova [rsp+76*16+t0*8], m8 6686 mova [rsp+76*16+t1*8], m9 6687 mova [rsp+76*16+t2*8], m10 6688 mova [rsp+76*16+t3*8], m11 6689%else 6690 mova [rsp+76*16+t0*8], m0 6691 mova [rsp+76*16+t1*8], m1 6692 mova [rsp+76*16+t2*8], m2 6693 mova [rsp+76*16+t3*8], m3 6694 mova m0, [rsp+ 8*16] 6695 mova m2, [rsp+ 9*16] 6696 mova m4, [rsp+10*16] 6697 mova m6, [rsp+11*16] 6698 call m(idct_8x4_internal_16bpc).transpose4x8packed 6699%endif 6700 mova [rsp+12*16+t0*8], m0 6701 mova [rsp+12*16+t1*8], m1 6702 mova [rsp+12*16+t2*8], m2 6703 mova [rsp+12*16+t3*8], m3 6704%if ARCH_X86_32 6705 mov r6, [rsp+gprsize*3+(64*2+12)*16] 6706%endif 6707 pxor m7, m7 6708 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 6709 sub r5d, 2 6710 jge .loop_pass1 6711 6712 ; pass=2 6713 mov eobd, [rsp+gprsize*0+(64*2+12)*16] 6714 cmp eobd, 151 6715 jl .fast 6716 ; fall-through 6717%if ARCH_X86_64 6718 DECLARE_REG_TMP 8, 9 6719%else 6720 DECLARE_REG_TMP 1, 5 6721%endif 6722 lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 6723 lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] 6724 jmp .run 6725.fast: 6726 lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 6727 lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] 6728.run: 6729 add rsp, 9*16 6730 6731%if ARCH_X86_64 6732 lea r2, [dstq+32] 6733 mov r7, -4 6734%else 6735 lea r2, [rsp+(64*2+3)*16] 6736 mov [r2+4*gprsize], t0 6737 mov [r2+5*gprsize], t1 6738 mov r1, [r2+2*gprsize] 6739 mov dword [r2+0*gprsize], 2 6740%endif 6741.loop_pass2: 6742%if ARCH_X86_32 6743 mov dstq, [r2+1*gprsize] 6744%endif 6745 call .pass2 6746 add rsp, 64*16 6747%if ARCH_X86_64 6748 add r7, 2 6749 lea dstq, [r2+r7*8] 6750 jl .loop_pass2 6751%else 6752 add dword [r2+1*gprsize], 16 6753 dec dword [r2+0*gprsize] 6754 jg .loop_pass2 6755%endif 6756%assign stack_size (stack_size-(64*2+9)*16) 6757%if STACK_ALIGNMENT >= 16 6758%assign stack_size_padded (stack_size_padded-(64*2+9)*16) 6759%assign stack_offset (stack_offset-(64*2+9)*16) 6760%else 6761%xdefine rstkm [rsp + stack_size] 6762%endif 6763%if ARCH_X86_64 6764 mov r9, [rsp+gprsize*1+3*16] 6765%if WIN64 6766 mov r7, [rsp+gprsize*2+3*16] 6767 mov r8, [rsp+gprsize*3+3*16] 6768%endif 6769%endif 6770 RET 6771 6772.pass2: 6773%if ARCH_X86_32 6774 lea r5, [o(itx8_start)] 6775%endif 6776 mova m0, [rsp+gprsize+16* 3] 6777 mova m1, [rsp+gprsize+16* 4] 6778 mova m2, [rsp+gprsize+16* 5] 6779 mova m3, [rsp+gprsize+16* 6] 6780 pxor m4, m4 6781 REPX {mova x, m4}, m5, m6, m7 6782 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main 6783 mova [rsp+gprsize+ 3*16], m0 6784 mova [rsp+gprsize+ 4*16], m1 6785 mova [rsp+gprsize+ 5*16], m2 6786 mova [rsp+gprsize+ 6*16], m3 6787 mova [rsp+gprsize+ 7*16], m4 6788 mova [rsp+gprsize+ 8*16], m5 6789 mova [rsp+gprsize+ 9*16], m6 6790 mova [rsp+gprsize+10*16], m7 6791 mova m0, [rsp+gprsize+16*11] 6792 mova m1, [rsp+gprsize+16*12] 6793 mova m2, [rsp+gprsize+16*13] 6794 mova m3, [rsp+gprsize+16*14] 6795 pxor m4, m4 6796 REPX {mova x, m4}, m5, m6, m7 6797 call m_suffix(idct_16x8_internal_8bpc, _ssse3).main 6798 mova m7, [rsp+gprsize+ 0*16] 6799 mova [rsp+gprsize+11*16], m0 6800 mova [rsp+gprsize+12*16], m1 6801 mova [rsp+gprsize+13*16], m2 6802 mova 
[rsp+gprsize+14*16], m3 6803 mova [rsp+gprsize+15*16], m4 6804 mova [rsp+gprsize+16*16], m5 6805 mova [rsp+gprsize+17*16], m6 6806 mova [rsp+gprsize+18*16], m7 6807%if ARCH_X86_64 6808 call r8 6809%else 6810 call [r2+4*gprsize] 6811%endif 6812 mova [rsp+gprsize+ 3*16], m0 6813 mova [rsp+gprsize+ 5*16], m2 6814 mova [rsp+gprsize+ 8*16], m5 6815 mova [rsp+gprsize+10*16], m7 6816%if ARCH_X86_64 6817 call r9 6818 mova m8, [o(pw_2048)] 6819 pxor m9, m9 6820 mova m10, [o(pixel_10bpc_max)] 6821%else 6822 call [r2+5*gprsize] 6823%endif 6824 lea r3, [strideq*3] 6825 lea r4, [rsp+gprsize+ 3*16] 6826%if ARCH_X86_64 6827 mov r6d, 8 6828%else 6829 mov dword [r2+2*gprsize], 8 6830%endif 6831.loop_write: 6832 mova m0, [r4+0*16] 6833 mova m1, [r4+1*16] 6834 mova m2, [r4+2*16] 6835 mova m3, [r4+3*16] 6836 mova m4, [r4+4*16] 6837 mova m5, [r4+5*16] 6838 mova m6, [r4+6*16] 6839 mova m7, [r4+7*16] 6840 call m(idct_8x8_internal_16bpc).round1_and_write_8x8 6841 lea dstq, [dstq+strideq*8] 6842 add r4, 8*16 6843%if ARCH_X86_64 6844 dec r6d 6845%else 6846 dec dword [r2+2*gprsize] 6847%endif 6848 jg .loop_write 6849 ret 6850 6851.dconly: 6852 imul r5d, [cq], 181 6853 mov [cq], eobd ; 0 6854 mov r3d, 64 6855 add r5d, 640 6856 sar r5d, 10 6857 add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 6858 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 6859 6860cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ 6861 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \ 6862 dst, stride, c, eob 6863 LEA r6, base 6864 test eobd, eobd 6865 jz .dconly 6866 6867%if ARCH_X86_32 6868 DECLARE_REG_TMP 4, 1, 2, 0 6869 mov [rsp+gprsize*1+(64*4+32)*16], r0 6870 mov [rsp+gprsize*2+(64*4+32)*16], r1 6871 mov [rsp+gprsize*3+(64*4+32)*16], r2 6872%else 6873 DECLARE_REG_TMP 8, 9, 4, 7 6874 mov [rsp+gprsize*1+(64*4+32)*16], r9 6875%if WIN64 6876 mov [rsp+gprsize*2+(64*4+32)*16], r7 6877 mov [rsp+gprsize*3+(64*4+32)*16], r8 6878%endif 6879%endif 6880%undef cmp 6881 ; remove entirely-zero iterations 6882 mov r5d, 7*2 6883 cmp eobw, word [o2(tbl_32x32_2d)+r5] 6884 jge .end_zero_loop 6885 pxor m0, m0 6886.zero_loop: 6887 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] 6888 movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] 6889 movzx t0d, t1b 6890 movzx t2d, t3b 6891 shr t1d, 8 6892 shr t3d, 8 6893 mova [rsp+ 32*16+t0*8], m0 6894 mova [rsp+ 32*16+t1*8], m0 6895 mova [rsp+ 32*16+t2*8], m0 6896 mova [rsp+ 32*16+t3*8], m0 6897 mova [rsp+ 96*16+t0*8], m0 6898 mova [rsp+ 96*16+t1*8], m0 6899 mova [rsp+ 96*16+t2*8], m0 6900 mova [rsp+ 96*16+t3*8], m0 6901 mova [rsp+160*16+t0*8], m0 6902 mova [rsp+160*16+t1*8], m0 6903 mova [rsp+160*16+t2*8], m0 6904 mova [rsp+160*16+t3*8], m0 6905 mova [rsp+224*16+t0*8], m0 6906 mova [rsp+224*16+t1*8], m0 6907 mova [rsp+224*16+t2*8], m0 6908 mova [rsp+224*16+t3*8], m0 6909 sub r5d, 2 6910 cmp eobw, word [o2(tbl_32x32_2d)+r5] 6911 jl .zero_loop 6912.end_zero_loop: 6913 ; actual first pass after skipping all-zero data 6914 mov [rsp+gprsize*0+(64*4+32)*16], eobd 6915 mov r3, rsp 6916%if ARCH_X86_32 6917 DECLARE_REG_TMP 4, 1, 6, 0 6918 mov r2, [rsp+gprsize*3+(64*4+32)*16] 6919 mov [rsp+gprsize*3+(64*4+32)*16], r6 6920%endif 6921.loop_pass1: 6922%if ARCH_X86_64 6923 mova m11, [o(pd_2048)] 6924 mova m12, [o(clip_18b_min)] 6925 mova m13, [o(clip_18b_max)] 6926 mova m14, [o(pd_2896)] 6927%endif 6928 mova m0, [cq+128* 1+r5*8] 6929 mova m1, [cq+128* 7+r5*8] 6930 mova m2, [cq+128* 9+r5*8] 6931 mova m3, [cq+128*15+r5*8] 6932 mova m4, [cq+128*17+r5*8] 6933 mova m5, [cq+128*23+r5*8] 6934 mova m6, [cq+128*25+r5*8] 6935 mova m7, 
[cq+128*31+r5*8] 6936 mov r3, rsp 6937 call m(idct_8x4_internal_16bpc).rect2_mul 6938 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 6939 6940 mova m0, [cq+128* 3+r5*8] 6941 mova m1, [cq+128* 5+r5*8] 6942 mova m2, [cq+128*11+r5*8] 6943 mova m3, [cq+128*13+r5*8] 6944 mova m4, [cq+128*19+r5*8] 6945 mova m5, [cq+128*21+r5*8] 6946 mova m6, [cq+128*27+r5*8] 6947 mova m7, [cq+128*29+r5*8] 6948%if ARCH_X86_32 6949 add r3, 16*8 6950%endif 6951 call m(idct_8x4_internal_16bpc).rect2_mul 6952%if ARCH_X86_32 6953 sub r3, 16*8 6954%endif 6955 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 6956 add r3, 16*(16+4*ARCH_X86_32) 6957 6958 mova m0, [cq+128* 2+r5*8] 6959 mova m1, [cq+128* 6+r5*8] 6960 mova m2, [cq+128*10+r5*8] 6961 mova m3, [cq+128*14+r5*8] 6962 mova m4, [cq+128*18+r5*8] 6963 mova m5, [cq+128*22+r5*8] 6964 mova m6, [cq+128*26+r5*8] 6965 mova m7, [cq+128*30+r5*8] 6966 call m(idct_8x4_internal_16bpc).rect2_mul 6967 call m(idct_16x4_internal_16bpc).main_oddhalf 6968 6969 mova m0, [cq+128* 0+r5*8] 6970 mova m1, [cq+128* 4+r5*8] 6971 mova m2, [cq+128* 8+r5*8] 6972 mova m3, [cq+128*12+r5*8] 6973 mova m4, [cq+128*16+r5*8] 6974 mova m5, [cq+128*20+r5*8] 6975 mova m6, [cq+128*24+r5*8] 6976 mova m7, [cq+128*28+r5*8] 6977 call m(idct_8x4_internal_16bpc).rect2_mul 6978 call m(idct_8x4_internal_16bpc).main_pass1 6979 call m(idct_8x4_internal_16bpc).round 6980 sub r3, 16*(16+4*ARCH_X86_32) 6981 call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32 6982 6983 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] 6984 movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] 6985 movzx t0d, t1b 6986 movzx t2d, t3b 6987 shr t1d, 8 6988 shr t3d, 8 6989%if ARCH_X86_64 6990 call m(idct_8x4_internal_16bpc).transpose4x8packed 6991 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 6992 mova [rsp+160*16+t0*8], m8 6993 mova [rsp+160*16+t1*8], m9 6994 mova [rsp+160*16+t2*8], m10 6995 mova [rsp+160*16+t3*8], m11 6996 mova m8, [r3+16* 9] ; 8 9 6997 mova m10, [r3+16*11] ; 10 11 6998 mova m12, [r3+16*13] ; 12 13 6999 mova m14, [r3+16*15] ; 14 15 7000 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 7001 mova [rsp+ 96*16+t0*8], m8 7002 mova [rsp+ 96*16+t1*8], m9 7003 mova [rsp+ 96*16+t2*8], m10 7004 mova [rsp+ 96*16+t3*8], m11 7005 mova m8, [r3+16* 8] ; 24 25 7006 mova m10, [r3+16*10] ; 26 27 7007 mova m12, [r3+16*12] ; 28 29 7008 mova m14, [r3+16*14] ; 30 31 7009 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi 7010 mova [rsp+224*16+t0*8], m8 7011 mova [rsp+224*16+t1*8], m9 7012 mova [rsp+224*16+t2*8], m10 7013 mova [rsp+224*16+t3*8], m11 7014%else 7015 sub r3, 8*16 7016 mova m0, [r3+ 8*16] 7017 mova m2, [r3+10*16] 7018 mova m4, [r3+12*16] 7019 mova m6, [r3+14*16] 7020 packssdw m0, [r3+ 9*16] 7021 packssdw m2, [r3+11*16] 7022 packssdw m4, [r3+13*16] 7023 packssdw m6, [r3+15*16] 7024 call m(idct_8x4_internal_16bpc).transpose4x8packed 7025 mova [rsp+ 96*16+t0*8], m0 7026 mova [rsp+ 96*16+t1*8], m1 7027 mova [rsp+ 96*16+t2*8], m2 7028 mova [rsp+ 96*16+t3*8], m3 7029 mova m0, [r3+16*16] 7030 mova m2, [r3+18*16] 7031 mova m4, [r3+20*16] 7032 mova m6, [r3+22*16] 7033 packssdw m0, [r3+17*16] 7034 packssdw m2, [r3+19*16] 7035 packssdw m4, [r3+21*16] 7036 packssdw m6, [r3+23*16] 7037 call m(idct_8x4_internal_16bpc).transpose4x8packed 7038 mova [rsp+160*16+t0*8], m0 7039 mova [rsp+160*16+t1*8], m1 7040 mova [rsp+160*16+t2*8], m2 7041 mova [rsp+160*16+t3*8], m3 7042 mova m0, [r3+31*16] 7043 mova m2, [r3+29*16] 7044 mova m4, [r3+27*16] 7045 mova m6, [r3+25*16] 7046 packssdw m0, [r3+30*16] 7047 packssdw m2, [r3+28*16] 
7048 packssdw m4, [r3+26*16] 7049 packssdw m6, [r3+24*16] 7050 call m(idct_8x4_internal_16bpc).transpose4x8packed 7051 mova [rsp+224*16+t0*8], m0 7052 mova [rsp+224*16+t1*8], m1 7053 mova [rsp+224*16+t2*8], m2 7054 mova [rsp+224*16+t3*8], m3 7055 mova m0, [r3+ 0*16] 7056 mova m2, [r3+ 2*16] 7057 mova m4, [r3+ 4*16] 7058 mova m6, [r3+ 6*16] 7059 packssdw m0, [r3+ 1*16] 7060 packssdw m2, [r3+ 3*16] 7061 packssdw m4, [r3+ 5*16] 7062 packssdw m6, [r3+ 7*16] 7063 call m(idct_8x4_internal_16bpc).transpose4x8packed 7064%endif 7065 mova [rsp+ 32*16+t0*8], m0 7066 mova [rsp+ 32*16+t1*8], m1 7067 mova [rsp+ 32*16+t2*8], m2 7068 mova [rsp+ 32*16+t3*8], m3 7069 pxor m0, m0 7070 REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \ 7071 8, 9, 10, 11, 12, 13, 14, 15, \ 7072 16, 17, 18, 19, 20, 21, 22, 23, \ 7073 24, 25, 26, 27, 28, 29, 30, 31 7074%if ARCH_X86_32 7075 mov r6, [rsp+gprsize*3+(64*4+32)*16] 7076%endif 7077 sub r5d, 2 7078 jge .loop_pass1 7079 7080 ; pass=2 7081 mov eobd, [rsp+gprsize*0+(64*4+32)*16] 7082 cmp eobd, 136 7083 jl .fast 7084 ; fall-through 7085%if ARCH_X86_64 7086 DECLARE_REG_TMP 8, 9 7087%else 7088 DECLARE_REG_TMP 1, 5 7089%endif 7090 lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 7091 lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] 7092 jmp .run 7093.fast: 7094 lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 7095 lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] 7096.run: 7097 add rsp, 29*16 7098 7099%if ARCH_X86_64 7100 lea r2, [dstq+64] 7101 mov r7, -8 7102%else 7103 lea r2, [rsp+(64*4+3)*16] 7104 mov [r2+4*gprsize], t0 7105 mov [r2+5*gprsize], t1 7106 mov r1, [r2+2*gprsize] 7107 mov dword [r2+0*gprsize], 4 7108%endif 7109 jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 7110 7111.dconly: 7112 imul r5d, [cq], 181 7113 mov [cq], eobd ; 0 7114 mov r3d, 64 7115 add r5d, 128 7116 sar r5d, 8 7117 imul r5d, 181 7118 add r5d, 384 7119 sar r5d, 9 7120 add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 7121 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 7122 7123cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ 7124 dst, stride, c, eob 7125 LEA r6, base 7126 test eobd, eobd 7127 jz .dconly 7128 7129 ; remove entirely-zero iterations 7130%undef cmp 7131 mov r5d, 8 7132.zero_loop: 7133 sub r5d, 2 7134 cmp eobw, word [o2(tbl_32x16_2d)+r5] 7135 jl .zero_loop 7136 7137 ; actual first pass after skipping all-zero data 7138.loop_pass1: 7139%if ARCH_X86_64 7140 mova m11, [o(pd_2048)] 7141 mova m12, [o(clip_18b_min)] 7142 mova m13, [o(clip_18b_max)] 7143 mova m14, [o(pd_2896)] 7144%endif 7145 7146 mov r3, rsp 7147 lea r4, [o(idct64_mul_16bpc)] 7148 mova m0, [cq+64* 1+r5*8] 7149 mova m1, [cq+64*31+r5*8] 7150 mova m2, [cq+64*17+r5*8] 7151 mova m3, [cq+64*15+r5*8] 7152 call .main_part1 7153 mova m0, [cq+64* 7+r5*8] 7154 mova m1, [cq+64*25+r5*8] 7155 mova m2, [cq+64*23+r5*8] 7156 mova m3, [cq+64* 9+r5*8] 7157 call .main_part1 7158 mova m0, [cq+64* 5+r5*8] 7159 mova m1, [cq+64*27+r5*8] 7160 mova m2, [cq+64*21+r5*8] 7161 mova m3, [cq+64*11+r5*8] 7162 call .main_part1 7163 mova m0, [cq+64* 3+r5*8] 7164 mova m1, [cq+64*29+r5*8] 7165 mova m2, [cq+64*19+r5*8] 7166 mova m3, [cq+64*13+r5*8] 7167 call .main_part1 7168 call .main_part2 7169 7170 mova m0, [cq+64* 2+r5*8] 7171 mova m1, [cq+64*14+r5*8] 7172 mova m2, [cq+64*18+r5*8] 7173 mova m3, [cq+64*30+r5*8] 7174 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast 7175 7176 mova m0, [cq+64* 6+r5*8] 7177 mova m1, 
[cq+64*10+r5*8] 7178 mova m2, [cq+64*22+r5*8] 7179 mova m3, [cq+64*26+r5*8] 7180 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast 7181 add r3, 16*(24+4*ARCH_X86_32) 7182 7183 mova m0, [cq+64* 4+r5*8] 7184 mova m1, [cq+64*12+r5*8] 7185 mova m2, [cq+64*20+r5*8] 7186 mova m3, [cq+64*28+r5*8] 7187 call m(idct_16x4_internal_16bpc).main_oddhalf_fast 7188 7189 mova m0, [cq+64* 0+r5*8] 7190 mova m1, [cq+64* 8+r5*8] 7191 mova m2, [cq+64*16+r5*8] 7192 mova m3, [cq+64*24+r5*8] 7193 call m(idct_8x4_internal_16bpc).main_pass1_fast 7194 call m(idct_8x4_internal_16bpc).round 7195 mova [r3-(7+4*ARCH_X86_32)*16], m1 7196 mova [r3-(6+4*ARCH_X86_32)*16], m2 7197 mova [r3-(5+4*ARCH_X86_32)*16], m3 7198 mova [r3-(4+4*ARCH_X86_32)*16], m4 7199 mova [r3-(3+4*ARCH_X86_32)*16], m5 7200 mova [r3-(2+4*ARCH_X86_32)*16], m6 7201 mova [r3-(1+4*ARCH_X86_32)*16], m7 7202 sub r3, 16*(40+4*ARCH_X86_32-4) 7203 7204%if ARCH_X86_64 7205 psrld m15, m11, 10 ; pd_2 7206%else 7207 mova m7, [o(pd_2)] 7208%endif 7209 call .main_end_loop_start 7210 7211 lea r3, [rsp+56*16] 7212 lea r4, [cq+r5*8+64*28] 7213 call .shift_transpose 7214 sub r5d, 2 7215 jge .loop_pass1 7216 7217 ; pass=2, we need to call this otherwise the stack pointer has 7218 ; the wrong offset in the 8-bit code 7219 call .pass2 7220 RET 7221 7222.pass2: 7223%if ARCH_X86_64 7224 mova m8, [o(pw_2048)] 7225 pxor m9, m9 7226 mova m10, [o(pixel_10bpc_max)] 7227%if WIN64 7228 mov [rsp+16*16+gprsize], r7 7229%endif 7230 mov r7, dstq 7231%else 7232 mov [rsp+2*gprsize+16*16], dstq 7233%endif 7234 lea r3, [strideq*3] 7235 mov r4d, 8 7236 jmp m(idct_16x16_internal_16bpc).loop_pass2 7237 7238.main_part1: ; idct64 steps 1-5 7239 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 7240 ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 7241 ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 7242 ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 7243%if ARCH_X86_64 7244 movd m7, [r4+4*0] 7245 movd m8, [r4+4*1] 7246 movd m6, [r4+4*2] 7247 movd m9, [r4+4*3] 7248 movd m5, [r4+4*4] 7249 movd m10, [r4+4*5] 7250 movd m4, [r4+4*6] 7251 movd m15, [r4+4*7] 7252 REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 7253 pmulld m7, m0 ; t63a 7254 pmulld m0, m8 ; t32a 7255 pmulld m6, m1 ; t62a 7256 pmulld m1, m9 ; t33a 7257 pmulld m5, m2 ; t61a 7258 pmulld m2, m10 ; t34a 7259 pmulld m4, m3 ; t60a 7260 pmulld m3, m15 ; t35a 7261 movd m10, [r4+4*8] 7262 movd m15, [r4+4*9] 7263 REPX {pshufd x, x, q0000}, m10, m15 7264 REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 7265 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 7266 psubd m8, m0, m1 ; t33 7267 paddd m0, m1 ; t32 7268 psubd m1, m7, m6 ; t62 7269 paddd m7, m6 ; t63 7270 psubd m6, m3, m2 ; t34 7271 paddd m3, m2 ; t35 7272 psubd m2, m4, m5 ; t61 7273 paddd m4, m5 ; t60 7274 REPX {pmaxsd x, m12}, m8, m1, m6, m2 7275 REPX {pminsd x, m13}, m8, m1, m6, m2 7276 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a 7277 ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a 7278 REPX {pmaxsd x, m12}, m0, m3, m7, m4 7279 REPX {pminsd x, m13}, m0, m3, m7, m4 7280 movd m10, [r4+4*10] 7281 movd m15, [r4+4*11] 7282 REPX {pshufd x, x, q0000}, m10, m15 7283 psubd m5, m0, m3 ; t35a 7284 paddd m0, m3 ; t32a 7285 psubd m3, m7, m4 ; t60a 7286 paddd m7, m4 ; t63a 7287 psubd m4, m1, m6 ; t34 7288 paddd m1, m6 ; t33 7289 psubd m6, m8, m2 ; t61 7290 paddd m8, m2 ; t62 7291 REPX {pmaxsd x, m12}, m5, m3, m4, m6 7292 REPX {pminsd x, m13}, m5, m3, m4, m6 7293 ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 7294 ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; 
t34a, t61a 7295 REPX {pmaxsd x, m12}, m0, m7, m1, m8 7296 REPX {pminsd x, m13}, m0, m7, m1, m8 7297 add r4, 4*12 7298 mova [r3+16*0], m0 7299 mova [r3+16*7], m7 7300 mova [r3+16*1], m1 7301 mova [r3+16*6], m8 7302 mova [r3+16*2], m6 7303 mova [r3+16*5], m4 7304 mova [r3+16*3], m3 7305 mova [r3+16*4], m5 7306%else 7307 movd m7, [r4+4*0] 7308 movd m6, [r4+4*2] 7309 movd m5, [r4+4*4] 7310 movd m4, [r4+4*6] 7311 REPX {pshufd x, x, q0000}, m7, m6, m5, m4 7312 pmulld m7, m0 ; t63a 7313 pmulld m6, m1 ; t62a 7314 pmulld m5, m2 ; t61a 7315 pmulld m4, m3 ; t60a 7316 mova [r3+0*16], m6 7317 mova [r3+1*16], m7 7318 movd m6, [r4+4*1] 7319 movd m7, [r4+4*3] 7320 REPX {pshufd x, x, q0000}, m7, m6 7321 pmulld m0, m6 ; t32a 7322 pmulld m1, m7 ; t33a 7323 movd m6, [r4+4*5] 7324 movd m7, [r4+4*7] 7325 REPX {pshufd x, x, q0000}, m7, m6 7326 pmulld m2, m6 ; t34a 7327 pmulld m3, m7 ; t35a 7328 mova m6, [r3+0*16] 7329 mova m7, [o(pd_2048)] 7330 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 7331 paddd m7, [r3+1*16] 7332 REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4 7333 mova [r3+0*16], m5 7334 psubd m5, m0, m1 ; t33 7335 paddd m0, m1 ; t32 7336 mova [r3+1*16], m0 7337 mova m0, [r3+0*16] 7338 psubd m1, m7, m6 ; t62 7339 paddd m7, m6 ; t63 7340 psubd m6, m3, m2 ; t34 7341 paddd m3, m2 ; t35 7342 psubd m2, m4, m0 ; t61 7343 paddd m4, m0 ; t60 7344 mova m0, [o(clip_18b_min)] 7345 REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4 7346 pmaxsd m0, [r3+1*16] 7347 mova [r3+0*16], m0 7348 mova m0, [o(clip_18b_max)] 7349 REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4 7350 pminsd m0, [r3+0*16] 7351 mova [r3+0*16], m0 7352 mova [r3+1*16], m3 7353 mova [r3+2*16], m4 7354 mova [r3+3*16], m7 7355 mova m0, [o(pd_2048)] 7356 movd m3, [r4+4*8] 7357 movd m4, [r4+4*9] 7358 REPX {pshufd x, x, q0000}, m3, m4 7359 mova [r3+4*16], m2 7360 ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a 7361 mova m2, [r3+4*16] 7362 mova [r3+4*16], m5 7363 ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a 7364 mova m0, [r3+0*16] 7365 mova m3, [r3+1*16] 7366 mova m4, [r3+2*16] 7367 mova m7, [r3+3*16] 7368 psubd m5, m0, m3 ; t35a 7369 paddd m0, m3 ; t32a 7370 mova [r3+0*16], m5 7371 mova m5, [r3+4*16] 7372 psubd m3, m7, m4 ; t60a 7373 paddd m7, m4 ; t63a 7374 psubd m4, m1, m6 ; t34 7375 paddd m1, m6 ; t33 7376 psubd m6, m5, m2 ; t61 7377 paddd m2, m5 ; t62 7378 mova m5, [o(clip_18b_min)] 7379 REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2 7380 pmaxsd m5, [r3+0*16] 7381 mova [r3+0*16], m5 7382 mova m5, [o(clip_18b_max)] 7383 REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2 7384 pminsd m5, [r3+0*16] 7385 mova [r3+16*0], m0 7386 mova [r3+16*7], m7 7387 mova [r3+16*1], m1 7388 mova [r3+16*6], m2 7389 mova [r3+16*2], m4 7390 mova m7, [o(pd_2048)] 7391 movd m0, [r4+4*10] 7392 movd m1, [r4+4*11] 7393 REPX {pshufd x, x, q0000}, m0, m1 7394 ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60 7395 mova [r3+16*3], m3 7396 mova [r3+16*4], m5 7397 mova m4, [r3+2*16] 7398 ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a 7399 add r4, 4*12 7400 mova [r3+16*2], m6 7401 mova [r3+16*5], m4 7402%endif 7403 add r3, 16*8 7404 ret 7405 7406.main_part2: ; idct64 steps 6-9 7407 lea r4, [r3+16*7] 7408%if ARCH_X86_64 7409 mova m10, [o(pd_1567)] 7410 mova m15, [o(pd_3784)] 7411.main_part2_loop: 7412 mova m0, [r3-16*32] ; t32a 7413 mova m1, [r4-16*24] ; t39a 7414 mova m2, [r4-16*32] ; t63a 7415 mova m3, [r3-16*24] ; t56a 7416 mova m4, [r3-16*16] ; t40a 7417 mova m5, [r4-16* 8] ; t47a 7418 mova m6, [r4-16*16] ; t55a 7419 mova m7, [r3-16* 8] ; t48a 7420 psubd m8, m0, m1 ; t39 7421 
paddd m0, m1 ; t32 7422 psubd m1, m2, m3 ; t56 7423 paddd m2, m3 ; t63 7424 psubd m3, m5, m4 ; t40 7425 paddd m5, m4 ; t47 7426 psubd m4, m7, m6 ; t55 7427 paddd m7, m6 ; t48 7428 REPX {pmaxsd x, m12}, m8, m1, m3, m4 7429 REPX {pminsd x, m13}, m8, m1, m3, m4 7430 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a 7431 ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a 7432 REPX {pmaxsd x, m12}, m0, m2, m5, m7 7433 REPX {pminsd x, m13}, m0, m5, m2, m7 7434 psubd m6, m2, m7 ; t48a 7435 paddd m2, m7 ; t63a 7436 psubd m7, m0, m5 ; t47a 7437 paddd m0, m5 ; t32a 7438 psubd m5, m8, m4 ; t55 7439 paddd m8, m4 ; t56 7440 psubd m4, m1, m3 ; t40 7441 paddd m1, m3 ; t39 7442 REPX {pmaxsd x, m12}, m6, m7, m5, m4 7443 REPX {pminsd x, m13}, m6, m7, m5, m4 7444 REPX {pmulld x, m14}, m6, m7, m5, m4 7445 REPX {pmaxsd x, m12}, m2, m0, m8, m1 7446 REPX {pminsd x, m13}, m2, m0, m8, m1 7447 paddd m6, m11 7448 paddd m5, m11 7449 psubd m3, m6, m7 ; t47 7450 paddd m6, m7 ; t48 7451 psubd m7, m5, m4 ; t40a 7452 paddd m5, m4 ; t55a 7453 REPX {psrad x, 12}, m3, m6, m7, m5 7454 mova [r4-16* 8], m2 7455 mova [r3-16*32], m0 7456 mova [r3-16* 8], m8 7457 mova [r4-16*32], m1 7458 mova [r4-16*24], m3 7459 mova [r3-16*16], m6 7460 mova [r3-16*24], m7 7461 mova [r4-16*16], m5 7462%else 7463.main_part2_loop: 7464 mova m0, [r3-16*32] ; t32a 7465 mova m1, [r4-16*24] ; t39a 7466 mova m2, [r4-16*32] ; t63a 7467 mova m3, [r3-16*24] ; t56a 7468 mova m4, [r3-16*16] ; t40a 7469 mova m5, [r4-16* 8] ; t47a 7470 mova m6, [r4-16*16] ; t55a 7471 psubd m7, m0, m1 ; t39 7472 paddd m0, m1 ; t32 7473 mova [r3+0*16], m7 7474 mova m7, [r3-16* 8] ; t48a 7475 psubd m1, m2, m3 ; t56 7476 paddd m2, m3 ; t63 7477 psubd m3, m5, m4 ; t40 7478 paddd m5, m4 ; t47 7479 psubd m4, m7, m6 ; t55 7480 paddd m7, m6 ; t48 7481 mova m6, [o(clip_18b_min)] 7482 REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7 7483 pmaxsd m6, [r3+0*16] 7484 mova [r3+0*16], m6 7485 mova m6, [o(clip_18b_max)] 7486 REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7 7487 pminsd m6, [r3+0*16] 7488 mova [r3+0*16], m0 7489 mova [r3+1*16], m2 7490 mova [r3+2*16], m5 7491 mova [r3+3*16], m7 7492 mova m0, [o(pd_2048)] 7493 ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a 7494 ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a 7495 mova m2, [r3+1*16] 7496 mova m7, [r3+3*16] 7497 psubd m5, m2, m7 ; t48a 7498 paddd m2, m7 ; t63a 7499 mova [r3+1*16], m5 7500 mova m0, [r3+0*16] 7501 mova m5, [r3+2*16] 7502 psubd m7, m0, m5 ; t47a 7503 paddd m0, m5 ; t32a 7504 psubd m5, m6, m4 ; t55 7505 paddd m6, m4 ; t56 7506 psubd m4, m1, m3 ; t40 7507 paddd m1, m3 ; t39 7508 mova m3, [o(clip_18b_min)] 7509 REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1 7510 pmaxsd m3, [r3+1*16] 7511 mova [r3+0*16], m3 7512 mova m3, [o(clip_18b_max)] 7513 REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1 7514 pminsd m3, [r3+0*16] 7515 mova [r4-16* 8], m2 7516 mova [r3-16*32], m0 7517 mova [r3-16* 8], m6 7518 mova [r4-16*32], m1 7519 mova m0, [o(pd_2896)] 7520 mova m1, [o(pd_2048)] 7521 REPX {pmulld x, m0}, m3, m7, m5, m4 7522 REPX {paddd x, m1}, m3, m5 7523 psubd m6, m3, m7 ; t47 7524 paddd m3, m7 ; t48 7525 psubd m7, m5, m4 ; t40a 7526 paddd m5, m4 ; t55a 7527 REPX {psrad x, 12}, m6, m3, m7, m5 7528 mova [r4-16*24], m6 7529 mova [r3-16*16], m3 7530 mova [r3-16*24], m7 7531 mova [r4-16*16], m5 7532%endif 7533 add r3, 16 7534 sub r4, 16 7535 cmp r3, r4 7536 jl .main_part2_loop 7537 sub r3, 4*16 7538 ret 7539 7540.main_end_loop: 7541 mova m0, [r3+16*28] ; idct8 0 + n 7542.main_end_loop_start: 7543 mova m2, 
[r3+16*12] ; idct32 16 + n 7544 mova m3, [r4+16*12] ; idct32 31 - n 7545%if ARCH_X86_64 7546 mova m1, [r4+16*28] ; idct16 15 - n 7547 mova m4, [r4-16* 4] ; idct64 63 - n 7548 mova m5, [r3-16* 4] ; idct64 48 + n 7549 mova m6, [r4-16*20] ; idct64 47 - n 7550 mova m7, [r3-16*20] ; idct64 32 + n 7551 pmaxsd m0, m12 7552 pminsd m0, m13 7553 paddd m8, m0, m1 ; idct16 out0 + n 7554 psubd m0, m1 ; idct16 out15 - n 7555 REPX {pmaxsd x, m12}, m8, m0 7556 REPX {pminsd x, m13}, m8, m0 7557 paddd m1, m8, m3 ; idct32 out0 + n 7558 psubd m8, m3 ; idct32 out31 - n 7559 paddd m3, m0, m2 ; idct32 out15 - n 7560 psubd m0, m2 ; idct32 out16 + n 7561 REPX {pmaxsd x, m12}, m1, m8, m3, m0 7562 REPX {pminsd x, m13}, m1, m3, m8, m0 7563 REPX {paddd x, m15}, m1, m3, m0, m8 7564 paddd m2, m1, m4 ; idct64 out0 + n (unshifted) 7565 psubd m1, m4 ; idct64 out63 - n (unshifted) 7566 paddd m4, m3, m5 ; idct64 out15 - n (unshifted) 7567 psubd m3, m5 ; idct64 out48 + n (unshifted) 7568 paddd m5, m0, m6 ; idct64 out16 + n (unshifted) 7569 psubd m0, m6 ; idct64 out47 - n (unshifted) 7570 paddd m6, m8, m7 ; idct64 out31 - n (unshifted) 7571 psubd m8, m7 ; idct64 out32 + n (unshifted) 7572 mova [r3-16*20], m2 7573 mova [r4+16*28], m1 7574 mova [r4-16*20], m4 7575 mova [r3+16*28], m3 7576 mova [r3-16* 4], m5 7577 mova [r4+16*12], m0 7578 mova [r4-16* 4], m6 7579 mova [r3+16*12], m8 7580%else 7581 mova m5, [o(clip_18b_min)] 7582 mova m6, [o(clip_18b_max)] 7583 mova m1, [r3+16*44] ; idct16 15 - n 7584 pmaxsd m0, m5 7585 pminsd m0, m6 7586 paddd m4, m0, m1 ; idct16 out0 + n 7587 psubd m0, m1 ; idct16 out15 - n 7588 REPX {pmaxsd x, m5}, m4, m0 7589 REPX {pminsd x, m6}, m4, m0 7590 paddd m1, m4, m3 ; idct32 out0 + n 7591 psubd m4, m3 ; idct32 out31 - n 7592 paddd m3, m0, m2 ; idct32 out15 - n 7593 psubd m0, m2 ; idct32 out16 + n 7594 REPX {pmaxsd x, m5}, m1, m4, m3, m0 7595 REPX {pminsd x, m6}, m1, m3, m4, m0 7596 REPX {paddd x, m7}, m1, m3, m0, m4 7597 mova m5, [r4-16* 4] ; idct64 63 - n 7598 mova m6, [r3-16* 4] ; idct64 48 + n 7599 paddd m2, m1, m5 ; idct64 out0 + n (unshifted) 7600 psubd m1, m5 ; idct64 out63 - n (unshifted) 7601 paddd m5, m3, m6 ; idct64 out15 - n (unshifted) 7602 psubd m3, m6 ; idct64 out48 + n (unshifted) 7603 mova [r4+16*28], m1 7604 mova [r3+16*28], m3 7605 mova m6, [r4-16*20] ; idct64 47 - n 7606 mova m1, [r3-16*20] ; idct64 32 + n 7607 mova [r3-16*20], m2 7608 mova [r4-16*20], m5 7609 paddd m5, m0, m6 ; idct64 out16 + n (unshifted) 7610 psubd m0, m6 ; idct64 out47 - n (unshifted) 7611 paddd m6, m4, m1 ; idct64 out31 - n (unshifted) 7612 psubd m4, m1 ; idct64 out32 + n (unshifted) 7613 mova [r3-16* 4], m5 7614 mova [r4+16*12], m0 7615 mova [r4-16* 4], m6 7616 mova [r3+16*12], m4 7617%endif 7618 sub r4, 16 7619 add r3, 16 7620 cmp r3, r4 7621 jl .main_end_loop 7622 ret 7623 7624.shift_transpose: 7625 mova m0, [r3+0*16] 7626 mova m1, [r3+1*16] 7627 mova m2, [r3+2*16] 7628 mova m3, [r3+3*16] 7629 mova m4, [r3+4*16] 7630 mova m5, [r3+5*16] 7631 mova m6, [r3+6*16] 7632 mova m7, [r3+7*16] 7633 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 7634 packssdw m0, m1 7635 packssdw m2, m3 7636 packssdw m4, m5 7637 packssdw m6, m7 7638 call m(idct_8x4_internal_16bpc).transpose4x8packed 7639 mova [r4+0*64], m0 7640 mova [r4+1*64], m1 7641 mova [r4+2*64], m2 7642 mova [r4+3*64], m3 7643 sub r4, 4*64 7644 sub r3, 8*16 7645 cmp r3, rsp 7646 jg .shift_transpose 7647 ret 7648 7649.dconly: 7650 imul r5d, [cq], 181 7651 mov [cq], eobd ; 0 7652 mov r3d, 16 7653.dconly1: 7654 add r5d, 640 7655 sar r5d, 10 7656.dconly2: 7657 imul 
r5d, 2896 7658 add r5d, 34816 7659 movd m0, r5d 7660 pshuflw m0, m0, q1111 7661 punpcklqdq m0, m0 7662 mova m6, [o(pixel_10bpc_max)] 7663 pxor m5, m5 7664.dconly_loop: 7665 paddw m1, m0, [dstq+16*0] 7666 paddw m2, m0, [dstq+16*1] 7667 paddw m3, m0, [dstq+16*2] 7668 paddw m4, m0, [dstq+16*3] 7669 REPX {pmaxsw x, m5}, m1, m2, m3, m4 7670 REPX {pminsw x, m6}, m1, m2, m3, m4 7671 mova [dstq+16*0], m1 7672 mova [dstq+16*1], m2 7673 mova [dstq+16*2], m3 7674 mova [dstq+16*3], m4 7675 add dstq, 64 7676 btc r3d, 16 7677 jnc .dconly_loop 7678 lea dstq, [dstq+strideq-128] 7679 dec r3d 7680 jg .dconly_loop 7681 RET 7682 7683cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ 7684 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \ 7685 dst, stride, c, eob 7686 LEA r6, base 7687 test eobd, eobd 7688 jz .dconly 7689 7690%if ARCH_X86_32 7691 DECLARE_REG_TMP 0, 4, 1 7692 mov [rsp+(8*32+64+8)*16+1*gprsize], dstq 7693 mov [rsp+(8*32+64+8)*16+2*gprsize], strideq 7694%else 7695 DECLARE_REG_TMP 4, 7, 8 7696%if WIN64 7697 mov [rsp+(8*32+64+1)*16+1*gprsize], r7 7698 mov [rsp+64*16+0*gprsize], r8 7699%endif 7700%endif 7701%undef cmp 7702 ; remove entirely-zero iterations 7703 mov r5d, 14 7704 cmp eobw, word [o2(tbl_32x32_2d)+r5] 7705 jge .end_zero_loop 7706 pxor m0, m0 7707.zero_loop: 7708 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 7709 movzx t1d, t0b 7710 shr t0d, 8 7711 lea t2, [rsp+7*32*16] 7712.zero_loop_inner: 7713 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 7714 mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 7715 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0 7716 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0 7717 sub t2, 32*16 7718 cmp t2, rsp 7719 jge .zero_loop_inner 7720 sub r5d, 2 7721 cmp eobw, word [o2(tbl_32x32_2d)+r5] 7722 jl .zero_loop 7723.end_zero_loop: 7724 mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd 7725 ; actual first pass after skipping all-zero data 7726.loop_pass1: 7727%if ARCH_X86_64 7728 mova m11, [o(pd_2048)] 7729 mova m12, [o(clip_18b_min)] 7730 mova m13, [o(clip_18b_max)] 7731 mova m14, [o(pd_2896)] 7732%endif 7733 7734 mov r3, rsp 7735 lea r4, [o(idct64_mul_16bpc)] 7736 mova m0, [cq+128* 1+r5*8] 7737 mova m1, [cq+128*31+r5*8] 7738 mova m2, [cq+128*17+r5*8] 7739 mova m3, [cq+128*15+r5*8] 7740 call .rect2_mul_fast 7741 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7742 mova m0, [cq+128* 7+r5*8] 7743 mova m1, [cq+128*25+r5*8] 7744 mova m2, [cq+128*23+r5*8] 7745 mova m3, [cq+128* 9+r5*8] 7746 call .rect2_mul_fast 7747 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7748 mova m0, [cq+128* 5+r5*8] 7749 mova m1, [cq+128*27+r5*8] 7750 mova m2, [cq+128*21+r5*8] 7751 mova m3, [cq+128*11+r5*8] 7752 call .rect2_mul_fast 7753 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7754 mova m0, [cq+128* 3+r5*8] 7755 mova m1, [cq+128*29+r5*8] 7756 mova m2, [cq+128*19+r5*8] 7757 mova m3, [cq+128*13+r5*8] 7758 call .rect2_mul_fast 7759 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7760 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 7761 7762 mova m0, [cq+128* 2+r5*8] 7763 mova m1, [cq+128*14+r5*8] 7764 mova m2, [cq+128*18+r5*8] 7765 mova m3, [cq+128*30+r5*8] 7766 call .rect2_mul_fast 7767 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast 7768 7769 mova m0, [cq+128* 6+r5*8] 7770 mova m1, [cq+128*10+r5*8] 7771 mova m2, [cq+128*22+r5*8] 7772 mova m3, [cq+128*26+r5*8] 7773 call .rect2_mul_fast 7774 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast 7775 add r3, 16*(24+4*ARCH_X86_32) 7776 7777 mova m0, [cq+128* 4+r5*8] 7778 mova m1, 
[cq+128*12+r5*8] 7779 mova m2, [cq+128*20+r5*8] 7780 mova m3, [cq+128*28+r5*8] 7781 call .rect2_mul_fast 7782 call m(idct_16x4_internal_16bpc).main_oddhalf_fast 7783 7784 mova m0, [cq+128* 0+r5*8] 7785 mova m1, [cq+128* 8+r5*8] 7786 mova m2, [cq+128*16+r5*8] 7787 mova m3, [cq+128*24+r5*8] 7788 call .rect2_mul_fast 7789 call m(idct_8x4_internal_16bpc).main_pass1_fast 7790 call m(idct_8x4_internal_16bpc).round 7791 mova [r3-(7+4*ARCH_X86_32)*16], m1 7792 mova [r3-(6+4*ARCH_X86_32)*16], m2 7793 mova [r3-(5+4*ARCH_X86_32)*16], m3 7794 mova [r3-(4+4*ARCH_X86_32)*16], m4 7795 mova [r3-(3+4*ARCH_X86_32)*16], m5 7796 mova [r3-(2+4*ARCH_X86_32)*16], m6 7797 mova [r3-(1+4*ARCH_X86_32)*16], m7 7798 sub r3, 16*(40+4*ARCH_X86_32-4) 7799 7800%if ARCH_X86_64 7801 psrld m15, m11, 11 ; pd_1 7802%else 7803 mova m7, [o(pd_1)] 7804%endif 7805 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start 7806 7807 lea r3, [rsp+56*16] 7808 lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16] 7809 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] 7810 movzx t1d, t0b 7811 shr t0d, 8 7812 call .shift_transpose 7813 ; zero cq 7814 pxor m7, m7 7815 lea r4, [cq+30*128+r5*8] 7816.zero_cq_loop: 7817 REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 7818 sub r4, 4*128 7819 cmp r4, cq 7820 jg .zero_cq_loop 7821 sub r5d, 2 7822 jge .loop_pass1 7823 7824 ; pass=2 code starts here 7825 mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16] 7826%if ARCH_X86_32 7827 mov strideq, [rsp+gprsize*2+(8*32+64+8)*16] 7828%elif WIN64 7829 mov r8, [rsp+gprsize*0+64*16] 7830%endif 7831 add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16 7832 cmp eobd, 36 7833 jl .load_veryfast 7834 cmp eobd, 136 7835 jl .load_fast 7836 ; load normal 7837 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] 7838 jmp .run 7839.load_fast: 7840 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 7841 jmp .run 7842.load_veryfast: 7843 lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 7844 ; fall-through 7845.run: 7846%if ARCH_X86_64 7847 lea r2, [dstq+128] 7848 mov r7, -16 7849%else 7850 lea r2, [rsp+(8*32+3)*16] 7851 mov dword [r2+0*gprsize], 8 7852%endif 7853 jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry 7854 7855.rect2_mul_fast: 7856%if ARCH_X86_64 7857 REPX {pmulld x, m14}, m0, m1, m2, m3 7858 REPX {paddd x, m11}, m0, m1, m2, m3 7859%else 7860 mova m4, [o(pd_2896)] 7861 mova m5, [o(pd_2048)] 7862 REPX {pmulld x, m4 }, m0, m1, m2, m3 7863 REPX {paddd x, m5 }, m0, m1, m2, m3 7864%endif 7865 REPX {psrad x, 12 }, m0, m1, m2, m3 7866 ret 7867 7868.shift_transpose: 7869 mova m0, [r3+0*16] 7870 mova m1, [r3+1*16] 7871 mova m2, [r3+2*16] 7872 mova m3, [r3+3*16] 7873 mova m4, [r3+4*16] 7874 mova m5, [r3+5*16] 7875 mova m6, [r3+6*16] 7876 mova m7, [r3+7*16] 7877 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 7878 packssdw m0, m1 7879 packssdw m2, m3 7880 packssdw m4, m5 7881 packssdw m6, m7 7882 call m(idct_8x4_internal_16bpc).transpose4x8packed 7883 mova [t2+0*16+r5*8], m0 7884 mova [t2+8*16+r5*8], m2 7885 mova [t2+0*16+t0*8], m3 7886 mova [t2+0*16+t1*8], m1 7887 sub t2, 16*32 7888 sub r3, 8*16 7889 cmp r3, rsp 7890 jg .shift_transpose 7891 ret 7892 7893.dconly: 7894 imul r5d, [cq], 181 7895 mov [cq], eobd ; 0 7896 mov r3d, 32 7897 add r5d, 128 7898 sar r5d, 8 7899 imul r5d, 181 7900 add r5d, 384 7901 sar r5d, 9 7902 add rsp, (1+8*32+1*WIN64)*16 7903 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 7904 7905cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ 7906 
0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \ 7907 dst, stride, c, eob 7908 LEA r6, base 7909 test eobd, eobd 7910 jz .dconly 7911 7912%if ARCH_X86_32 7913 DECLARE_REG_TMP 4, 1, 2, 0, 6 7914 mov [rsp+gprsize*1+(64*9+8)*16], r0 7915 mov [rsp+gprsize*2+(64*9+8)*16], r1 7916 mov [rsp+gprsize*3+(64*9+8)*16], r2 7917 mov [rsp+gprsize*4+(64*9+8)*16], r6 7918%else 7919 DECLARE_REG_TMP 8, 9, 4, 7, 0 7920 mov [rsp+gprsize*1+(64*9+1)*16], r9 7921 mov [rsp+gprsize*0+64*16], r0 7922%if WIN64 7923 mov [rsp+gprsize*2+(64*9+1)*16], r7 7924 mov [rsp+gprsize*3+(64*9+1)*16], r8 7925%endif 7926%endif 7927%undef cmp 7928 7929 ; remove entirely-zero iterations 7930 mov r5d, 14 7931 cmp eobw, word [o2(tbl_32x32_2d)+r5] 7932 jge .end_zero_loop 7933 pxor m0, m0 7934.zero_loop: 7935 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] 7936 movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] 7937 movzx t0d, t1b 7938 movzx t2d, t3b 7939 shr t1d, 8 7940 shr t3d, 8 7941 lea t4, [rsp+7*64*16] 7942.zero_loop_inner: 7943 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0 7944 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0 7945 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0 7946 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0 7947 sub t4, 64*16 7948 cmp t4, rsp 7949 jge .zero_loop_inner 7950%if ARCH_X86_32 7951 mov r6, [rsp+gprsize*4+(64*9+8)*16] 7952%endif 7953 sub r5d, 2 7954 cmp eobw, word [o2(tbl_32x32_2d)+r5] 7955 jl .zero_loop 7956.end_zero_loop: 7957 mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd 7958%if ARCH_X86_32 7959 mov cq, [rsp+gprsize*3+(64*9+8)*16] 7960%endif 7961 ; actual first pass after skipping all-zero data 7962.loop_pass1: 7963%if ARCH_X86_64 7964 mova m11, [o(pd_2048)] 7965 mova m12, [o(clip_18b_min)] 7966 mova m13, [o(clip_18b_max)] 7967 mova m14, [o(pd_2896)] 7968%endif 7969 7970 mov r3, rsp 7971 lea r4, [o(idct64_mul_16bpc)] 7972 mova m0, [cq+128* 1+r5*8] 7973 mova m1, [cq+128*31+r5*8] 7974 mova m2, [cq+128*17+r5*8] 7975 mova m3, [cq+128*15+r5*8] 7976 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7977 mova m0, [cq+128* 7+r5*8] 7978 mova m1, [cq+128*25+r5*8] 7979 mova m2, [cq+128*23+r5*8] 7980 mova m3, [cq+128* 9+r5*8] 7981 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7982 mova m0, [cq+128* 5+r5*8] 7983 mova m1, [cq+128*27+r5*8] 7984 mova m2, [cq+128*21+r5*8] 7985 mova m3, [cq+128*11+r5*8] 7986 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7987 mova m0, [cq+128* 3+r5*8] 7988 mova m1, [cq+128*29+r5*8] 7989 mova m2, [cq+128*19+r5*8] 7990 mova m3, [cq+128*13+r5*8] 7991 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 7992 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 7993 7994 mova m0, [cq+128* 2+r5*8] 7995 mova m1, [cq+128*14+r5*8] 7996 mova m2, [cq+128*18+r5*8] 7997 mova m3, [cq+128*30+r5*8] 7998 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast 7999 8000 mova m0, [cq+128* 6+r5*8] 8001 mova m1, [cq+128*10+r5*8] 8002 mova m2, [cq+128*22+r5*8] 8003 mova m3, [cq+128*26+r5*8] 8004 call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast 8005 add r3, 16*(24+4*ARCH_X86_32) 8006 8007 mova m0, [cq+128* 4+r5*8] 8008 mova m1, [cq+128*12+r5*8] 8009 mova m2, [cq+128*20+r5*8] 8010 mova m3, [cq+128*28+r5*8] 8011 call m(idct_16x4_internal_16bpc).main_oddhalf_fast 8012 8013 mova m0, [cq+128* 0+r5*8] 8014 mova m1, [cq+128* 8+r5*8] 8015 mova m2, [cq+128*16+r5*8] 8016 mova m3, [cq+128*24+r5*8] 8017 call m(idct_8x4_internal_16bpc).main_pass1_fast 8018 call m(idct_8x4_internal_16bpc).round 8019 mova 
[r3-(7+4*ARCH_X86_32)*16], m1 8020 mova [r3-(6+4*ARCH_X86_32)*16], m2 8021 mova [r3-(5+4*ARCH_X86_32)*16], m3 8022 mova [r3-(4+4*ARCH_X86_32)*16], m4 8023 mova [r3-(3+4*ARCH_X86_32)*16], m5 8024 mova [r3-(2+4*ARCH_X86_32)*16], m6 8025 mova [r3-(1+4*ARCH_X86_32)*16], m7 8026 sub r3, 16*(40+4*ARCH_X86_32-4) 8027 8028%if ARCH_X86_64 8029 psrld m15, m11, 10 ; pd_2 8030%else 8031 mova m7, [o(pd_2)] 8032%endif 8033 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start 8034 8035 lea r3, [rsp+56*16] 8036 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] 8037 movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] 8038 movzx t0d, t1b 8039 movzx t2d, t3b 8040 shr t1d, 8 8041 shr t3d, 8 8042 lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16] 8043 call .shift_transpose 8044 ; zero cq 8045 pxor m7, m7 8046%if ARCH_X86_32 8047 mov cq, [rsp+gprsize*3+(64*9+8)*16] 8048%endif 8049 lea r4, [cq+30*128+r5*8] 8050.zero_cq_loop: 8051 REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 8052 sub r4, 4*128 8053 cmp r4, cq 8054 jg .zero_cq_loop 8055%if ARCH_X86_32 8056 mov r6, [rsp+gprsize*4+(64*9+8)*16] 8057%endif 8058 sub r5d, 2 8059 jge .loop_pass1 8060 8061 ; pass=2 code starts here 8062 mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16] 8063%if ARCH_X86_32 8064 mov strideq, [rsp+gprsize*2+(9*64+8)*16] 8065%else 8066 mov r0, [rsp+gprsize*0+64*16] 8067%endif 8068 add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16 8069 cmp eobd, 151 8070 jl .fast 8071 ; fall-through 8072%if ARCH_X86_64 8073 DECLARE_REG_TMP 8, 9 8074%else 8075 DECLARE_REG_TMP 1, 5 8076%endif 8077 lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] 8078 lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] 8079 jmp .run 8080.fast: 8081 lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] 8082 lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] 8083.run: 8084 8085%if ARCH_X86_64 8086 lea r2, [dstq+128] 8087 mov r7, -16 8088%else 8089 lea r2, [rsp+(64*8+3)*16] 8090 mov [r2+4*gprsize], t0 8091 mov [r2+5*gprsize], t1 8092 mov r1, [r2+2*gprsize] 8093 mov dword [r2+0*gprsize], 8 8094%endif 8095 jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 8096 8097 ; copy of pass=1 tmp-regs 8098%if ARCH_X86_32 8099 DECLARE_REG_TMP 4, 1, 2, 0, 6 8100%else 8101 DECLARE_REG_TMP 8, 9, 4, 7, 0 8102%endif 8103 8104.shift_transpose: 8105 mova m0, [r3+0*16] 8106 mova m1, [r3+1*16] 8107 mova m2, [r3+2*16] 8108 mova m3, [r3+3*16] 8109 mova m4, [r3+4*16] 8110 mova m5, [r3+5*16] 8111 mova m6, [r3+6*16] 8112 mova m7, [r3+7*16] 8113 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 8114 packssdw m0, m1 8115 packssdw m2, m3 8116 packssdw m4, m5 8117 packssdw m6, m7 8118 call m(idct_8x4_internal_16bpc).transpose4x8packed 8119 mova [t4+t0*8], m0 8120 mova [t4+t1*8], m1 8121 mova [t4+t2*8], m2 8122 mova [t4+t3*8], m3 8123 sub t4, 16*64 8124 sub r3, 8*16 8125 cmp r3, rsp 8126 jg .shift_transpose 8127 ret 8128 8129.dconly: 8130 imul r5d, [cq], 181 8131 mov [cq], eobd ; 0 8132 mov r3d, 64 8133 add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ 8134 (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16 8135 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1 8136