; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32
itx4_shuf:       dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
                 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
idct4_12_shuf:   dd 0, 2, 4, 6, 1, 3, 5, 7
idct4_12_shuf2:  dd 2, 0, 6, 4, 3, 1, 7, 5
iadst8_12_shuf:  dd 0, 4, 1, 5, 2, 6, 3, 7
idct16_12_shuf:  dd 0, 4, 1, 5, 3, 7, 2, 6
iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
pw_2048_m2048:   dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
idct4_shuf:      db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
idct32_shuf:     db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15

%macro COEF_PAIR 2-3 0
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
%if %3
dd -%2, -%2
%define pd_%2_m%2 pd_%2
%endif
%endmacro

COEF_PAIR 201, 995
COEF_PAIR 401, 1931
COEF_PAIR 799, 3406
COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2751, 2106
COEF_PAIR 2896, 1567, 1
COEF_PAIR 2896, 3784, 1
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4076, 3612
COEF_PAIR 4091, 3973

pd_8:     dd 8
pd_m601:  dd -601
pd_m1189: dd -1189
pd_m1380: dd -1380
pd_m2106: dd -2106
pd_m2598: dd -2598
pd_m2751: dd -2751
pd_m3344: dd -3344
pd_1024:  dd 1024
pd_1321:  dd 1321
pd_1448:  dd 1448
pd_1697:  dd 1697
pd_2482:  dd 2482
pd_3072:  dd 3072 ; 1024 + 2048
pd_3803:  dd 3803
pd_5119:  dd 5119 ; 1024 + 4096 - 1
pd_5120:  dd 5120 ; 1024 + 4096
pd_5793:  dd 5793
pd_6144:  dd 6144 ; 2048 + 4096
pd_17408: dd 17408 ; 1024 + 16384

pixel_10bpc_max: times 2 dw 0x03ff
pixel_12bpc_max: times 2 dw 0x0fff
dconly_10bpc:    times 2 dw 0x7c00
dconly_12bpc:    times 2 dw 0x7000
clip_18b_min: dd -0x20000
clip_18b_max: dd 0x1ffff
clip_20b_min: dd -0x80000
clip_20b_max: dd 0x7ffff
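; (For reference: clip_18b_* and clip_20b_* are the signed 18-bit and 20-bit
; ranges, i.e. +-2^17 and +-2^19, used to clamp intermediate coefficients
; between transform stages.)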

const idct64_mul_16bpc
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406

cextern deint_shuf
cextern idct64_mul
cextern pw_1697x8
cextern pw_1697x16
cextern pw_1567_3784
cextern pw_m1567_m3784
cextern pw_m3784_1567
cextern pw_2896_2896
cextern pw_m2896_2896
cextern pw_5
cextern pw_2048
cextern pw_4096
cextern pw_8192
cextern pw_16384
cextern pw_2896x8
cextern pd_2048

cextern idct_4x8_internal_8bpc_avx2.main
cextern idct_4x16_internal_8bpc_avx2.main
cextern idct_8x8_internal_8bpc_avx2.main
cextern idct_8x16_internal_8bpc_avx2.main
cextern idct_16x4_internal_8bpc_avx2.main
cextern idct_16x8_internal_8bpc_avx2.main
cextern idct_16x16_internal_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal

cextern iadst_4x4_internal_8bpc_avx2.main
cextern iadst_4x8_internal_8bpc_avx2.main_pass2
cextern iadst_4x16_internal_8bpc_avx2.main2
cextern iadst_8x4_internal_8bpc_avx2.main
cextern iadst_8x8_internal_8bpc_avx2.main_pass2
cextern iadst_8x16_internal_8bpc_avx2.main
cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x4_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x16_internal_8bpc_avx2.main
cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end

SECTION .text

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

%macro WRAP_XMM 1+
    INIT_XMM cpuname
    %1
    INIT_YMM cpuname
%endmacro

%macro IWHT4_1D_PACKED 0
    ; m0 = in0 in2, m1 = in1 in3
    psubd m2, m0, m1 ; t2
    paddd xm0, xm1   ; t0
    vpermq m2, m2, q3322
    vpermq m0, m0, q1100
    vpermq m1, m1, q3120
    psubd m3, m0, m2
    psrad m3, 1
    psubd m3, m1 ; t1 t3
    psubd m0, m3 ; ____ out0
    paddd m2, m3 ; out3 ____
%endmacro

INIT_YMM avx2
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
    mova xm0, [cq+16*0]
    vinserti128 m0, [cq+16*2], 1
    mova xm1, [cq+16*1]
    vinserti128 m1, [cq+16*3], 1
    pxor m4, m4
    mova [cq+32*0], m4
    mova [cq+32*1], m4
    lea r6, [dstq+strideq*2]
    psrad m0, 2
    psrad m1, 2
    IWHT4_1D_PACKED
    punpckhdq m0, m3
    punpckldq m3, m2
    punpckhqdq m1, m0, m3
    punpcklqdq m0, m3
    IWHT4_1D_PACKED
    vpblendd m0, m2, 0x33
    packssdw m0, m3
    vextracti128 xm2, m0, 1
    punpckhdq xm1, xm0, xm2 ; out2 out1
    punpckldq xm0, xm2      ; out3 out0
    movq xm2, [r6 +strideq*1]
    movhps xm2, [dstq+strideq*0]
    movq xm3, [r6 +strideq*0]
    movhps xm3, [dstq+strideq*1]
%ifidn bdmaxd, bdmaxm
    movd xm5, bdmaxd
    vpbroadcastw xm5, xm5
%else ; win64: load from stack
    vpbroadcastw xm5, bdmaxm
%endif
    paddsw xm0, xm2
    paddsw xm1, xm3
    pmaxsw xm0, xm4
    pmaxsw xm1, xm4
    pminsw xm0, xm5
    pminsw xm1, xm5
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movq [r6 +strideq*0], xm1
    movq [r6 +strideq*1], xm0
    RET

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 1 = packed, 2 = inv_dst2
; skip round/shift if rnd is not a number
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
    pmulld m%4, m%1, m%8
    pmulld m%3, m%2, m%8
%else
%if %9 & 1
    vbroadcasti128 m%3, [pd_%8]
%else
    vpbroadcastd m%3, [pd_%8]
%endif
    pmulld m%4, m%1, m%3
    pmulld m%3, m%2
%endif
%if %7 < 32
    pmulld m%1, m%7
    pmulld m%2, m%7
%else
%if %9 & 1
    vbroadcasti128 m%5, [pd_%7]
%else
    vpbroadcastd m%5, [pd_%7]
%endif
    pmulld m%1, m%5
    pmulld m%2, m%5
%endif
%if %9 & 2
    psubd m%4, m%6, m%4
    psubd m%2, m%4, m%2
%else
%ifnum %6
    paddd m%4, m%6
%endif
    paddd m%2, m%4
%endif
%ifnum %6
    paddd m%1, m%6
%endif
    psubd m%1, m%3
%ifnum %6
    psrad m%2, 12
    psrad m%1, 12
%endif
%endmacro
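; Worked example of the formula above with the idct4 pair coef1=1567,
; coef2=3784 and rnd=2048:
;   dst1 = (src1*1567 - src2*3784 + 2048) >> 12
;   dst2 = (src1*3784 + src2*1567 + 2048) >> 12
; i.e. a fixed-point rotation by the AV1 12-bit cos/sin constants.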

%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
    %define %%p1 m(i%1_%4_internal_%5bpc)
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
%ifidn %1_%2, dct_dct
    test eobd, eobd
    jnz %%p1
%else
%if %3
    add eobd, %3
%endif
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro

%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
    INV_TXFM_FN %1, %2, 0, 4x4, %3
%ifidn %1_%2, dct_dct
    vpbroadcastd xm2, [dconly_%3bpc]
%if %3 = 10
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 4
.dconly2:
    add r6d, 128
    sar r6d, 8
.dconly3:
    imul r6d, 181
    add r6d, 2176
    sar r6d, 12
    movd xm0, r6d
    paddsw xm0, xm2
    vpbroadcastw xm0, xm0
.dconly_loop:
    movq xm1, [dstq+strideq*0]
    movhps xm1, [dstq+strideq*1]
    paddsw xm1, xm0
    psubusw xm1, xm2
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    lea dstq, [dstq+strideq*2]
    sub r3d, 2
    jg .dconly_loop
    WRAP_XMM RET
%else
    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
%endif
%endif
%endmacro

%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
    ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
    punpckhqdq m%3, m%2, m%1 ; t3 t2
    punpcklqdq m%2, m%1      ; t0 t1
    paddd m%1, m%2, m%3      ; out0 out1
    psubd m%2, m%3           ; out3 out2
%endmacro

%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
    vpbroadcastd m%5, [pw_m3784_1567]
    punpckhwd m%3, m%2, m%1
    vpbroadcastd m%4, [pw_1567_3784]
    punpcklwd m%2, m%1
    vpbroadcastd m%1, [pw_m2896_2896]
    pmaddwd m%5, m%3
    pmaddwd m%3, m%4
    vpbroadcastd m%4, [pw_2896_2896]
    pmaddwd m%1, m%2
    pmaddwd m%2, m%4
    REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
    REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
    packssdw m%3, m%5 ; t3 t2
    packssdw m%2, m%1 ; t0 t1
    paddsw m%1, m%2, m%3 ; out0 out1
    psubsw m%2, m%3      ; out3 out2
%endmacro
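; For reference, the two packed macros above implement the 4-point inverse DCT:
;   t0 = (in0 + in2) * 2896 >> 12,    t1 = (in0 - in2) * 2896 >> 12
;   t2 = (in1*1567 - in3*3784) >> 12, t3 = (in1*3784 + in3*1567) >> 12
;   out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3
; (rounding comes from the rnd argument; the _WORD variant operates on int16).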

INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst

cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    call .main
    vbroadcasti128 m2, [idct4_shuf]
    packssdw m0, m1
    pshufb m0, m2
    jmp tx2q
.pass2:
    vextracti128 xm1, m0, 1
    WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
    packssdw xm5, xm5 ; pw_2048
    pmulhrsw xm0, xm5
    pmulhrsw xm1, xm5
    movq xm2, [dstq+strideq*0]
    movhps xm2, [dstq+strideq*1]
    lea r6, [dstq+strideq*2]
    movq xm3, [r6 +strideq*1]
    movhps xm3, [r6 +strideq*0]
    vpbroadcastd xm5, [pixel_10bpc_max]
    pxor m4, m4
    mova [cq+32*0], m4
    mova [cq+32*1], m4
    paddw xm0, xm2
    paddw xm1, xm3
    pmaxsw xm0, xm4
    pmaxsw xm1, xm4
    pminsw xm0, xm5
    pminsw xm1, xm5
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movhps [r6 +strideq*0], xm1
    movq [r6 +strideq*1], xm1
    RET
ALIGN function_align
.main:
    vpermq m0, [cq+32*0], q3120
    vpermq m1, [cq+32*1], q3120
    vpbroadcastd m5, [pd_2048]
.main2:
    IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
    ret

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

%macro IADST4_1D 0
    vpbroadcastd m5, [pd_1321]
    vpbroadcastd m7, [pd_2482]
    pmulld m4, m0, m5 ; 1321*in0
    pmulld m6, m3, m7 ; 2482*in3
    paddd m4, m6      ; 1321*in0 + 2482*in3
    pmulld m6, m0, m7 ; 2482*in0
    paddd m0, m3      ; in0 + in3
    paddd m7, m5      ; pd_3803
    pmulld m5, m2     ; 1321*in2
    pmulld m3, m7     ; 3803*in3
    pmulld m7, m2     ; 3803*in2
    psubd m2, m0      ; in2 - in0 - in3
    vpbroadcastd m0, [pd_m3344]
    pmulld m1, m0     ; -t3
    pmulld m2, m0     ; out2 (unrounded)
    psubd m6, m5      ; 2482*in0 - 1321*in2
    paddd m4, m7      ; t0
    psubd m6, m3      ; t1
    paddd m3, m4, m6
    psubd m4, m1      ; out0 (unrounded)
    psubd m6, m1      ; out1 (unrounded)
    paddd m3, m1      ; out3 (unrounded)
%endmacro
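; Expanded, the unrounded outputs of IADST4_1D above are:
;   out0 = 1321*in0 + 3803*in2 + 2482*in3 + 3344*in1
;   out1 = 2482*in0 - 1321*in2 - 3803*in3 + 3344*in1
;   out2 = 3344*(in0 - in2 + in3)
;   out3 = 3803*in0 + 2482*in2 - 1321*in3 - 3344*in1
; the callers then add the rounding constant and shift.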

cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    call .main
    vinserti128 m0, m4, xm6, 1
    vinserti128 m1, m2, xm3, 1
.pass1_end:
    vpbroadcastd m5, [pd_2048]
    mova m2, [itx4_shuf]
    paddd m0, m5
    paddd m1, m5
    psrad m0, 12
    psrad m1, 12
    packssdw m0, m1
    vpermd m0, m2, m0
    psrld m2, 4
    pshufb m0, m2
%if WIN64
    movaps xmm6, [rsp+ 8]
    movaps xmm7, [rsp+24]
%endif
    jmp tx2q
.pass2:
    lea r6, [deint_shuf+128]
    vextracti128 xm1, m0, 1
    call m(iadst_4x4_internal_8bpc).main
.end:
    vpbroadcastd xm4, [pw_2048]
    movq xm2, [dstq+strideq*0]
    movhps xm2, [dstq+strideq*1]
    lea r6, [dstq+strideq*2]
    movq xm3, [r6 +strideq*0]
    movhps xm3, [r6 +strideq*1]
    vpbroadcastd xm5, [pixel_10bpc_max]
    pmulhrsw xm0, xm4
    pmulhrsw xm1, xm4
    pxor m4, m4
    mova [cq+32*0], m4
    mova [cq+32*1], m4
    paddw xm0, xm2
    paddw xm1, xm3
    pmaxsw xm0, xm4
    pmaxsw xm1, xm4
    pminsw xm0, xm5
    pminsw xm1, xm5
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [r6 +strideq*0], xm1
    movhps [r6 +strideq*1], xm1
    RET
ALIGN function_align
.main:
    mova xm0, [cq+16*0]
    mova xm1, [cq+16*1]
    mova xm2, [cq+16*2]
    mova xm3, [cq+16*3]
%if WIN64
    movaps [rsp+16], xmm6
    movaps [rsp+32], xmm7
%endif
.main2:
    WRAP_XMM IADST4_1D
    ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    call m(iadst_4x4_internal_10bpc).main
    vinserti128 m0, m3, xm2, 1
    vinserti128 m1, m6, xm4, 1
    jmp m(iadst_4x4_internal_10bpc).pass1_end
.pass2:
    lea r6, [deint_shuf+128]
    vextracti128 xm1, m0, 1
    call m(iadst_4x4_internal_8bpc).main
    vpbroadcastd xm4, [pw_2048]
    movq xm3, [dstq+strideq*1]
    movhps xm3, [dstq+strideq*0]
    lea r6, [dstq+strideq*2]
    movq xm2, [r6 +strideq*1]
    movhps xm2, [r6 +strideq*0]
    vpbroadcastd xm5, [pixel_10bpc_max]
    pmulhrsw xm0, xm4
    pmulhrsw xm1, xm4
    pxor m4, m4
    mova [cq+32*0], m4
    mova [cq+32*1], m4
    paddw xm0, xm2
    paddw xm1, xm3
    pmaxsw xm0, xm4
    pmaxsw xm1, xm4
    pminsw xm0, xm5
    pminsw xm1, xm5
    movhps [dstq+strideq*0], xm1
    movq [dstq+strideq*1], xm1
    movhps [r6 +strideq*0], xm0
    movq [r6 +strideq*1], xm0
    RET

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    vpbroadcastd m1, [pd_5793]
    pmulld m0, m1, [cq+32*0]
    pmulld m1, [cq+32*1]
    vpbroadcastd m5, [pd_2048]
    mova m3, [itx4_shuf]
    paddd m0, m5
    paddd m1, m5
    psrad m0, 12
    psrad m1, 12
    packssdw m0, m1
    vpermd m0, m3, m0
    psrld m3, 4
    pshufb m0, m3
    jmp tx2q
.pass2:
    vpbroadcastd m1, [pw_1697x8]
    movq xm2, [dstq+strideq*0]
    movhps xm2, [dstq+strideq*1]
    lea r6, [dstq+strideq*2]
    pmulhrsw m1, m0
    paddsw m0, m1
    movq xm3, [r6 +strideq*0]
    movhps xm3, [r6 +strideq*1]
    vpbroadcastd xm4, [pixel_10bpc_max]
    packssdw m5, m5 ; pw_2048
    pmulhrsw m0, m5
    pxor m5, m5
    mova [cq+32*0], m5
    mova [cq+32*1], m5
    vextracti128 xm1, m0, 1
    paddw xm0, xm2
    paddw xm1, xm3
    pmaxsw xm0, xm5
    pmaxsw xm1, xm5
    pminsw xm0, xm4
    pminsw xm1, xm4
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [r6 +strideq*0], xm1
    movhps [r6 +strideq*1], xm1
    RET

INV_TXFM_4X4_FN dct, dct, 12
INV_TXFM_4X4_FN dct, identity, 12
INV_TXFM_4X4_FN dct, adst, 12
INV_TXFM_4X4_FN dct, flipadst, 12

cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    call m(idct_4x4_internal_10bpc).main
    mova m3, [idct4_12_shuf]
    mova m4, [idct4_12_shuf2]
    vpermd m2, m4, m1
    vpermd m1, m3, m0
    jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
    vpbroadcastd m5, [pd_2048]
    vpermq m0, m0, q3120
    vpermq m1, m1, q3120
    call m(idct_4x4_internal_10bpc).main2
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    jmp m(iadst_4x4_internal_12bpc).end

INV_TXFM_4X4_FN adst, dct, 12
INV_TXFM_4X4_FN adst, adst, 12
INV_TXFM_4X4_FN adst, flipadst, 12
INV_TXFM_4X4_FN adst, identity, 12

cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    call m(iadst_4x4_internal_10bpc).main
    vinserti128 m1, m4, xm6, 1
    vinserti128 m2, xm3, 1
.pass1_end:
    mova m3, [itx4_shuf]
    vpbroadcastd m5, [pd_1024]
    psrad m1, 1
    psrad m2, 1
    vpermd m1, m3, m1
    vpermd m2, m3, m2
    paddd m1, m5
    paddd m2, m5
    psrad m1, 11
    psrad m2, 11
.pass1_end2:
    vpbroadcastd m3, [clip_18b_min]
    vpbroadcastd m4, [clip_18b_max]
    punpcklqdq m0, m1, m2
    punpckhqdq m1, m2
    pmaxsd m0, m3
    pmaxsd m1, m3
    pminsd m0, m4
    pminsd m1, m4
    jmp tx2q
.pass2:
    call .main_pass2
    vinserti128 m0, m4, xm6, 1
    vinserti128 m1, m2, xm3, 1
.pass2_end:
    vpbroadcastd m5, [pd_2048]
    paddd m0, m5
    paddd m1, m5
    psrad m0, 12
    psrad m1, 12
.end:
%if WIN64
    WIN64_RESTORE_XMM_INTERNAL
    %assign xmm_regs_used 6
%endif
.end2:
    vpbroadcastd m4, [pw_16384]
    movq xm2, [dstq+strideq*0]
    movq xm3, [dstq+strideq*1]
    lea r6, [dstq+strideq*2]
    movhps xm2, [r6 +strideq*0] ; dst0 dst2
    movhps xm3, [r6 +strideq*1] ; dst1 dst3
    vpbroadcastd m5, [pixel_12bpc_max]
    vinserti128 m2, xm3, 1
    psrad m0, 3
    psrad m1, 3
    packssdw m0, m1 ; t0 t2 t1 t3
    pmulhrsw m0, m4
    pxor m4, m4
    mova [cq+32*0], m4
    mova [cq+32*1], m4
    paddw m0, m2 ; out0 out2 out1 out3
    pmaxsw m0, m4
    pminsw m0, m5
    vextracti128 xm1, m0, 1 ; out1 out3
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [r6 +strideq*0], xm0
    movhps [r6 +strideq*1], xm1
    RET
.main_pass2:
    vextracti128 xm3, m1, 1
    mova xm2, xm1
    vextracti128 xm1, m0, 1
    jmp m(iadst_4x4_internal_10bpc).main2

INV_TXFM_4X4_FN flipadst, dct, 12
INV_TXFM_4X4_FN flipadst, adst, 12
INV_TXFM_4X4_FN flipadst, flipadst, 12
INV_TXFM_4X4_FN flipadst, identity, 12

cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    call m(iadst_4x4_internal_10bpc).main
    vinserti128 m1, m3, xm2, 1
    vinserti128 m2, m6, xm4, 1
    jmp m(iadst_4x4_internal_12bpc).pass1_end
.pass2:
    call m(iadst_4x4_internal_12bpc).main_pass2
    vinserti128 m0, m3, xm2, 1
    vinserti128 m1, m6, xm4, 1
    jmp m(iadst_4x4_internal_12bpc).pass2_end

INV_TXFM_4X4_FN identity, dct, 12
INV_TXFM_4X4_FN identity, adst, 12
INV_TXFM_4X4_FN identity, flipadst, 12
INV_TXFM_4X4_FN identity, identity, 12

cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    mova m2, [itx4_shuf]
    vpbroadcastd m3, [pd_1697]
    vpermd m0, m2, [cq+32*0]
    vpermd m2, m2, [cq+32*1]
    vpbroadcastd m5, [pd_2048]
    pmulld m1, m3, m0
    pmulld m3, m2
    paddd m1, m5
    paddd m3, m5
    psrad m1, 12
    psrad m3, 12
    paddd m1, m0
    paddd m2, m3
    jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
    vpbroadcastd m3, [pd_5793]
    vpbroadcastd m5, [pd_2048]
    pmulld m0, m3
    pmulld m1, m3
    paddd m0, m5 ; 2048
    paddd m1, m5
    psrad m0, 12
    psrad m1, 12
    jmp m(iadst_4x4_internal_12bpc).end

%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
    INV_TXFM_FN %1, %2, 0, 4x8, %3
%ifidn %1_%2, dct_dct
    vpbroadcastd xm2, [dconly_%3bpc]
%if %3 = 10
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 8
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
%else
    jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
%endif
%endif
%endmacro

%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
    ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
    vpbroadcastd m%5, [pd_2896]
    pmulld m%1, m%5
    pmulld m%3, m%5
    paddd m%1, m%8
    paddd m%5, m%1, m%3
    psubd m%1, m%3
    psrad m%5, 12 ; t0
    psrad m%1, 12 ; t1
    psubd m%3, m%1, m%2
    paddd m%2, m%1
    paddd m%1, m%5, m%4
    psubd m%4, m%5, m%4
%endmacro

INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
.pass1:
    vpbroadcastd m3, [pd_2896]
    pmulld m0, m3, [cq+32*0]
    pmulld m1, m3, [cq+32*1]
    pmulld m2, m3, [cq+32*2]
    pmulld m3, m3, [cq+32*3]
    vpbroadcastd m7, [pd_2048]
    REPX {paddd x, m7}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
    jmp tx2q
.pass2:
    packssdw m0, m2
    packssdw m1, m3
    lea r6, [deint_shuf+128]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhdq m1, m0, m2 ; 2 3
    punpckldq m0, m2     ; 0 1
    vextracti128 xm2, m0, 1 ; 4 5
    vextracti128 xm3, m1, 1 ; 6 7
    call m(idct_4x8_internal_8bpc).main
    vpbroadcastd xm4, [pw_2048]
    REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
    lea r3, [strideq*3]
    lea r6, [dstq+strideq*4]
    movq xm4, [dstq+strideq*0]
    movhps xm4, [dstq+strideq*1]
    movq xm5, [dstq+r3 ]
    movhps xm5, [dstq+strideq*2]
    movq xm6, [r6 +strideq*0]
    movhps xm6, [r6 +strideq*1]
    movq xm7, [r6 +r3 ]
    movhps xm7, [r6 +strideq*2]
    paddw xm0, xm4 ; 0 1
    paddw xm1, xm5 ; 3 2
    paddw xm2, xm6 ; 4 5
    paddw xm3, xm7 ; 7 6
    vpbroadcastd xm5, [pixel_10bpc_max]
    pxor m4, m4
    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
    REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movhps [dstq+strideq*2], xm1
    movq [dstq+r3 ], xm1
    movq [r6 +strideq*0], xm2
    movhps [r6 +strideq*1], xm2
    movhps [r6 +strideq*2], xm3
    movq [r6 +r3 ], xm3
    RET

INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
    call m(iadst_8x4_internal_10bpc).main
    vpbroadcastd m5, [pd_2048]
    paddd m0, m5, m4
    paddd m1, m5, m6
    paddd m2, m5
    paddd m3, m5
.pass1_end:
    REPX {psrad x, 12}, m0, m1, m2, m3
    jmp tx2q
.pass2:
    call .pass2_main
    mova xm4, [pw_2048_m2048]
    REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
.end:
    lea r3, [strideq*3]
    lea r6, [dstq+strideq*4]
    movq xm4, [dstq+strideq*0]
    movhps xm4, [dstq+strideq*1]
    movq xm5, [dstq+strideq*2]
    movhps xm5, [dstq+r3 ]
    movq xm6, [r6 +strideq*0]
    movhps xm6, [r6 +strideq*1]
    movq xm7, [r6 +strideq*2]
    movhps xm7, [r6 +r3 ]
    paddw xm0, xm4 ; 0 1
    paddw xm1, xm5 ; 2 3
    paddw xm2, xm6 ; 4 5
    paddw xm3, xm7 ; 6 7
    vpbroadcastd xm5, [pixel_10bpc_max]
    pxor m4, m4
    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
    REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r3 ], xm1
    movq [r6 +strideq*0], xm2
    movhps [r6 +strideq*1], xm2
    movq [r6 +strideq*2], xm3
    movhps [r6 +r3 ], xm3
    RET
ALIGN function_align
.pass2_main:
    packssdw m0, m2
    packssdw m1, m3
    lea r6, [deint_shuf+128]
    punpcklwd m4, m0, m1
    punpckhwd m0, m1
    punpckhdq m5, m4, m0
    punpckldq m4, m0
    vextracti128 xm2, m4, 1 ; 4 5
    vextracti128 xm3, m5, 1 ; 6 7
    pshufd xm4, xm4, q1032 ; 1 0
    pshufd xm5, xm5, q1032 ; 3 2
    jmp m(iadst_4x8_internal_8bpc).main_pass2
ALIGN function_align
.main:
    vpbroadcastd m8, [clip_18b_min]
    vpbroadcastd m9, [clip_18b_max]
.main2:
    vbroadcasti128 m0, [cq+16*0]
    vbroadcasti128 m2, [cq+16*2]
    vbroadcasti128 m3, [cq+16*5]
    vbroadcasti128 m1, [cq+16*7]
    vpbroadcastd m6, [pd_2896]
    shufpd m0, m2, 0x0c ; 0 2
    shufpd m1, m3, 0x0c ; 7 5
    vbroadcasti128 m2, [cq+16*4]
    vbroadcasti128 m4, [cq+16*6]
    vbroadcasti128 m5, [cq+16*1]
    vbroadcasti128 m3, [cq+16*3]
    vpbroadcastd m7, [pd_2048]
    shufpd m2, m4, 0x0c ; 4 6
    shufpd m3, m5, 0x0c ; 3 1
    REPX {pmulld x, m6}, m0, m1, m2, m3
    REPX {paddd x, m7}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
.main3:
    ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
    ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
    psubd m4, m0, m2 ; t4 t6
    paddd m0, m2     ; t0 t2
    psubd m2, m1, m3 ; t5 t7
    paddd m1, m3     ; t1 t3
    REPX {pmaxsd x, m8}, m4, m2, m0, m1
    REPX {pminsd x, m9}, m4, m2, m0, m1
    pxor m5, m5
    psubd m5, m4
    vpblendd m4, m2, 0xcc ; t4 t7
    vpblendd m2, m5, 0xcc ; t5 -t6
    ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
    vpbroadcastd m5, [pd_2896]
    vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
    punpckhqdq m3, m0, m1
    punpcklqdq m0, m1
    psubd m1, m0, m3 ; t2 t3
    paddd m0, m3     ; out0 -out7
    punpckhqdq m3, m4, m2 ; t7a t6a
    punpcklqdq m4, m2     ; t5a t4a
    psubd m2, m4, m3 ; t7 t6
    paddd m4, m3     ; out6 -out1
    REPX {pmaxsd x, m8}, m1, m2
    REPX {pminsd x, m9}, m1, m2
    vpblendd m3, m1, m2, 0xcc
    shufpd m1, m2, 0x05
    pmulld m3, m5
    pmulld m5, m1
    psignd m0, m6 ; out0 out7
    psignd m4, m6 ; out6 out1
    paddd m3, m7
    psubd m2, m3, m5
    paddd m5, m3
    psrad m2, 12 ; out4 -out5
    psrad m5, 12 ; -out3 out2
    ret

INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
    call m(iadst_8x4_internal_10bpc).main
    vpbroadcastd m5, [pd_2048]
    paddd m0, m5, m3
    paddd m1, m5, m2
    paddd m2, m5, m6
    paddd m3, m5, m4
    jmp m(iadst_4x8_internal_10bpc).pass1_end
.pass2:
    call m(iadst_4x8_internal_10bpc).pass2_main
    mova xm4, [pw_2048_m2048]
    REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
    lea r3, [strideq*3]
    lea r6, [dstq+strideq*4]
    movq xm4, [dstq+strideq*1]
    movhps xm4, [dstq+strideq*0]
    movq xm5, [dstq+r3 ]
    movhps xm5, [dstq+strideq*2]
    movq xm6, [r6 +strideq*1]
    movhps xm6, [r6 +strideq*0]
    movq xm7, [r6 +r3 ]
    movhps xm7, [r6 +strideq*2]
    paddw xm3, xm4 ; 1 0
    paddw xm2, xm5 ; 3 2
    paddw xm1, xm6 ; 5 4
    paddw xm0, xm7 ; 7 6
    vpbroadcastd xm5, [pixel_10bpc_max]
    pxor m4, m4
    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
    REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
    movhps [dstq+strideq*0], xm3
    movq [dstq+strideq*1], xm3
    movhps [dstq+strideq*2], xm2
    movq [dstq+r3 ], xm2
    movhps [r6 +strideq*0], xm1
    movq [r6 +strideq*1], xm1
    movhps [r6 +strideq*2], xm0
    movq [r6 +r3 ], xm0
    RET

INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
.pass1:
    vpbroadcastd m3, [pd_2896]
    pmulld m0, m3, [cq+32*0]
    pmulld m1, m3, [cq+32*1]
    pmulld m2, m3, [cq+32*2]
    pmulld m3, [cq+32*3]
    vpbroadcastd m5, [pd_2048]
    vpbroadcastd m4, [pd_5793]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    REPX {pmulld x, m4}, m0, m1, m2, m3
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    jmp tx2q
.pass2:
    vpbroadcastd m6, [pixel_10bpc_max]
    call .pass2_end
    RET
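; Note on .pass2_end below: pw_4096 appears to fold the identity8 gain of 2
; into the final rounding multiply (2/16 == 4096/32768), replacing the plain
; pw_2048 (1/16) used by the other 4x8 second passes.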
ALIGN function_align
.pass2_end:
    vpbroadcastd m4, [pw_4096]
    packssdw m0, m2
    packssdw m1, m3
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    pmulhrsw m2, m4
    pmulhrsw m0, m4
    punpckhdq m1, m0, m2 ; 2 3 6 7
    punpckldq m0, m2     ; 0 1 4 5
    lea r3, [strideq*3]
    lea r6, [dstq+strideq*4]
    movq xm2, [dstq+strideq*0]
    movhps xm2, [dstq+strideq*1]
    vpbroadcastq m4, [r6 +strideq*0]
    vpbroadcastq m5, [r6 +strideq*1]
    movq xm3, [dstq+strideq*2]
    movhps xm3, [dstq+r3 ]
    vpblendd m2, m4, 0x30
    vpblendd m2, m5, 0xc0
    vpbroadcastq m4, [r6 +strideq*2]
    vpbroadcastq m5, [r6 +r3 ]
    vpblendd m3, m4, 0x30
    vpblendd m3, m5, 0xc0
    pxor m4, m4
    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    paddw m0, m2 ; out0 out1 out4 out5
    paddw m1, m3 ; out2 out3 out6 out7
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m6
    pminsw m1, m6
    vextracti128 xm2, m0, 1 ; out4 out5
    vextracti128 xm3, m1, 1 ; out6 out7
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r3 ], xm1
    movq [r6 +strideq*0], xm2
    movhps [r6 +strideq*1], xm2
    movq [r6 +strideq*2], xm3
    movhps [r6 +r3 ], xm3
    ret

INV_TXFM_4X8_FN dct, dct, 12
INV_TXFM_4X8_FN dct, identity, 12
INV_TXFM_4X8_FN dct, adst, 12
INV_TXFM_4X8_FN dct, flipadst, 12

cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
    jmp m(idct_4x8_internal_10bpc).pass1
.pass2:
    vpbroadcastd m8, [clip_18b_min]
    vpbroadcastd m9, [clip_18b_max]
    REPX {pmaxsd x, m8}, m0, m1, m2, m3
    REPX {pminsd x, m9}, m0, m1, m2, m3
    ; transpose & interleave
    pshufd m0, m0, q1320
    pshufd m1, m1, q1320
    pshufd m2, m2, q1320
    pshufd m3, m3, q1320
    punpckldq m4, m0, m1
    punpckhdq m0, m1
    punpckldq m5, m2, m3
    punpckhdq m2, m3
    vpermq m0, m0, q3102
    vpermq m2, m2, q3102
    vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved)
    vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved)
    vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
    vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
    vpbroadcastd m7, [pd_2048]
    call m(idct_8x4_internal_10bpc).main
    psubd m3, m0, m4 ; out7 out6
    paddd m0, m4     ; out0 out1
    paddd m1, m2, m5 ; out3 out2
    psubd m2, m5     ; out4 out5
    pshufd m1, m1, q1032
    pshufd m3, m3, q1032
    jmp m(iadst_4x8_internal_12bpc).end

INV_TXFM_4X8_FN adst, dct, 12
INV_TXFM_4X8_FN adst, adst, 12
INV_TXFM_4X8_FN adst, flipadst, 12
INV_TXFM_4X8_FN adst, identity, 12

cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
    call m(iadst_8x4_internal_10bpc).main
    psrad m0, m4, 1
    psrad m1, m6, 1
    psrad m2, 1
    psrad m3, 1
.pass1_end:
    vpbroadcastd m5, [pd_1024]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 11}, m0, m1, m2, m3
    jmp tx2q
.pass2:
    vpbroadcastd m8, [clip_18b_min]
    vpbroadcastd m9, [clip_18b_max]
    REPX {pmaxsd x, m8}, m0, m1, m2, m3
    REPX {pminsd x, m9}, m0, m1, m2, m3
    call .pass2_main
    vpblendd m3, m0, m4, 0x33 ; out6 out7
    vpblendd m0, m4, 0xcc     ; out0 out1
    pshufd m1, m5, q1032
    psignd m2, m6 ; out4 out5
    psignd m1, m6 ; out2 out3
.end:
    vpbroadcastd m4, [pw_16384]
    REPX {psrad x, 3}, m0, m1, m2, m3
    packssdw m0, m2 ; 0 1 4 5 (interleaved)
    packssdw m1, m3 ; 2 3 6 7 (interleaved)
    mova m2, [iadst8_12_shuf]
    vpermd m0, m2, m0 ; 0 1 4 5
    vpermd m1, m2, m1 ; 2 3 6 7
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    lea r3, [strideq*3]
    lea r6, [dstq+strideq*4]
    movq xm4, [dstq+strideq*0]
    movhps xm4, [dstq+strideq*1]
    movq xm5, [dstq+strideq*2]
    movhps xm5, [dstq+r3 ]
    movq xm6, [r6 +strideq*0]
    movhps xm6, [r6 +strideq*1]
    vinserti128 m4, xm6, 1
    movq xm7, [r6 +strideq*2]
    movhps xm7, [r6 +r3 ]
    vinserti128 m5, xm7, 1
    paddw m0, m4 ; 0 1 4 5
    paddw m1, m5 ; 2 3 6 7
    vpbroadcastd m5, [pixel_12bpc_max]
    pxor m4, m4
    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    REPX {pmaxsw x, m4}, m0, m1
    REPX {pminsw x, m5}, m0, m1
    vextracti128 xm2, m0, 1 ; out4 out5
    vextracti128 xm3, m1, 1 ; out6 out7
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r3 ], xm1
    movq [r6 +strideq*0], xm2
    movhps [r6 +strideq*1], xm2
    movq [r6 +strideq*2], xm3
    movhps [r6 +r3 ], xm3
    RET
ALIGN function_align
.pass2_main:
    ; transpose & interleave
    pshufd m0, m0, q1320
    pshufd m1, m1, q1320
    pshufd m2, m2, q1320
    pshufd m3, m3, q1320
    punpckldq m4, m0, m1
    punpckhdq m0, m1
    punpckldq m5, m2, m3
    punpckhdq m2, m3
    vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved)
    vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved)
    vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
    vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
    vpbroadcastd m7, [pd_2048]
    jmp m(iadst_4x8_internal_10bpc).main3

INV_TXFM_4X8_FN flipadst, dct, 12
INV_TXFM_4X8_FN flipadst, adst, 12
INV_TXFM_4X8_FN flipadst, flipadst, 12
INV_TXFM_4X8_FN flipadst, identity, 12

cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
    call m(iadst_8x4_internal_10bpc).main
    psrad m0, m3, 1
    psrad m1, m2, 1
    psrad m2, m6, 1
    psrad m3, m4, 1
    jmp m(iadst_4x8_internal_12bpc).pass1_end
.pass2:
    vpbroadcastd m8, [clip_18b_min]
    vpbroadcastd m9, [clip_18b_max]
    REPX {pmaxsd x, m8}, m0, m1, m2, m3
    REPX {pminsd x, m9}, m0, m1, m2, m3
    call m(iadst_4x8_internal_12bpc).pass2_main
    shufpd m3, m4, m0, 0x05 ; out1 out0
    shufpd m0, m4, 0x05     ; out7 out6
    psignd m2, m6
    pshufd m6, m6, q1032
    pshufd m1, m2, q1032 ; out5 out4
    psignd m2, m5, m6    ; out3 out2
    jmp m(iadst_4x8_internal_12bpc).end

INV_TXFM_4X8_FN identity, dct, 12
INV_TXFM_4X8_FN identity, adst, 12
INV_TXFM_4X8_FN identity, flipadst, 12
INV_TXFM_4X8_FN identity, identity, 12

cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
    jmp m(iidentity_4x8_internal_10bpc).pass1
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
    ; m2 = in4 in5
    ; m3 = in6 in7
    vpbroadcastd m6, [pixel_12bpc_max]
    call m(iidentity_4x8_internal_10bpc).pass2_end
    RET

%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
    INV_TXFM_FN %1, %2, 0, 4x16, %3
%ifidn %1_%2, dct_dct
    imul r6d, [cq], 181
    vpbroadcastd xm2, [dconly_%3bpc]
    mov [cq], eobd ; 0
    or r3d, 16
    add r6d, 384
    sar r6d, 9
    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
%endif
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
    vpbroadcastd m10, [pd_3072]
    mova m1, [cq+32*2]
    mova m3, [cq+32*6]
    mova m5, [cq+32*3]
    mova m7, [cq+32*7]
    call .pass1_main
    pmulld m0, m6, [cq+32*0]
    pmulld m2, m6, [cq+32*4]
    pmulld m4, m6, [cq+32*1]
    pmulld m6, [cq+32*5]
    call .pass1_main2
    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
    jmp tx2q
.pass2:
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    lea r6, [deint_shuf+128]
    punpcklwd m4, m2, m3
    punpckhwd m2, m3
    punpckhwd m5, m0, m1
    punpcklwd m0, m1
    punpckhdq m1, m0, m4 ; 2 3
    punpckldq m0, m4     ; 0 1
    punpckldq m4, m5, m2 ; 8 9
    punpckhdq m5, m2     ; a b
    vextracti128 xm2, m0, 1 ; 4 5
    vextracti128 xm3, m1, 1 ; 6 7
    vextracti128 xm6, m4, 1 ; c d
    vextracti128 xm7, m5, 1 ; e f
    call m(idct_4x16_internal_8bpc).main
    vpbroadcastd m9, [pw_2048]
    vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
    vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
    vinserti128 m2, m4, xm5, 1 ; 8 9 b a
    vinserti128 m3, m6, xm7, 1 ; c d f e
    vpbroadcastd m8, [pixel_10bpc_max]
    call .pass2_end
    RET
ALIGN function_align
.pass1_main:
    vpbroadcastd m4, [pd_3784]
    vpbroadcastd m8, [pd_1567]
    vpbroadcastd m9, [pd_2048]
    vpbroadcastd m6, [pd_1448]
    ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
    ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
    ret
ALIGN function_align
.pass1_main2:
    paddd m0, m10
    paddd m4, m10
    paddd m8, m0, m2
    psubd m0, m2
    paddd m9, m4, m6
    psubd m4, m6
    REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
    psubd m2, m0, m1
    paddd m1, m0
    psubd m6, m4, m5
    paddd m5, m4
    paddd m0, m8, m3
    psubd m3, m8, m3
    paddd m4, m9, m7
    psubd m7, m9, m7
    ret
ALIGN function_align
.pass2_end:
    lea r6, [strideq*3]
    pxor m7, m7
    pmulhrsw m0, m9
    call .write_4x4
    pmulhrsw m0, m1, m9
    call .write_4x4
    pmulhrsw m0, m2, m9
    call .write_4x4
    pmulhrsw m0, m3, m9
    call .write_4x4
    ret
ALIGN function_align
.write_4x4:
    movq xm4, [dstq+strideq*0]
    movhps xm4, [dstq+strideq*1]
    vpbroadcastq m5, [dstq+strideq*2]
    vpbroadcastq m6, [dstq+r6 ]
    mova [cq+32*0], m7
    mova [cq+32*1], m7
    add cq, 32*2
    vpblendd m4, m5, 0xc0
    vpblendd m4, m6, 0x30
    paddw m4, m0
    pmaxsw m4, m7
    pminsw m4, m8
    vextracti128 xm5, m4, 1
    movq [dstq+strideq*0], xm4
    movhps [dstq+strideq*1], xm4
    movhps [dstq+strideq*2], xm5
    movq [dstq+r6 ], xm5
    lea dstq, [dstq+strideq*4]
    ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
    call m(iadst_16x4_internal_10bpc).main
    vpbroadcastd m6, [pd_6144]
    call m(iadst_16x4_internal_10bpc).main_end
    psrad m0, m4, 13
    psrad m1, m5, 13
    psrad m2, 13
    psrad m3, 13
    psrad m4, m8, 13
    psrad m5, m9, 13
    psrad m6, 13
    psrad m7, 13
    jmp tx2q
.pass2:
    call .pass2_main
    vpbroadcastd m5, [pw_2048]
    vpbroadcastd m8, [pixel_10bpc_max]
    lea r6, [strideq*3]
    vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
    pshufd m2, m2, q1032      ; -out11 out8 out10 -out9
    vpblendd m3, m0, 0x33     ; -out15 out12 out14 -out13
    pxor m7, m7
    psubw m9, m7, m5
    vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
    pmulhrsw m0, m4, m9
    call .write_4x4
    pmulhrsw m0, m1, m9
    call .write_4x4
    pmulhrsw m0, m2, m9
    call .write_4x4
    pmulhrsw m0, m3, m9
    call .write_4x4
    RET
ALIGN function_align
.write_4x4:
    movq xm4, [dstq+r6 ]
    movhps xm4, [dstq+strideq*0]
    vpbroadcastq m5, [dstq+strideq*1]
    vpbroadcastq m6, [dstq+strideq*2]
    mova [cq+32*0], m7
    mova [cq+32*1], m7
    add cq, 32*2
    vpblendd m4, m5, 0xc0
    vpblendd m4, m6, 0x30
    paddw m4, m0
    pmaxsw m4, m7
    pminsw m4, m8
    vextracti128 xm5, m4, 1
    movhps [dstq+strideq*0], xm4
    movhps [dstq+strideq*1], xm5
    movq [dstq+strideq*2], xm5
    movq [dstq+r6 ], xm4
    lea dstq, [dstq+strideq*4]
    ret
ALIGN function_align
.pass2_main:
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    lea r6, [deint_shuf+128]
    punpcklwd m4, m2, m3
    punpckhwd m2, m3
    punpckhwd m5, m0, m1
    punpcklwd m0, m1
    punpckhdq m1, m0, m4
    punpckldq m0, m4
    punpckldq m4, m5, m2
    punpckhdq m5, m2
    vpblendd m3, m0, m1, 0x33
    vpblendd m0, m1, 0xcc
    shufpd m2, m5, m4, 0x05
    shufpd m4, m5, 0x05
    vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
    vinserti128 m0, xm3, 1      ; 0 3 2 1
    vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ????
    vinserti128 m2, xm4, 1      ; b 8 9 a
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd m5, [pw_2896x8]
    paddsw m1, m2, m4
    psubsw m2, m4
    pmulhrsw m1, m5 ; -out7 out4 out6 -out5
    pmulhrsw m2, m5 ; out8 -out11 -out9 out10
    ret
ALIGN function_align
.main:
    vbroadcasti128 m0, [cq+16* 0]
    vbroadcasti128 m4, [cq+16* 2]
    vbroadcasti128 m1, [cq+16*15]
    vbroadcasti128 m5, [cq+16*13]
    vbroadcasti128 m2, [cq+16* 4]
    vbroadcasti128 m6, [cq+16* 6]
    vbroadcasti128 m3, [cq+16*11]
    vbroadcasti128 m7, [cq+16* 9]
    shufpd m0, m4, 0x0c ; 0 2
    shufpd m1, m5, 0x0c ; 15 13
    shufpd m2, m6, 0x0c ; 4 6
    shufpd m3, m7, 0x0c ; 11 9
    vbroadcasti128 m4, [cq+16* 8]
    vbroadcasti128 m6, [cq+16*10]
    vbroadcasti128 m5, [cq+16* 7]
    vbroadcasti128 m7, [cq+16* 5]
    shufpd m4, m6, 0x0c ; 8 10
    shufpd m5, m7, 0x0c ; 7 5
    vbroadcasti128 m6, [cq+16*12]
    vbroadcasti128 m7, [cq+16*14]
    shufpd m6, m7, 0x0c ; 12 14
    vbroadcasti128 m7, [cq+16* 3]
    vbroadcasti128 m8, [cq+16* 1]
    shufpd m7, m8, 0x0c ; 3 1
.main2:
    ; expects: m12 = clip_min   m13 = clip_max
    vpbroadcastd m11, [pd_2048]
    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
    psubd m8, m0, m4 ; t8a t10a
    paddd m0, m4     ; t0a t2a
    psubd m4, m1, m5 ; t9a t11a
    paddd m1, m5     ; t1a t3a
    psubd m5, m2, m6 ; t12a t14a
    paddd m2, m6     ; t4a t6a
    psubd m6, m3, m7 ; t13a t15a
    paddd m3, m7     ; t5a t7a
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
    ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
    ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1
    psubd m7, m0, m2 ; t4 t6
    paddd m0, m2     ; t0 t2
    psubd m2, m1, m3 ; t5 t7
    paddd m1, m3     ; t1 t3
    psubd m3, m4, m6 ; t12a t14a
    paddd m4, m6     ; t8a t10a
    psubd m6, m8, m5 ; t13a t15a
    paddd m8, m5     ; t9a t11a
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
    punpcklqdq m5, m3, m7 ; t12a t4
    punpckhqdq m3, m7     ; t14a t6
    punpckhqdq m7, m6, m2 ; t15a t7
    punpcklqdq m6, m2     ; t13a t5
    ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
    ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10
    vpbroadcastd m10, [pd_2896]
    vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
    punpckhqdq m2, m4, m0 ; t10a t2
    punpcklqdq m4, m0     ; t8a t0
    punpckhqdq m0, m8, m1 ; t11a t3
    punpcklqdq m8, m1     ; t9a t1
    paddd m1, m6, m7 ; out2 -out3
    psubd m6, m7     ; t14a t6
    paddd m7, m5, m3 ; -out13 out12
    psubd m5, m3     ; t15a t7
    psubd m3, m8, m0 ; t11 t3a
    paddd m8, m0     ; out14 -out15
    paddd m0, m4, m2 ; -out1 out0
    psubd m4, m2     ; t10 t2a
    REPX {pmaxsd x, m12}, m6, m5, m3, m4
    REPX {pminsd x, m13}, m6, m5, m3, m4
    REPX {pmulld x, m10}, m6, m5, m3, m4
    paddd m6, m11
    paddd m4, m11
    paddd m2, m6, m5 ; -out5 out4
    psubd m6, m5     ; out10 -out11
    psubd m5, m4, m3 ; -out9 out8
    paddd m3, m4     ; out6 -out7
    REPX {psrad x, 12}, m2, m3, m5, m6
    REPX {psignd x, m9}, m1, m8, m3, m6
    pshufd m9, m9, q1032
    REPX {psignd x, m9}, m0, m7, m2, m5
    ret

INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
    call m(iadst_16x4_internal_10bpc).main
    vpbroadcastd m6, [pd_6144]
    call m(iadst_16x4_internal_10bpc).main_end
    psrad m0, m3, 13
    psrad m1, m2, 13
    psrad m2, m5, 13
    psrad m3, m4, 13
    psrad m4, m7, 13
    psrad m5, m6, 13
    psrad m6, m9, 13
    psrad m7, m8, 13
    jmp tx2q
.pass2:
    call m(iadst_4x16_internal_10bpc).pass2_main
    vpbroadcastd m5, [pw_2048]
    vpbroadcastd m8, [pixel_10bpc_max]
    lea r6, [strideq*3]
    vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
    pshufd m2, m2, q1032      ; -out11 out8 out10 -out9
    vpblendd m3, m0, 0xcc     ; -out12 out15 out13 -out14
    pxor m7, m7
    psubw m9, m7, m5
    vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
    pmulhrsw m0, m4, m9
    call .write_4x4
    pmulhrsw m0, m2, m9
    call .write_4x4
    pmulhrsw m0, m1, m9
    call .write_4x4
    pmulhrsw m0, m3, m9
    call .write_4x4
    RET
ALIGN function_align
.write_4x4:
    movq xm4, [dstq+strideq*0]
    movhps xm4, [dstq+r6 ]
    vpbroadcastq m5, [dstq+strideq*1]
    vpbroadcastq m6, [dstq+strideq*2]
    mova [cq+32*0], m7
    mova [cq+32*1], m7
    add cq, 32*2
    vpblendd m4, m5, 0x30
    vpblendd m4, m6, 0xc0
    paddw m4, m0
    pmaxsw m4, m7
    pminsw m4, m8
    vextracti128 xm5, m4, 1
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*1], xm5
    movhps [dstq+strideq*2], xm5
    movhps [dstq+r6 ], xm4
    lea dstq, [dstq+strideq*4]
    ret

INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
    vpbroadcastd m7, [pd_5793]
    pmulld m0, m7, [cq+32*0]
    pmulld m4, m7, [cq+32*1]
    pmulld m1, m7, [cq+32*2]
    pmulld m5, m7, [cq+32*3]
    pmulld m2, m7, [cq+32*4]
    pmulld m6, m7, [cq+32*5]
    pmulld m3, m7, [cq+32*6]
    pmulld m7, [cq+32*7]
    vpbroadcastd m8, [pd_6144]
    REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
    REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
    jmp tx2q
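; .pass2 below applies the identity16 scaling of 2*sqrt(2): paddsw x,x gives
; 2*x, and pmulhrsw with pw_1697x16 adds roughly 0.828*x (~ 2*sqrt(2)-2).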
.pass2:
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    vpbroadcastd m7, [pw_1697x16]
    vpbroadcastd m8, [pw_2048]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
    pmulhrsw m7, m3
    REPX {paddsw x, x}, m0, m1, m2, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    vpbroadcastd m4, [pixel_10bpc_max]
    call .pass2_end
    RET
ALIGN function_align
.pass2_end:
    punpckhwd m7, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    lea r6, [strideq*5]
    pxor m3, m3
    punpckhdq m5, m0, m2 ; 2 3 6 7
    punpckldq m0, m2     ; 0 1 4 5
    punpckldq m6, m7, m1 ; 8 9 c d
    punpckhdq m7, m1     ; a b e f
    pmulhrsw m0, m8
    call .write_2x4x2
    pmulhrsw m0, m5, m8
    call .write_2x4x2
    pmulhrsw m0, m6, m8
    lea dstq, [dstq+strideq*4]
    call .write_2x4x2
    pmulhrsw m0, m7, m8
    call .write_2x4x2
    ret
ALIGN function_align
.write_2x4x2:
    movq xm1, [dstq+strideq*0]
    movhps xm1, [dstq+strideq*1]
    vpbroadcastq m2, [dstq+strideq*4]
    vpblendd m1, m2, 0x30
    vpbroadcastq m2, [dstq+r6 ]
    vpblendd m1, m2, 0xc0
    mova [cq+32*0], m3
    mova [cq+32*1], m3
    add cq, 32*2
    paddw m1, m0
    pmaxsw m1, m3
    pminsw m1, m4
    vextracti128 xm2, m1, 1
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    movq [dstq+strideq*4], xm2
    movhps [dstq+r6 ], xm2
    lea dstq, [dstq+strideq*2]
    ret

INV_TXFM_4X16_FN dct, dct, 12
INV_TXFM_4X16_FN dct, identity, 12
INV_TXFM_4X16_FN dct, adst, 12
INV_TXFM_4X16_FN dct, flipadst, 12

cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
    jmp m(idct_4x16_internal_10bpc).pass1
.pass2:
    punpckldq m8, m0, m1
    punpckhdq m0, m1
    punpckldq m9, m2, m3
    punpckhdq m2, m3
    punpckldq m1, m4, m5
    punpckhdq m4, m5
    punpckldq m3, m6, m7
    punpckhdq m6, m7
    punpcklqdq m5, m0, m2  ; 2 6
    punpckhqdq m12, m0, m2 ; 3 7
    punpcklqdq m0, m8, m9  ; 0 4
    punpckhqdq m10, m8, m9 ; 1 5
    punpcklqdq m2, m1, m3  ; 8 12
    punpckhqdq m13, m1, m3 ; 9 13
    punpcklqdq m9, m4, m6  ; 10 14
    punpckhqdq m4, m6      ; 11 15
    vperm2i128 m1, m5, m9, 0x20 ; 2 10
    vperm2i128 m3, m9, m5, 0x31 ; 14 6
    vpermq m11, m4, q1302       ; 15 11
    ; interleave
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
    vpbroadcastd m8, [clip_18b_min]
    vpbroadcastd m9, [clip_18b_max]
    REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
    REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
    call m(idct_16x4_internal_10bpc).pass1_main
    vpermq m6, m12, q1302 ; 7 3
    vpermq m5, m13, q3120 ; 9 13
    call m(idct_16x4_internal_10bpc).pass1_main2
    call m(idct_16x4_internal_10bpc).pass1_main3
    REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m1, m2, m3
    packssdw m2, m4, m5
    packssdw m3, m6, m7
    mova m4, [idct16_12_shuf]
    REPX {vpermd x, m4, x}, m0, m1, m2, m3
    vpbroadcastd m9, [pw_16384]
    vpbroadcastd m8, [pixel_12bpc_max]
    call m(idct_4x16_internal_10bpc).pass2_end
    RET

INV_TXFM_4X16_FN adst, dct, 12
INV_TXFM_4X16_FN adst, adst, 12
INV_TXFM_4X16_FN adst, flipadst, 12
INV_TXFM_4X16_FN adst, identity, 12

cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
    call .main_pass1
    psrad m0, m4, 12
    psrad m1, m5, 12
    psrad m2, 12
    psrad m3, 12
    psrad m4, m8, 12
    psrad m5, m9, 12
    psrad m6, 12
    psrad m7, 12
    jmp tx2q
.pass2:
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call .transpose_16x4
    call m(iadst_4x16_internal_10bpc).main2
    pshufd m4, m5, q1032
    psrad m5, m6, 3
    pshufd m6, m7, q1032
    psrad m7, m8, 3
    REPX {pshufd x, x, q1032}, m0, m2
    REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6
.pass2_end:
    packssdw m0, m1
    packssdw m1, m2, m3
    packssdw m2, m4, m5
    packssdw m3, m6, m7
    mova m4, [iadst16_12_shuf]
    REPX {vpermd x, m4, x}, m0, m1, m2, m3
    vpbroadcastd m9, [pw_16384]
    vpbroadcastd m8, [pixel_12bpc_max]
    lea r6, [strideq*3]
    pxor m7, m7
    pmulhrsw m0, m9
    call m(iadst_4x16_internal_10bpc).write_4x4
    pmulhrsw m0, m9, m1
    call m(iadst_4x16_internal_10bpc).write_4x4
    pmulhrsw m0, m9, m2
    call m(iadst_4x16_internal_10bpc).write_4x4
    pmulhrsw m0, m9, m3
    call m(iadst_4x16_internal_10bpc).write_4x4
    RET
ALIGN function_align
.transpose_16x4:
    ; transpose & interleave
    punpckldq m8, m0, m1
    punpckhdq m0, m1
    punpckldq m9, m2, m3
    punpckhdq m2, m3
    punpckldq m1, m4, m5
    punpckhdq m4, m5
    punpckldq m3, m6, m7
    punpckhdq m6, m7
    punpcklqdq m10, m8, m0
    punpckhqdq m0, m8
    punpcklqdq m11, m9, m2
    punpckhqdq m2, m9
    punpcklqdq m8, m1, m4
    punpckhqdq m4, m1
    punpcklqdq m9, m3, m6
    punpckhqdq m6, m3
    vperm2i128 m5, m0, m2, 0x31   ; 7 5
    vperm2i128 m7, m0, m2, 0x20   ; 3 1
    vperm2i128 m0, m10, m11, 0x20 ; 0 2
    vperm2i128 m2, m10, m11, 0x31 ; 4 6
    vperm2i128 m1, m4, m6, 0x31   ; 15 13
    vperm2i128 m3, m4, m6, 0x20   ; 11 9
    vperm2i128 m4, m8, m9, 0x20   ; 8 10
    vperm2i128 m6, m8, m9, 0x31   ; 12 14
    ret
ALIGN function_align
.main_pass1:
    call m(iadst_16x4_internal_10bpc).main
    vpbroadcastd m6, [pd_3072]
    paddd m10, m4, m5
    psubd m4, m3
    psubd m5, m3
    paddd m3, m10
    psubd m8, m7, m1
    paddd m7, m9
    psubd m9, m1
    paddd m7, m1
    REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
    REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
    paddd m6, m0
    ret

INV_TXFM_4X16_FN flipadst, dct, 12
INV_TXFM_4X16_FN flipadst, adst, 12
INV_TXFM_4X16_FN flipadst, flipadst, 12
INV_TXFM_4X16_FN flipadst, identity, 12

cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
    call m(iadst_4x16_internal_12bpc).main_pass1
    psrad m0, m3, 12
    psrad m1, m2, 12
    psrad m2, m5, 12
    psrad m3, m4, 12
    psrad m4, m7, 12
    psrad m5, m6, 12
    psrad m6, m9, 12
    psrad m7, m8, 12
    jmp tx2q
.pass2:
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(iadst_4x16_internal_12bpc).transpose_16x4
    call m(iadst_4x16_internal_10bpc).main2
    pshufd m4, m3, q1032
    psrad m3, m5, 3
    psrad m5, m2, 3
    pshufd m2, m6, q1032
    pshufd m6, m1, q1032
    psrad m1, m7, 3
    psrad m7, m0, 3
    pshufd m0, m8, q1032
    REPX {psrad x, 3}, m0, m2, m4, m6
    jmp m(iadst_4x16_internal_12bpc).pass2_end

INV_TXFM_4X16_FN identity, dct, 12
INV_TXFM_4X16_FN identity, adst, 12
INV_TXFM_4X16_FN identity, flipadst, 12
INV_TXFM_4X16_FN identity, identity, 12

cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
    vpbroadcastd m8, [pd_1697]
    mova m0, [cq+32*0]
    mova m4, [cq+32*1]
    mova m1, [cq+32*2]
    mova m5, [cq+32*3]
    vpbroadcastd m9, [pd_6144]
    pmulld m2, m8, m0
    pmulld m6, m8, m4
    pmulld m3, m8, m1
    pmulld m7, m8, m5
    mova m10, [cq+32*4]
    mova m11, [cq+32*5]
    mova m12, [cq+32*6]
    mova m13, [cq+32*7]
    REPX {paddd x, m9}, m2, m6, m3, m7
    REPX {psrad x, 12}, m2, m6, m3, m7
    paddd m0, m2
    pmulld m2, m8, m10
    paddd m4, m6
    pmulld m6, m8, m11
    paddd m1, m3
    pmulld m3, m8, m12
    paddd m5, m7
    pmulld m7, m8, m13
    REPX {psrad x, 1 }, m0, m4, m1, m5
    REPX {paddd x, m9}, m2, m6, m3, m7
    REPX {psrad x, 12}, m2, m6, m3, m7
    paddd m2, m10
    paddd m6, m11
    paddd m3, m12
    paddd m7, m13
    REPX {psrad x, 1 }, m2, m6, m3, m7
    jmp tx2q
.pass2:
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    vpbroadcastd m8, [pd_5793]
    vpbroadcastd m9, [pd_1024]
    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    vpbroadcastd m8, [pw_16384]
    vpbroadcastd m4, [pixel_12bpc_max]
    call m(iidentity_4x16_internal_10bpc).pass2_end
    RET

%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
    INV_TXFM_FN %1, %2, 0, 8x4, %3
%ifidn %1_%2, dct_dct
    vpbroadcastd m2, [dconly_%3bpc]
%if %3 = 10
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 4
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    add r6d, 128
    sar r6d, 8
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
%else
    jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
%endif
%endif
%endmacro

INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, identity
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst

cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
    vpbroadcastd m8, [clip_18b_min]
    vpbroadcastd m9, [clip_18b_max]
.pass1:
    vbroadcasti128 m1, [cq+16*1]
    vbroadcasti128 m0, [cq+16*5]
    vbroadcasti128 m2, [cq+16*3]
    vbroadcasti128 m3, [cq+16*7]
    vpbroadcastd m6, [pd_2896]
    shufpd m1, m0, 0x0c ; 1 5
    shufpd m3, m2, 0x0c ; 7 3
    vbroadcasti128 m0, [cq+16*0]
    vbroadcasti128 m4, [cq+16*2]
    vbroadcasti128 m2, [cq+16*4]
    vbroadcasti128 m5, [cq+16*6]
    vpbroadcastd m7, [pd_2048]
    shufpd m0, m4, 0x0c ; 0 2
    shufpd m2, m5, 0x0c ; 4 6
    REPX {pmulld x, m6}, m1, m3, m0, m2
    REPX {paddd x, m7}, m1, m3, m0, m2
    REPX {psrad x, 12}, m1, m3, m0, m2
    call .main
    psubd m3, m0, m4 ; out7 out6 (interleaved)
    paddd m0, m4     ; out0 out1 (interleaved)
    paddd m1, m2, m5 ; out3 out2 (interleaved)
    psubd m2, m5     ; out4 out5 (interleaved)
    pshufd m1, m1, q1032
    pshufd m3, m3, q1032
    jmp tx2q
.pass2:
    vbroadcasti128 m4, [deint_shuf]
    packssdw m0, m1
    packssdw m2, m3
    vperm2i128 m1, m0, m2, 0x31
    vinserti128 m0, xm2, 1
    pshufb m0, m4
    pshufb m1, m4
    IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
    vpermq m0, m0, q3120 ; out0 out1
    vpermq m2, m1, q2031 ; out2 out3
    jmp m(iadst_8x4_internal_10bpc).end
ALIGN function_align
.main:
    ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
    IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
    vpbroadcastd m6, [pd_2896]
    punpcklqdq m4, m1, m3 ; t4a t7a
    punpckhqdq m1, m3     ; t5a t6a
    psubd m3, m4, m1 ; t5a t6a
    paddd m4, m1     ; t4 t7
    REPX {pmaxsd x, m8}, m3, m4, m0, m2
    REPX {pminsd x, m9}, m3, m4, m0, m2
    pmulld m3, m6
    pshufd m1, m3, q1032
    paddd m3, m7
    psubd m5, m3, m1
    paddd m1, m3
    psrad m5, 12
    psrad m1, 12
    vpblendd m5, m4, 0x33 ; t4 t5
    punpckhqdq m4, m1     ; t7 t6
    ret

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
    call m(iadst_4x8_internal_10bpc).main
    vpblendd m3, m0, m4, 0x33 ; out6 out7
    vpblendd m0, m4, 0xcc     ; out0 out1
    pshufd m1, m5, q1032
    psignd m2, m6 ; out4 out5
    psignd m1, m6 ; out2 out3
    jmp tx2q
.pass2:
    call .pass2_main
    vpermq m0, m0, q3120 ; out0 out1
    vpermq m2, m1, q3120 ; out2 out3
.end:
    vpbroadcastd m1, [pw_2048]
    pmulhrsw m0, m1
    pmulhrsw m1, m2
    vpbroadcastd m5, [pixel_10bpc_max]
.end2:
    mova xm2, [dstq+strideq*0]
    vinserti128 m2, [dstq+strideq*1], 1
    lea r6, [dstq+strideq*2]
    mova xm3, [r6 +strideq*0]
    vinserti128 m3, [r6 +strideq*1], 1
    pxor m4, m4
    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    paddw m0, m2
    paddw m1, m3
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m5
    pminsw m1, m5
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova [r6 +strideq*0], xm1
    vextracti128 [r6 +strideq*1], m1, 1
    RET
ALIGN function_align
.pass2_main:
    vbroadcasti128 m4, [deint_shuf]
    packssdw m0, m1
    packssdw m2, m3
    lea r6, [deint_shuf+128]
    vperm2i128 m1, m0, m2, 0x31
    vinserti128 m0, xm2, 1
    pshufb m0, m4
    pshufb m1, m4
    jmp m(iadst_8x4_internal_8bpc).main
ALIGN function_align
.main:
    vpbroadcastd m1, [pd_2896]
    pmulld m0, m1, [cq+32*0]
    pmulld m3, m1, [cq+32*3]
    pmulld m2, m1, [cq+32*2]
    pmulld m1, [cq+32*1]
    vpbroadcastd m4, [pd_2048]
    REPX {paddd x, m4}, m0, m3, m2, m1
    REPX {psrad x, 12}, m0, m3, m2, m1
.main2:
    IADST4_1D
    ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
    call m(iadst_4x8_internal_10bpc).main
    shufpd m3, m4, m0, 0x05
    shufpd m0, m4, 0x05
    psignd m2, m6
    pshufd m6, m6, q1032
    pshufd m1, m2, q1032
    psignd m2, m5, m6
    jmp tx2q
.pass2:
    call m(iadst_8x4_internal_10bpc).pass2_main
    vpermq m2, m0, q2031
    vpermq m0, m1, q2031
    jmp m(iadst_8x4_internal_10bpc).end

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
.pass1:
    vpbroadcastd m4, [pd_2896]
    vpermq m0, [cq+32*0], q3120
    vpermq m1, [cq+32*1], q3120
    vpermq m2, [cq+32*2], q3120
    vpermq m3, [cq+32*3], q3120
    vpbroadcastd m7, [pd_2048]
    REPX {pmulld x, m4}, m0, m1, m2, m3
    REPX {paddd x, m7}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    REPX {paddd x, x }, m0, m1, m2, m3
    jmp tx2q
.pass2:
    vpbroadcastd m5, [pixel_10bpc_max]
[pw_1697x8] 2093 packssdw m0, m1 2094 packssdw m2, m3 2095 pmulhrsw m1, m4, m0 2096 pmulhrsw m4, m2 2097 paddsw m0, m1 2098 paddsw m2, m4 2099 packssdw m7, m7 ; pw_2048 2100.pass2_end: 2101 punpckhwd m1, m0, m2 2102 punpcklwd m0, m2 2103 lea r6, [dstq+strideq*2] 2104 punpckhwd m2, m0, m1 2105 punpcklwd m0, m1 2106 pmulhrsw m2, m7 2107 pmulhrsw m0, m7 2108 punpckhwd m1, m0, m2 2109 punpcklwd m0, m2 2110 mova xm2, [dstq+strideq*0] 2111 vinserti128 m2, [r6 +strideq*0], 1 2112 mova xm3, [dstq+strideq*1] 2113 vinserti128 m3, [r6 +strideq*1], 1 2114 pxor m4, m4 2115 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 2116 paddw m0, m2 2117 paddw m1, m3 2118 pmaxsw m0, m4 2119 pmaxsw m1, m4 2120 pminsw m0, m5 2121 pminsw m1, m5 2122 mova [dstq+strideq*0], xm0 2123 mova [dstq+strideq*1], xm1 2124 vextracti128 [r6 +strideq*0], m0, 1 2125 vextracti128 [r6 +strideq*1], m1, 1 2126 RET 2127 2128INV_TXFM_8X4_FN dct, dct, 12 2129INV_TXFM_8X4_FN dct, identity, 12 2130INV_TXFM_8X4_FN dct, adst, 12 2131INV_TXFM_8X4_FN dct, flipadst, 12 2132 2133cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 2134 vpbroadcastd m8, [clip_20b_min] 2135 vpbroadcastd m9, [clip_20b_max] 2136 jmp m(idct_8x4_internal_10bpc).pass1 2137.pass2: 2138 vpbroadcastd m8, [clip_18b_min] 2139 vpbroadcastd m9, [clip_18b_max] 2140 REPX {pmaxsd x, m8}, m0, m1, m2, m3 2141 REPX {pminsd x, m9}, m0, m1, m2, m3 2142 call m(iadst_8x4_internal_12bpc).transpose_4x8 2143 IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 2144 jmp m(iadst_8x4_internal_12bpc).end 2145 2146INV_TXFM_8X4_FN adst, dct, 12 2147INV_TXFM_8X4_FN adst, adst, 12 2148INV_TXFM_8X4_FN adst, flipadst, 12 2149INV_TXFM_8X4_FN adst, identity, 12 2150 2151cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 2152 vpbroadcastd m8, [clip_20b_min] 2153 vpbroadcastd m9, [clip_20b_max] 2154 call m(iadst_4x8_internal_10bpc).main2 2155 vpblendd m3, m0, m4, 0x33 ; out6 out7 2156 vpblendd m0, m4, 0xcc ; out0 out1 2157 pshufd m1, m5, q1032 2158 psignd m2, m6 ; out4 out5 2159 psignd m1, m6 ; out2 out3 2160 jmp tx2q 2161.pass2: 2162 vpbroadcastd m8, [clip_18b_min] 2163 vpbroadcastd m9, [clip_18b_max] 2164 REPX {pmaxsd x, m8}, m0, m1, m2, m3 2165 REPX {pminsd x, m9}, m0, m1, m2, m3 2166 call .pass2_main 2167 vpbroadcastd m5, [pd_2048] 2168 paddd m0, m5, m4 2169 paddd m1, m5, m6 2170 paddd m2, m5 2171 paddd m3, m5 2172.pass2_end: 2173 REPX {psrad x, 12}, m0, m1, m2, m3 2174.end: 2175 vpbroadcastd m4, [pw_16384] 2176 REPX {psrad x, 3}, m0, m1, m2, m3 2177 packssdw m0, m1 2178 packssdw m2, m3 2179 pmulhrsw m0, m4 2180 pmulhrsw m1, m2, m4 2181 vpermq m0, m0, q3120 ; out0 out1 2182 vpermq m1, m1, q3120 ; out2 out3 2183 vpbroadcastd m5, [pixel_12bpc_max] 2184 jmp m(iadst_8x4_internal_10bpc).end2 2185ALIGN function_align 2186.pass2_main: 2187 call .transpose_4x8 2188 jmp m(iadst_8x4_internal_10bpc).main2 2189ALIGN function_align 2190.transpose_4x8: 2191 ; deinterleave 2192 pshufd m0, m0, q3120 2193 pshufd m1, m1, q3120 2194 pshufd m2, m2, q3120 2195 pshufd m3, m3, q3120 2196 ; transpose 2197 punpcklqdq m4, m0, m1 2198 punpckhqdq m0, m1 2199 punpcklqdq m5, m2, m3 2200 punpckhqdq m2, m3 2201 vperm2i128 m1, m0, m2, 0x20 ; out1 2202 vperm2i128 m3, m0, m2, 0x31 ; out3 2203 vperm2i128 m2, m4, m5, 0x31 ; out2 2204 vperm2i128 m0, m4, m5, 0x20 ; out0 2205 ret 2206 2207INV_TXFM_8X4_FN flipadst, dct, 12 2208INV_TXFM_8X4_FN flipadst, adst, 12 2209INV_TXFM_8X4_FN flipadst, flipadst, 12 2210INV_TXFM_8X4_FN flipadst, identity, 12 2211 2212cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 2213 
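    ; As with the other *_12bpc entry points in this file, the only first-pass
    ; difference from the 10bpc version is the wider 20-bit intermediate clip
    ; range. flipadst is the adst with the output order reversed, i.e. (sketch)
    ;   out[i] = adst_out[n-1-i]
    ; which is why .pass2 below reuses the iadst_8x4 helpers and only
    ; swaps/reverses the rows when accumulating the result.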
vpbroadcastd m8, [clip_20b_min] 2214 vpbroadcastd m9, [clip_20b_max] 2215 call m(iadst_4x8_internal_10bpc).main2 2216 shufpd m3, m4, m0, 0x05 2217 shufpd m0, m4, 0x05 2218 psignd m2, m6 2219 pshufd m6, m6, q1032 2220 pshufd m1, m2, q1032 2221 psignd m2, m5, m6 2222 jmp tx2q 2223.pass2: 2224 vpbroadcastd m8, [clip_18b_min] 2225 vpbroadcastd m9, [clip_18b_max] 2226 REPX {pmaxsd x, m8}, m0, m1, m2, m3 2227 REPX {pminsd x, m9}, m0, m1, m2, m3 2228 call m(iadst_8x4_internal_12bpc).pass2_main 2229 vpbroadcastd m5, [pd_2048] 2230 paddd m0, m5, m3 2231 paddd m1, m5, m2 2232 paddd m3, m5, m4 2233 paddd m2, m5, m6 2234 jmp m(iadst_8x4_internal_12bpc).pass2_end 2235 2236INV_TXFM_8X4_FN identity, dct, 12 2237INV_TXFM_8X4_FN identity, adst, 12 2238INV_TXFM_8X4_FN identity, flipadst, 12 2239INV_TXFM_8X4_FN identity, identity, 12 2240 2241cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 2242 jmp m(iidentity_8x4_internal_10bpc).pass1 2243.pass2: 2244 ; m0 = in0 in1 (interleaved) 2245 ; m1 = in2 in3 (interleaved) 2246 ; m2 = in4 in5 (interleaved) 2247 ; m3 = in6 in7 (interleaved) 2248 vpbroadcastd m8, [clip_18b_min] 2249 vpbroadcastd m9, [clip_18b_max] 2250 REPX {pmaxsd x, m8}, m0, m1, m2, m3 2251 REPX {pminsd x, m9}, m0, m1, m2, m3 2252 vpbroadcastd m4, [pd_5793] 2253 REPX {pmulld x, m4}, m0, m1, m2, m3 2254 REPX {paddd x, m7}, m0, m1, m2, m3 2255 REPX {psrad x, 15}, m0, m1, m2, m3 2256 vpbroadcastd m5, [pixel_12bpc_max] 2257 vpbroadcastd m7, [pw_16384] 2258 packssdw m0, m1 2259 packssdw m2, m3 2260 jmp m(iidentity_8x4_internal_10bpc).pass2_end 2261 2262%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth 2263 INV_TXFM_FN %1, %2, 0, 8x8, %3 2264%ifidn %1_%2, dct_dct 2265 vpbroadcastd m2, [dconly_%3bpc] 2266%if %3 = 10 2267.dconly: 2268 imul r6d, [cq], 181 2269 mov [cq], eobd ; 0 2270 or r3d, 8 2271.dconly2: 2272 add r6d, 384 2273 sar r6d, 9 2274.dconly3: 2275 imul r6d, 181 2276 add r6d, 2176 2277 sar r6d, 12 2278 movd xm0, r6d 2279 paddsw xm0, xm2 2280 vpbroadcastw m0, xm0 2281.dconly_loop: 2282 mova xm1, [dstq+strideq*0] 2283 vinserti128 m1, [dstq+strideq*1], 1 2284 paddsw m1, m0 2285 psubusw m1, m2 2286 mova [dstq+strideq*0], xm1 2287 vextracti128 [dstq+strideq*1], m1, 1 2288 lea dstq, [dstq+strideq*2] 2289 sub r3d, 2 2290 jg .dconly_loop 2291 RET 2292%else 2293 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly 2294%endif 2295%endif 2296%endmacro 2297 2298%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2] 2299 ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a 2300 ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a 2301 ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a 2302 ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a 2303 psubd m%9, m%3, m%7 ; t6 2304 paddd m%3, m%7 ; t2 2305 psubd m%7, m%1, m%5 ; t4 2306 paddd m%1, m%5 ; t0 2307 psubd m%5, m%6, m%2 ; t7 2308 paddd m%6, m%2 ; t3 2309 psubd m%2, m%8, m%4 ; t5 2310 paddd m%8, m%4 ; t1 2311 REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 2312 REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 2313 ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a 2314 ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a 2315 psubd m%10, m%7, m%9 ; t7 2316 paddd m%7, m%9 ; out6 2317 vpbroadcastd m%9, [pd_1448] 2318 psubd m%4, m%8, m%6 ; t3 2319 paddd m%8, m%6 ; -out7 2320 psubd m%6, m%1, m%3 ; t2 2321 paddd m%1, m%3 ; out0 2322 psubd m%3, m%2, m%5 ; t6 2323 paddd m%2, m%5 ; -out1 2324 REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 2325 REPX 
{pminsd x, m%14}, m%6, m%4, m%3, m%10 2326 REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 2327 psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 2328 paddd m%4, m%6 ; (t2 + t3) * 1448 2329 psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 2330 paddd m%3, m%10 ; (t6 + t7) * 1448 2331%endmacro 2332 2333INV_TXFM_8X8_FN dct, dct 2334INV_TXFM_8X8_FN dct, identity 2335INV_TXFM_8X8_FN dct, adst 2336INV_TXFM_8X8_FN dct, flipadst 2337 2338cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 2339 vpbroadcastd m12, [clip_18b_min] 2340 vpbroadcastd m13, [clip_18b_max] 2341.pass1: 2342 mova m0, [cq+32*0] 2343 mova m1, [cq+32*1] 2344 mova m2, [cq+32*2] 2345 mova m3, [cq+32*3] 2346 mova m4, [cq+32*4] 2347 mova m5, [cq+32*5] 2348 mova m6, [cq+32*6] 2349 mova m7, [cq+32*7] 2350 vpbroadcastd m11, [pd_2048] 2351 call .main 2352 call .round_shift1 2353 jmp tx2q 2354.pass2: 2355 call .transpose_8x8_packed 2356 call m(idct_8x8_internal_8bpc).main 2357 vpbroadcastd m12, [pw_2048] 2358 vpermq m0, m0, q3120 2359 vpermq m1, m1, q2031 2360 vpermq m2, m2, q3120 2361 vpermq m3, m3, q2031 2362 pmulhrsw m0, m12 2363 pmulhrsw m1, m12 2364 call .write_8x4_start 2365 pmulhrsw m0, m2, m12 2366 pmulhrsw m1, m3, m12 2367 call .write_8x4 2368 RET 2369ALIGN function_align 2370.write_8x4_start: 2371 vpbroadcastd m11, [pixel_10bpc_max] 2372 lea r6, [strideq*3] 2373 pxor m10, m10 2374.write_8x4: 2375 mova xm8, [dstq+strideq*0] 2376 vinserti128 m8, [dstq+strideq*1], 1 2377 mova xm9, [dstq+strideq*2] 2378 vinserti128 m9, [dstq+r6 ], 1 2379 mova [cq+32*0], m10 2380 mova [cq+32*1], m10 2381 mova [cq+32*2], m10 2382 mova [cq+32*3], m10 2383 add cq, 32*4 2384 paddw m0, m8 2385 paddw m1, m9 2386 pmaxsw m0, m10 2387 pmaxsw m1, m10 2388 pminsw m0, m11 2389 pminsw m1, m11 2390 mova [dstq+strideq*0], xm0 2391 vextracti128 [dstq+strideq*1], m0, 1 2392 mova [dstq+strideq*2], xm1 2393 vextracti128 [dstq+r6 ], m1, 1 2394 lea dstq, [dstq+strideq*4] 2395 ret 2396ALIGN function_align 2397.transpose_8x8_packed: 2398 packssdw m0, m4 2399 packssdw m1, m5 2400 packssdw m2, m6 2401 packssdw m3, m7 2402 lea r6, [deint_shuf+128] 2403 punpckhwd m4, m0, m1 2404 punpcklwd m0, m1 2405 punpckhwd m1, m2, m3 2406 punpcklwd m2, m3 2407 punpckhdq m3, m0, m2 2408 punpckldq m0, m2 2409 punpckhdq m2, m4, m1 2410 punpckldq m4, m1 2411 vinserti128 m1, m3, xm2, 1 2412 vperm2i128 m3, m2, 0x31 2413 vperm2i128 m2, m0, m4, 0x31 2414 vinserti128 m0, xm4, 1 2415 ret 2416ALIGN function_align 2417.main_rect2: 2418 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 2419 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 2420.main: 2421 ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a 2422 ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a 2423 ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 2424 paddd m8, m1, m5 ; t4 2425 psubd m1, m5 ; t5a 2426 paddd m9, m7, m3 ; t7 2427 psubd m7, m3 ; t6a 2428 vpbroadcastd m3, [pd_2896] 2429 REPX {pmaxsd x, m12}, m1, m8, m7, m9 2430 REPX {pminsd x, m13}, m1, m8, m7, m9 2431 REPX {pmulld x, m3 }, m0, m4, m7, m1 2432 paddd m0, m11 2433 paddd m7, m11 2434 psubd m5, m0, m4 2435 paddd m0, m4 2436 psubd m4, m7, m1 2437 paddd m7, m1 2438 REPX {psrad x, 12 }, m5, m0, m4, m7 2439 psubd m3, m0, m6 ; dct4 out3 2440 paddd m0, m6 ; dct4 out0 2441 paddd m6, m5, m2 ; dct4 out1 2442 psubd m5, m2 ; dct4 out2 2443 REPX {pmaxsd x, m12}, m0, m6, m5, m3 2444 REPX {pminsd x, m13}, m0, m6, m5, m3 2445 ret 2446ALIGN function_align 2447.round_shift1: 2448 pcmpeqd m1, m1 2449 REPX {psubd x, m1}, m0, m6, m5, m3 2450 paddd m1, m6, m7 ; out1 2451 psubd m6, m7 ; out6 
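    ; Rounding trick used by .round_shift1: pcmpeqd above set m1 to -1, so
    ; "REPX {psubd x, m1}" added the +1 bias to m0/m6/m5/m3. Combined with
    ; the final "psrad 1", every output is effectively
    ;   out = (a +/- b + 1) >> 1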
2452 psubd m7, m0, m9 ; out7 2453 paddd m0, m9 ; out0 2454 paddd m2, m5, m4 ; out2 2455 psubd m5, m4 ; out5 2456 psubd m4, m3, m8 ; out4 2457 paddd m3, m8 ; out3 2458 REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 2459 ret 2460 2461INV_TXFM_8X8_FN adst, dct 2462INV_TXFM_8X8_FN adst, adst 2463INV_TXFM_8X8_FN adst, flipadst 2464INV_TXFM_8X8_FN adst, identity 2465 2466cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 2467 vpbroadcastd m12, [clip_18b_min] 2468 vpbroadcastd m13, [clip_18b_max] 2469.pass1: 2470 call .main 2471 call .main_end 2472 jmp tx2q 2473.pass2: 2474 call m(idct_8x8_internal_10bpc).transpose_8x8_packed 2475 pshufd m4, m0, q1032 2476 pshufd m5, m1, q1032 2477 call m(iadst_8x8_internal_8bpc).main_pass2 2478 vpbroadcastd m5, [pw_2048] 2479 vpbroadcastd xm12, [pw_4096] 2480 psubw m12, m5 2481 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 2482 pmulhrsw m0, m12 2483 pmulhrsw m1, m12 2484 call m(idct_8x8_internal_10bpc).write_8x4_start 2485 pmulhrsw m0, m2, m12 2486 pmulhrsw m1, m3, m12 2487 call m(idct_8x8_internal_10bpc).write_8x4 2488 RET 2489ALIGN function_align 2490.main: 2491 mova m0, [cq+32*0] 2492 mova m7, [cq+32*7] 2493 mova m1, [cq+32*1] 2494 mova m6, [cq+32*6] 2495 mova m2, [cq+32*2] 2496 mova m5, [cq+32*5] 2497 mova m3, [cq+32*3] 2498 mova m4, [cq+32*4] 2499 vpbroadcastd m11, [pd_2048] 2500.main2: 2501 IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 2502 psrld m8, 10 ; pd_1 2503 vpbroadcastd m9, [pd_3072] 2504 ret 2505ALIGN function_align 2506.main_end: 2507 paddd m0, m8 2508 psubd m1, m8, m1 2509 paddd m6, m8 2510 psubd m7, m8, m7 2511 REPX {psrad x, 1 }, m0, m1, m6, m7 2512 ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 2513 ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 2514 psubd m8, m9, m8 ; pd_3071 2515 paddd m2, m9 2516 psubd m3, m8, m3 2517 paddd m4, m9 2518 psubd m5, m8, m5 2519 REPX {psrad x, 12}, m2, m3, m4, m5 2520 ret 2521 2522INV_TXFM_8X8_FN flipadst, dct 2523INV_TXFM_8X8_FN flipadst, adst 2524INV_TXFM_8X8_FN flipadst, flipadst 2525INV_TXFM_8X8_FN flipadst, identity 2526 2527cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 2528 vpbroadcastd m12, [clip_18b_min] 2529 vpbroadcastd m13, [clip_18b_max] 2530.pass1: 2531 call m(iadst_8x8_internal_10bpc).main 2532 call .main_end 2533 jmp tx2q 2534.pass2: 2535 call m(idct_8x8_internal_10bpc).transpose_8x8_packed 2536 pshufd m4, m0, q1032 2537 pshufd m5, m1, q1032 2538 call m(iadst_8x8_internal_8bpc).main_pass2 2539 vpbroadcastd m12, [pw_2048] 2540 vpbroadcastd xm5, [pw_4096] 2541 psubw m12, m5 2542 vpermq m8, m3, q2031 2543 vpermq m9, m2, q2031 2544 vpermq m2, m1, q2031 2545 vpermq m3, m0, q2031 2546 pmulhrsw m0, m8, m12 2547 pmulhrsw m1, m9, m12 2548 call m(idct_8x8_internal_10bpc).write_8x4_start 2549 pmulhrsw m0, m2, m12 2550 pmulhrsw m1, m3, m12 2551 call m(idct_8x8_internal_10bpc).write_8x4 2552 RET 2553ALIGN function_align 2554.main_end: 2555 paddd m10, m8, m0 2556 psubd m0, m8, m7 2557 psubd m7, m8, m1 2558 paddd m1, m8, m6 2559 psrad m0, 1 2560 psrad m1, 1 2561 psrad m6, m7, 1 2562 psrad m7, m10, 1 2563 psubd m8, m9, m8 ; pd_6143 2564 psubd m10, m8, m5 2565 paddd m5, m9, m2 2566 psubd m2, m8, m3 2567 paddd m3, m9, m4 2568 psrad m4, m2, 12 2569 psrad m2, m10, 12 2570 psrad m3, 12 2571 psrad m5, 12 2572 ret 2573 2574INV_TXFM_8X8_FN identity, dct 2575INV_TXFM_8X8_FN identity, adst 2576INV_TXFM_8X8_FN identity, flipadst 2577INV_TXFM_8X8_FN identity, identity 2578 2579cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 2580.pass1: 
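    ; Pass 1 of the 8x8 identity is a plain load. The identity8 scaling
    ; (x2 in each pass) and the row/column down-shifts appear to be folded
    ; into the single pmulhrsw by pw_4096 in .pass2_main:
    ;   pmulhrsw(x, 4096) = (x*4096 + 0x4000) >> 15 ~= x/8
    ; which matches x*2 >> 1 (rows) followed by x*2 >> 4 (columns).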
2581 mova m0, [cq+32*0] 2582 mova m1, [cq+32*1] 2583 mova m2, [cq+32*2] 2584 mova m3, [cq+32*3] 2585 mova m4, [cq+32*4] 2586 mova m5, [cq+32*5] 2587 mova m6, [cq+32*6] 2588 mova m7, [cq+32*7] 2589 jmp tx2q 2590.pass2: 2591 packssdw m3, m7 2592 vpbroadcastd m7, [pixel_10bpc_max] 2593.pass2_main: 2594 packssdw m0, m4 2595 packssdw m1, m5 2596 packssdw m2, m6 2597 vpbroadcastd m12, [pw_4096] 2598 punpckhwd m4, m0, m1 2599 punpcklwd m0, m1 2600 punpckhwd m1, m2, m3 2601 punpcklwd m2, m3 2602 punpckhdq m3, m0, m2 2603 punpckldq m0, m2 2604 punpckldq m2, m4, m1 2605 punpckhdq m4, m1 2606 punpckhqdq m1, m0, m2 ; 1 5 2607 punpcklqdq m0, m2 ; 0 4 2608 punpcklqdq m2, m3, m4 ; 2 6 2609 punpckhqdq m3, m4 ; 3 7 2610 pmulhrsw m0, m12 2611 pmulhrsw m1, m12 2612 call .write_2x8x2_start 2613 pmulhrsw m0, m2, m12 2614 pmulhrsw m1, m3, m12 2615 call .write_2x8x2_zero 2616 RET 2617.write_2x8x2_start: 2618 lea r6, [strideq*5] 2619 pxor m6, m6 2620.write_2x8x2_zero: 2621 mova [cq+32*0], m6 2622 mova [cq+32*1], m6 2623 mova [cq+32*2], m6 2624 mova [cq+32*3], m6 2625 add cq, 32*4 2626.write_2x8x2: 2627 mova xm4, [dstq+strideq*0] 2628 vinserti128 m4, [dstq+strideq*4], 1 2629 mova xm5, [dstq+strideq*1] 2630 vinserti128 m5, [dstq+r6 ], 1 2631 paddw m0, m4 2632 paddw m1, m5 2633 pmaxsw m0, m6 2634 pmaxsw m1, m6 2635 pminsw m0, m7 2636 pminsw m1, m7 2637 mova [dstq+strideq*0], xm0 2638 mova [dstq+strideq*1], xm1 2639 vextracti128 [dstq+strideq*4], m0, 1 2640 vextracti128 [dstq+r6 ], m1, 1 2641 lea dstq, [dstq+strideq*2] 2642 ret 2643 2644%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4] 2645 punpckldq m%9, m%1, m%2 ; aibj emfn 2646 punpckhdq m%1, m%2 ; ckdl gohp 2647 punpckldq m%10, m%3, m%4 ; qyrz uCvD 2648 punpckhdq m%3, m%4 ; sAtB wExF 2649 punpckldq m%11, m%5, m%6 ; GOHP KSLT 2650 punpckhdq m%5, m%6 ; IQJR MUNV 2651 punpckldq m%12, m%7, m%8 ; WeXf aibj 2652 punpckhdq m%7, m%8 ; YgZh ckdl 2653 punpcklqdq m%2, m%9, m%10 ; aiqy emuC 2654 punpckhqdq m%9, m%10 ; bjrz fnvD 2655 punpcklqdq m%4, m%1, m%3 ; cksA gowE 2656 punpckhqdq m%10, m%1, m%3 ; dltB hpxF 2657 punpcklqdq m%6, m%11, m%12 ; GOWe KSai 2658 punpckhqdq m%11, m%12 ; HPXf LTbj 2659 punpcklqdq m%8, m%5, m%7 ; IQYg MUck 2660 punpckhqdq m%12, m%5, m%7 ; JRZh NVdl 2661 vperm2i128 m%1, m%2, m%6, 0x20 ; out0 2662 vperm2i128 m%5, m%2, m%6, 0x31 ; out4 2663 vperm2i128 m%2, m%9, m%11, 0x20 ; out1 2664 vperm2i128 m%6, m%9, m%11, 0x31 ; out5 2665 vperm2i128 m%3, m%4, m%8, 0x20 ; out2 2666 vperm2i128 m%7, m%4, m%8, 0x31 ; out6 2667 vperm2i128 m%4, m%10, m%12, 0x20 ; out3 2668 vperm2i128 m%8, m%10, m%12, 0x31 ; out7 2669%endmacro 2670 2671INV_TXFM_8X8_FN dct, dct, 12 2672INV_TXFM_8X8_FN dct, identity, 12 2673INV_TXFM_8X8_FN dct, adst, 12 2674INV_TXFM_8X8_FN dct, flipadst, 12 2675 2676cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 2677 vpbroadcastd m12, [clip_20b_min] 2678 vpbroadcastd m13, [clip_20b_max] 2679 jmp m(idct_8x8_internal_10bpc).pass1 2680.pass2: 2681 vpbroadcastd m12, [clip_18b_min] 2682 vpbroadcastd m13, [clip_18b_max] 2683 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 2684 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 2685 call .transpose_8x8 2686 vpbroadcastd m11, [pd_2048] 2687 call m(idct_8x8_internal_10bpc).main 2688 call .round_shift4 2689 jmp m(iadst_8x8_internal_12bpc).pass2_end 2690ALIGN function_align 2691.write_8x4_start: 2692 vpbroadcastd m11, [pixel_12bpc_max] 2693 lea r6, [strideq*3] 2694 pxor m10, m10 2695 ret 2696ALIGN function_align 2697.transpose_8x8: 2698 TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11 2699 ret 2700ALIGN function_align 2701.round_shift4: 2702 vpbroadcastd m1, [pd_8] 2703 REPX {paddd x, m1}, m0, m6, m5, m3 2704 paddd m1, m6, m7 ; out1 2705 psubd m6, m7 ; out6 2706 psubd m7, m0, m9 ; out7 2707 paddd m0, m9 ; out0 2708 paddd m2, m5, m4 ; out2 2709 psubd m5, m4 ; out5 2710 psubd m4, m3, m8 ; out4 2711 paddd m3, m8 ; out3 2712 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 2713 ret 2714 2715INV_TXFM_8X8_FN adst, dct, 12 2716INV_TXFM_8X8_FN adst, adst, 12 2717INV_TXFM_8X8_FN adst, flipadst, 12 2718INV_TXFM_8X8_FN adst, identity, 12 2719 2720cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 2721 vpbroadcastd m12, [clip_20b_min] 2722 vpbroadcastd m13, [clip_20b_max] 2723 jmp m(iadst_8x8_internal_10bpc).pass1 2724.pass2: 2725 call .pass2_main 2726.pass2_end: 2727 packssdw m0, m1 2728 packssdw m1, m2, m3 2729 REPX {vpermq x, x, q3120}, m0, m1 2730 call m(idct_8x8_internal_12bpc).write_8x4_start 2731 call m(idct_8x8_internal_10bpc).write_8x4 2732 packssdw m0, m4, m5 2733 packssdw m1, m6, m7 2734 REPX {vpermq x, x, q3120}, m0, m1 2735 call m(idct_8x8_internal_10bpc).write_8x4 2736 RET 2737ALIGN function_align 2738.pass2_main: 2739 vpbroadcastd m12, [clip_18b_min] 2740 vpbroadcastd m13, [clip_18b_max] 2741 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 2742 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 2743 call m(idct_8x8_internal_12bpc).transpose_8x8 2744 vpbroadcastd m11, [pd_2048] 2745.pass2_main2: 2746 call m(iadst_8x8_internal_10bpc).main2 2747 pslld m9, m8, 3 ; pd_8 2748 paddd m0, m9 2749 psubd m1, m9, m1 ; 8+x 2750 paddd m6, m9 2751 psubd m7, m9, m7 2752 REPX {psrad x, 4}, m0, m1, m6, m7 2753 vpbroadcastd m9, [pd_17408] 2754 psubd m8, m9, m8 ; 17407 2755 paddd m2, m9 2756 psubd m3, m8, m3 2757 paddd m4, m9 2758 psubd m5, m8, m5 2759 REPX {psrad x, 15}, m2, m3, m4, m5 2760 ret 2761 2762INV_TXFM_8X8_FN flipadst, dct, 12 2763INV_TXFM_8X8_FN flipadst, adst, 12 2764INV_TXFM_8X8_FN flipadst, flipadst, 12 2765INV_TXFM_8X8_FN flipadst, identity, 12 2766 2767cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 2768 vpbroadcastd m12, [clip_20b_min] 2769 vpbroadcastd m13, [clip_20b_max] 2770 jmp m(iflipadst_8x8_internal_10bpc).pass1 2771.pass2: 2772 call m(iadst_8x8_internal_12bpc).pass2_main 2773 packssdw m7, m7, m6 2774 packssdw m6, m1, m0 2775 packssdw m1, m5, m4 2776 vpermq m0, m7, q3120 2777 vpermq m1, m1, q3120 2778 call m(idct_8x8_internal_12bpc).write_8x4_start 2779 call m(idct_8x8_internal_10bpc).write_8x4 2780 packssdw m0, m3, m2 2781 vpermq m0, m0, q3120 2782 vpermq m1, m6, q3120 2783 call m(idct_8x8_internal_10bpc).write_8x4 2784 RET 2785 2786INV_TXFM_8X8_FN identity, dct, 12 2787INV_TXFM_8X8_FN identity, adst, 12 2788INV_TXFM_8X8_FN identity, flipadst, 12 2789INV_TXFM_8X8_FN identity, identity, 12 2790 2791cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 2792 jmp m(iidentity_8x8_internal_10bpc).pass1 2793.pass2: 2794 packssdw m3, m7 2795 vpbroadcastd m7, [pixel_12bpc_max] 2796 jmp m(iidentity_8x8_internal_10bpc).pass2_main 2797 2798%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth 2799 INV_TXFM_FN %1, %2, %3, 8x16, %4 2800%ifidn %1_%2, dct_dct 2801 imul r6d, [cq], 181 2802 vpbroadcastd m2, [dconly_%4bpc] 2803 mov [cq], eobd ; 0 2804 or r3d, 16 2805 add r6d, 128 2806 sar r6d, 8 2807 imul r6d, 181 2808 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 2809%endif 2810%endmacro 2811 2812INV_TXFM_8X16_FN dct, dct 2813INV_TXFM_8X16_FN dct, identity, 35 2814INV_TXFM_8X16_FN 
dct, adst 2815INV_TXFM_8X16_FN dct, flipadst 2816 2817cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 2818%undef cmp 2819 vpbroadcastd m12, [clip_18b_min] 2820 vpbroadcastd m13, [clip_18b_max] 2821.pass1: 2822 vpbroadcastd m14, [pd_2896] 2823 vpbroadcastd m11, [pd_2048] 2824 cmp eobd, 43 2825 jl .fast 2826 add cq, 32 2827 call .pass1_main 2828 sub cq, 32 2829 mova [cq+32* 1], m0 2830 mova [cq+32* 3], m1 2831 mova [cq+32* 5], m2 2832 mova [cq+32* 7], m3 2833 mova [cq+32* 9], m4 2834 mova [cq+32*11], m5 2835 mova [cq+32*13], m6 2836 mova m15, m7 2837 call .pass1_main 2838 mova m8, [cq+32* 1] 2839 mova m9, [cq+32* 3] 2840 mova m10, [cq+32* 5] 2841 mova m11, [cq+32* 7] 2842 mova m12, [cq+32* 9] 2843 mova m13, [cq+32*11] 2844 mova m14, [cq+32*13] 2845 jmp tx2q 2846.fast: 2847 call .pass1_main 2848 pxor m8, m8 2849 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 2850 jmp tx2q 2851.pass2: 2852 call .transpose 2853 call m(idct_8x16_internal_8bpc).main 2854 vpbroadcastd m12, [pw_2048] 2855 REPX {vpermq x, x, q3120}, m0, m2, m4, m6 2856 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 2857.end: 2858 pmulhrsw m0, m12 2859 pmulhrsw m1, m12 2860 call m(idct_8x8_internal_10bpc).write_8x4_start 2861 pmulhrsw m0, m2, m12 2862 pmulhrsw m1, m3, m12 2863 call m(idct_8x8_internal_10bpc).write_8x4 2864 pmulhrsw m0, m4, m12 2865 pmulhrsw m1, m5, m12 2866 call m(idct_8x8_internal_10bpc).write_8x4 2867 pmulhrsw m0, m6, m12 2868 pmulhrsw m1, m7, m12 2869 call m(idct_8x8_internal_10bpc).write_8x4 2870 RET 2871ALIGN function_align 2872.transpose: 2873 packssdw m0, m8 2874 packssdw m1, m9 2875 packssdw m2, m10 2876 packssdw m3, m11 2877 packssdw m4, m12 2878 packssdw m5, m13 2879 packssdw m6, m14 2880 packssdw m7, m15 2881 lea r6, [deint_shuf+128] 2882 punpckhwd m8, m0, m1 2883 punpcklwd m0, m1 2884 punpckhwd m1, m2, m3 2885 punpcklwd m2, m3 2886 punpcklwd m3, m4, m5 2887 punpckhwd m4, m5 2888 punpckhwd m5, m6, m7 2889 punpcklwd m6, m7 2890 punpckhdq m7, m3, m6 2891 punpckldq m3, m6 2892 punpckhdq m6, m4, m5 2893 punpckldq m4, m5 2894 punpckhdq m5, m8, m1 2895 punpckldq m8, m1 2896 punpckhdq m1, m0, m2 2897 punpckldq m0, m2 2898 vperm2i128 m2, m0, m3, 0x31 2899 vinserti128 m0, xm3, 1 2900 vperm2i128 m3, m1, m7, 0x31 2901 vinserti128 m1, xm7, 1 2902 vperm2i128 m7, m5, m6, 0x31 2903 vinserti128 m5, xm6, 1 2904 vperm2i128 m6, m8, m4, 0x31 2905 vinserti128 m4, m8, xm4, 1 2906 ret 2907ALIGN function_align 2908.pass1_main: 2909 pmulld m0, m14, [cq+32* 0] 2910 pmulld m1, m14, [cq+32* 2] 2911 pmulld m2, m14, [cq+32* 4] 2912 pmulld m3, m14, [cq+32* 6] 2913 pmulld m4, m14, [cq+32* 8] 2914 pmulld m5, m14, [cq+32*10] 2915 pmulld m6, m14, [cq+32*12] 2916 pmulld m7, m14, [cq+32*14] 2917 call m(idct_8x8_internal_10bpc).main_rect2 2918 jmp m(idct_8x8_internal_10bpc).round_shift1 2919ALIGN function_align 2920.main_evenhalf: 2921 paddd m1, m6, m7 ; idct8 out1 2922 psubd m6, m7 ; idct8 out6 2923 psubd m7, m0, m9 ; idct8 out7 2924 paddd m0, m9 ; idct8 out0 2925 paddd m2, m5, m4 ; idct8 out2 2926 psubd m5, m4 ; idct8 out5 2927 psubd m4, m3, m8 ; idct8 out4 2928 paddd m3, m8 ; idct8 out3 2929 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 2930 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 2931 ret 2932.main_oddhalf_fast_rect2: 2933 REPX {paddd x, m11}, m0, m1, m2, m3 2934 REPX {psrad x, 12 }, m0, m1, m2, m3 2935.main_oddhalf_fast: ; lower half zero 2936 vpbroadcastd m7, [pd_4076] 2937 vpbroadcastd m8, [pd_401] 2938 vpbroadcastd m6, [pd_m1189] 2939 vpbroadcastd m9, [pd_3920] 2940 vpbroadcastd m5, [pd_3612] 
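    ; Fast path: the lower half of the odd-row inputs is zero, so each initial
    ; ITX_MULSUB_2D rotation of the full .main_oddhalf collapses to scaling a
    ; single input by a constant pair, e.g. (sketch, before the +2048 >> 12
    ; rounding applied in .main_oddhalf_fast2):
    ;   t15a ~= in1 * 4076    t8a  ~= in1 * 401
    ;   t12a ~= in3 * 3920    t11a ~= in3 * -1189
    ; with 401/4076 ~= 4096*sin/cos(pi/32), and likewise for the other pairs
    ; loaded here.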
2941 vpbroadcastd m10, [pd_1931] 2942 vpbroadcastd m4, [pd_m2598] 2943 vpbroadcastd m15, [pd_3166] 2944 pmulld m7, m0 2945 pmulld m0, m8 2946 pmulld m6, m1 2947 pmulld m1, m9 2948 pmulld m5, m2 2949 pmulld m2, m10 2950 pmulld m4, m3 2951 pmulld m3, m15 2952 jmp .main_oddhalf_fast2 2953.main_oddhalf_rect2: 2954 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 2955 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 2956.main_oddhalf: 2957 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a 2958 ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a 2959 ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a 2960 ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a 2961.main_oddhalf_fast2: 2962 REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 2963 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 2964 psubd m8, m0, m4 ; t9 2965 paddd m0, m4 ; t8 2966 psubd m4, m6, m2 ; t10 2967 paddd m2, m6 ; t11 2968 psubd m6, m1, m5 ; t13 2969 paddd m5, m1 ; t12 2970 psubd m1, m7, m3 ; t14 2971 paddd m7, m3 ; t15 2972 REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 2973 REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 2974 vpbroadcastd m15, [pd_3784] 2975 vpbroadcastd m10, [pd_1567] 2976 ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 2977 ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 2978 psubd m3, m1, m4 ; t10 2979 paddd m1, m4 ; t9 2980 psubd m4, m0, m2 ; t11a 2981 paddd m0, m2 ; t8a 2982 psubd m2, m8, m6 ; t13 2983 paddd m6, m8 ; t14 2984 psubd m8, m7, m5 ; t12a 2985 paddd m7, m5 ; t15a 2986 REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 2987 REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 2988 REPX {pmulld x, m14}, m2, m8, m3, m4 2989 paddd m2, m11 2990 paddd m8, m11 2991 paddd m5, m2, m3 ; t13a 2992 psubd m2, m3 ; t10a 2993 psubd m3, m8, m4 ; t11 2994 paddd m4, m8 ; t12 2995 REPX {psrad x, 12}, m5, m2, m3, m4 2996 mova [r6-32*4], m7 2997 mova [r6-32*3], m6 2998 mova [r6-32*2], m5 2999 mova [r6-32*1], m4 3000 mova [r6+32*0], m3 3001 mova [r6+32*1], m2 3002 mova [r6+32*2], m1 3003 mova [r6+32*3], m0 3004 ret 3005 3006INV_TXFM_8X16_FN adst, dct 3007INV_TXFM_8X16_FN adst, adst 3008INV_TXFM_8X16_FN adst, flipadst 3009INV_TXFM_8X16_FN adst, identity, 35 3010 3011cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 3012%undef cmp 3013 vpbroadcastd m12, [clip_18b_min] 3014 vpbroadcastd m13, [clip_18b_max] 3015.pass1: 3016 vpbroadcastd m14, [pd_2896] 3017 vpbroadcastd m11, [pd_2048] 3018 cmp eobd, 43 3019 jl .fast 3020 add cq, 32 3021 call .pass1_main 3022 call m(iadst_8x8_internal_10bpc).main_end 3023 sub cq, 32 3024 mova [cq+32* 1], m0 3025 mova [cq+32* 3], m1 3026 mova [cq+32* 5], m2 3027 mova [cq+32* 7], m3 3028 mova [cq+32* 9], m4 3029 mova [cq+32*11], m5 3030 mova [cq+32*13], m6 3031 mova m15, m7 3032 call .pass1_main 3033 call m(iadst_8x8_internal_10bpc).main_end 3034 mova m8, [cq+32* 1] 3035 mova m9, [cq+32* 3] 3036 mova m10, [cq+32* 5] 3037 mova m11, [cq+32* 7] 3038 mova m12, [cq+32* 9] 3039 mova m13, [cq+32*11] 3040 mova m14, [cq+32*13] 3041 jmp tx2q 3042.fast: 3043 call .pass1_main 3044 call m(iadst_8x8_internal_10bpc).main_end 3045 pxor m8, m8 3046 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 3047 jmp tx2q 3048.pass2: 3049 call m(idct_8x16_internal_10bpc).transpose 3050 call m(iadst_8x16_internal_8bpc).main 3051 call m(iadst_8x16_internal_8bpc).main_pass2_end 3052 vpbroadcastd m8, [pw_2048] 3053 vpbroadcastd xm12, [pw_4096] 3054 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 3055 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 3056 psubw m12, m8 3057 jmp 
m(idct_8x16_internal_10bpc).end 3058ALIGN function_align 3059.pass1_main: 3060 pmulld m0, m14, [cq+32* 0] 3061 pmulld m7, m14, [cq+32*14] 3062 pmulld m1, m14, [cq+32* 2] 3063 pmulld m6, m14, [cq+32*12] 3064 pmulld m2, m14, [cq+32* 4] 3065 pmulld m5, m14, [cq+32*10] 3066 pmulld m3, m14, [cq+32* 6] 3067 pmulld m4, m14, [cq+32* 8] 3068 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 3069 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 3070 jmp m(iadst_8x8_internal_10bpc).main2 3071 3072INV_TXFM_8X16_FN flipadst, dct 3073INV_TXFM_8X16_FN flipadst, adst 3074INV_TXFM_8X16_FN flipadst, flipadst 3075INV_TXFM_8X16_FN flipadst, identity, 35 3076 3077cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 3078%undef cmp 3079 vpbroadcastd m12, [clip_18b_min] 3080 vpbroadcastd m13, [clip_18b_max] 3081.pass1: 3082 vpbroadcastd m14, [pd_2896] 3083 vpbroadcastd m11, [pd_2048] 3084 cmp eobd, 43 3085 jl .fast 3086 add cq, 32 3087 call m(iadst_8x16_internal_10bpc).pass1_main 3088 call m(iflipadst_8x8_internal_10bpc).main_end 3089 sub cq, 32 3090 mova [cq+32* 1], m0 3091 mova [cq+32* 3], m1 3092 mova [cq+32* 5], m2 3093 mova [cq+32* 7], m3 3094 mova [cq+32* 9], m4 3095 mova [cq+32*11], m5 3096 mova [cq+32*13], m6 3097 mova m15, m7 3098 call m(iadst_8x16_internal_10bpc).pass1_main 3099 call m(iflipadst_8x8_internal_10bpc).main_end 3100 mova m8, [cq+32* 1] 3101 mova m9, [cq+32* 3] 3102 mova m10, [cq+32* 5] 3103 mova m11, [cq+32* 7] 3104 mova m12, [cq+32* 9] 3105 mova m13, [cq+32*11] 3106 mova m14, [cq+32*13] 3107 jmp tx2q 3108.fast: 3109 call m(iadst_8x16_internal_10bpc).pass1_main 3110 call m(iflipadst_8x8_internal_10bpc).main_end 3111 pxor m8, m8 3112 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 3113 jmp tx2q 3114.pass2: 3115 call m(idct_8x16_internal_10bpc).transpose 3116 call m(iadst_8x16_internal_8bpc).main 3117 call m(iadst_8x16_internal_8bpc).main_pass2_end 3118 vpbroadcastd m12, [pw_2048] 3119 vpbroadcastd xm13, [pw_4096] 3120 mova m11, m0 3121 vpermq m0, m7, q2031 3122 mova m10, m1 3123 vpermq m1, m6, q2031 3124 mova m9, m2 3125 vpermq m2, m5, q2031 3126 mova m8, m3 3127 vpermq m3, m4, q2031 3128 vpermq m4, m8, q3120 3129 vpermq m5, m9, q3120 3130 vpermq m6, m10, q3120 3131 vpermq m7, m11, q3120 3132 psubw m12, m13 3133 jmp m(idct_8x16_internal_10bpc).end 3134 3135INV_TXFM_8X16_FN identity, dct 3136INV_TXFM_8X16_FN identity, adst 3137INV_TXFM_8X16_FN identity, flipadst 3138INV_TXFM_8X16_FN identity, identity 3139 3140%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] 3141 pmulhrsw m%2, m%3, m%1 3142%if %0 == 4 ; if downshifting by 1 3143%ifnum %4 3144 pmulhrsw m%2, m%4 3145%else ; without rounding 3146 psraw m%2, 1 3147%endif 3148%else 3149 paddsw m%1, m%1 3150%endif 3151 paddsw m%1, m%2 3152%endmacro 3153 3154cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 3155.pass1: 3156 vpbroadcastd m15, [pd_2896] 3157 pmulld m0, m15, [cq+32* 0] 3158 pmulld m8, m15, [cq+32* 1] 3159 pmulld m1, m15, [cq+32* 2] 3160 pmulld m9, m15, [cq+32* 3] 3161 pmulld m2, m15, [cq+32* 4] 3162 pmulld m10, m15, [cq+32* 5] 3163 pmulld m3, m15, [cq+32* 6] 3164 pmulld m11, m15, [cq+32* 7] 3165 pmulld m4, m15, [cq+32* 8] 3166 pmulld m12, m15, [cq+32* 9] 3167 pmulld m5, m15, [cq+32*10] 3168 pmulld m13, m15, [cq+32*11] 3169 pmulld m6, m15, [cq+32*12] 3170 pmulld m14, m15, [cq+32*13] 3171 pmulld m7, m15, [cq+32*14] 3172 pmulld m15, [cq+32*15] 3173 mova [cq], m7 3174 vpbroadcastd m7, [pd_2048] 3175 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ 3176 m8, m9, m10, m11, m12, 
m13, m14, m15 3177 paddd m7, [cq] 3178 REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3179 m8, m9, m10, m11, m12, m13, m14, m15 3180 jmp tx2q 3181.pass2: 3182 packssdw m0, m8 3183 packssdw m1, m9 3184 packssdw m2, m10 3185 packssdw m3, m11 3186 packssdw m4, m12 3187 packssdw m5, m13 3188 packssdw m6, m14 3189 packssdw m13, m7, m15 3190 vpbroadcastd m8, [pw_1697x16] 3191 REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 3192 vpbroadcastd m7, [pixel_10bpc_max] 3193 vpbroadcastd m12, [pw_2048] 3194 call .pass2_end 3195 RET 3196ALIGN function_align 3197.pass2_end: 3198 punpckhwd m9, m0, m1 3199 punpcklwd m0, m1 3200 punpckhwd m1, m6, m13 3201 punpcklwd m6, m13 3202 punpckhwd m13, m4, m5 3203 punpcklwd m4, m5 3204 punpcklwd m5, m2, m3 3205 punpckhwd m2, m3 3206 punpckhdq m3, m0, m5 3207 punpckldq m0, m5 3208 punpckhdq m11, m9, m2 3209 punpckldq m9, m2 3210 punpckldq m2, m4, m6 3211 punpckhdq m4, m6 3212 punpckldq m6, m13, m1 3213 punpckhdq m13, m1 3214 punpckhqdq m1, m0, m2 3215 punpcklqdq m0, m2 3216 punpcklqdq m2, m3, m4 3217 punpckhqdq m3, m4 3218 punpcklqdq m8, m9, m6 3219 punpckhqdq m9, m6 3220 punpcklqdq m10, m11, m13 3221 punpckhqdq m11, m13 3222 pmulhrsw m0, m12 3223 pmulhrsw m1, m12 3224 call m(iidentity_8x8_internal_10bpc).write_2x8x2_start 3225 pmulhrsw m0, m12, m2 3226 pmulhrsw m1, m12, m3 3227 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero 3228 pmulhrsw m0, m12, m8 3229 pmulhrsw m1, m12, m9 3230 lea dstq, [dstq+strideq*4] 3231 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero 3232 pmulhrsw m0, m12, m10 3233 pmulhrsw m1, m12, m11 3234 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero 3235 ret 3236 3237INV_TXFM_8X16_FN dct, dct, 0, 12 3238INV_TXFM_8X16_FN dct, identity, 35, 12 3239INV_TXFM_8X16_FN dct, adst, 0, 12 3240INV_TXFM_8X16_FN dct, flipadst, 0, 12 3241 3242cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 3243 vpbroadcastd m12, [clip_20b_min] 3244 vpbroadcastd m13, [clip_20b_max] 3245 jmp m(idct_8x16_internal_10bpc).pass1 3246.pass2: 3247 lea r6, [rsp+32*4] 3248 call .transpose 3249 vpbroadcastd m12, [clip_18b_min] 3250 vpbroadcastd m13, [clip_18b_max] 3251 mova [cq+32* 8], m0 3252 mova [cq+32*10], m2 3253 mova [cq+32*12], m4 3254 mova [cq+32*14], m6 3255 pmaxsd m0, m12, [cq+32* 1] 3256 pmaxsd m4, m12, m1 3257 pmaxsd m1, m12, [cq+32* 3] 3258 pmaxsd m2, m12, [cq+32* 5] 3259 pmaxsd m6, m12, m5 3260 pmaxsd m5, m12, m3 3261 pmaxsd m3, m12, [cq+32* 7] 3262 pmaxsd m7, m12 3263 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 3264 vpbroadcastd m11, [pd_2048] 3265 vpbroadcastd m14, [pd_2896] 3266 call m(idct_8x16_internal_10bpc).main_oddhalf 3267 pmaxsd m0, m12, [cq+32* 0] 3268 pmaxsd m1, m12, [cq+32* 2] 3269 pmaxsd m2, m12, [cq+32* 4] 3270 pmaxsd m3, m12, [cq+32* 6] 3271 pmaxsd m4, m12, [cq+32* 8] 3272 pmaxsd m5, m12, [cq+32*10] 3273 pmaxsd m6, m12, [cq+32*12] 3274 pmaxsd m7, m12, [cq+32*14] 3275 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 3276 call m(idct_8x8_internal_10bpc).main 3277 call m(idct_8x16_internal_10bpc).main_evenhalf 3278 vpbroadcastd m11, [pd_8] 3279 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 3280 call m(idct_16x8_internal_10bpc).pass1_rotations 3281 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3282 m8, m9, m10, m11, m12, m13, m14, m15 3283.end: 3284 packssdw m0, m1 3285 packssdw m1, m2, m3 3286 packssdw m2, m4, m5 3287 packssdw m3, m6, m7 3288 packssdw m4, m8, m9 3289 packssdw m5, m10, m11 3290 packssdw m6, m12, m13 3291 packssdw m7, m14, m15 3292 vpermq m0, m0, q3120 3293 vpermq m1, m1, q3120 
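    ; The 32-bit outputs were packed to words and permuted above. The only
    ; 12bpc-specific part of the store is write_8x4_start, which sets up
    ; pixel_12bpc_max, r6 = strideq*3 and a zero register; the shared 10bpc
    ; write_8x4 loop (add residual to dst, clamp to [0, max], zero 4 rows of
    ; cq) is then called once per 8x4 strip, four strips for the 8x16 block.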
3294 call m(idct_8x8_internal_12bpc).write_8x4_start 3295 call m(idct_8x8_internal_10bpc).write_8x4 3296 vpermq m0, m2, q3120 3297 vpermq m1, m3, q3120 3298 call m(idct_8x8_internal_10bpc).write_8x4 3299 vpermq m0, m4, q3120 3300 vpermq m1, m5, q3120 3301 call m(idct_8x8_internal_10bpc).write_8x4 3302 vpermq m0, m6, q3120 3303 vpermq m1, m7, q3120 3304 call m(idct_8x8_internal_10bpc).write_8x4 3305 RET 3306ALIGN function_align 3307.transpose: 3308 mova [cq+32* 8], m8 3309 mova [cq+32* 9], m9 3310 mova [cq+32*10], m10 3311 mova [cq+32*11], m11 3312 call m(idct_8x8_internal_12bpc).transpose_8x8 3313 mova [cq+32* 0], m0 3314 mova [cq+32* 1], m1 3315 mova [cq+32* 2], m2 3316 mova [cq+32* 3], m3 3317 mova [cq+32* 4], m4 3318 mova [cq+32* 5], m5 3319 mova [cq+32* 6], m6 3320 mova [cq+32* 7], m7 3321 mova m0, [cq+32* 8] 3322 mova m1, [cq+32* 9] 3323 mova m2, [cq+32*10] 3324 mova m3, [cq+32*11] 3325 mova m4, m12 3326 mova m5, m13 3327 mova m6, m14 3328 mova m7, m15 3329 jmp m(idct_8x8_internal_12bpc).transpose_8x8 3330 3331INV_TXFM_8X16_FN adst, dct, 0, 12 3332INV_TXFM_8X16_FN adst, adst, 0, 12 3333INV_TXFM_8X16_FN adst, flipadst, 0, 12 3334INV_TXFM_8X16_FN adst, identity, 35, 12 3335 3336cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 3337 vpbroadcastd m12, [clip_20b_min] 3338 vpbroadcastd m13, [clip_20b_max] 3339 jmp m(iadst_8x16_internal_10bpc).pass1 3340.pass2: 3341 lea r6, [rsp+32*4] 3342 call .pass2_main 3343 call m(iadst_16x8_internal_10bpc).pass1_rotations 3344.pass2_end: 3345 REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 3346 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 3347 jmp m(idct_8x16_internal_12bpc).end 3348ALIGN function_align 3349.pass2_main: 3350 call m(idct_8x16_internal_12bpc).transpose 3351 vpbroadcastd m13, [clip_18b_min] 3352 vpbroadcastd m14, [clip_18b_max] 3353 mova [cq+32* 8], m0 3354 mova [cq+32*11], m3 3355 mova [cq+32*12], m4 3356 mova [cq+32*15], m7 3357 pmaxsd m0, m13, [cq+32* 2] ; 2 3358 pmaxsd m3, m13, m1 ; 9 3359 pmaxsd m1, m13, m5 ; 13 3360 pmaxsd m4, m13, m2 ; 10 3361 pmaxsd m2, m13, [cq+32* 6] ; 6 3362 pmaxsd m5, m13, [cq+32* 5] ; 5 3363 pmaxsd m6, m13, m6 ; 14 3364 pmaxsd m7, m13, [cq+32* 1] ; 1 3365 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 3366 vpbroadcastd m12, [pd_2048] 3367 vpbroadcastd m15, [pd_2896] 3368 call m(iadst_16x8_internal_10bpc).main_part1 3369 pmaxsd m0, m13, [cq+32* 0] ; 0 3370 pmaxsd m1, m13, [cq+32*15] ; 15 3371 pmaxsd m2, m13, [cq+32* 4] ; 4 3372 pmaxsd m3, m13, [cq+32*11] ; 11 3373 pmaxsd m4, m13, [cq+32* 8] ; 8 3374 pmaxsd m5, m13, [cq+32* 7] ; 7 3375 pmaxsd m6, m13, [cq+32*12] ; 12 3376 pmaxsd m7, m13, [cq+32* 3] ; 3 3377 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 3378 call m(iadst_16x8_internal_10bpc).main_part2 3379 vpbroadcastd m14, [pd_17408] 3380 psrld m15, 11 ; pd_1 3381 psubd m13, m14, m15 ; pd_17407 3382 pslld m15, 3 ; pd_8 3383 ret 3384 3385INV_TXFM_8X16_FN flipadst, dct, 0, 12 3386INV_TXFM_8X16_FN flipadst, adst, 0, 12 3387INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 3388INV_TXFM_8X16_FN flipadst, identity, 35, 12 3389 3390cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 3391 vpbroadcastd m12, [clip_20b_min] 3392 vpbroadcastd m13, [clip_20b_max] 3393 jmp m(iflipadst_8x16_internal_10bpc).pass1 3394.pass2: 3395 lea r6, [rsp+32*4] 3396 call m(iadst_8x16_internal_12bpc).pass2_main 3397 call m(iflipadst_16x8_internal_10bpc).pass1_rotations 3398 jmp m(iadst_8x16_internal_12bpc).pass2_end 3399 3400INV_TXFM_8X16_FN identity, dct, 0, 12 
3401INV_TXFM_8X16_FN identity, adst, 0, 12 3402INV_TXFM_8X16_FN identity, flipadst, 0, 12 3403INV_TXFM_8X16_FN identity, identity, 0, 12 3404 3405cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 3406 jmp m(iidentity_8x16_internal_10bpc).pass1 3407.pass2: 3408 call .pass2_main 3409 packssdw m0, m8 3410 packssdw m1, m9 3411 packssdw m2, m10 3412 packssdw m3, m11 3413 packssdw m4, m12 3414 packssdw m5, m13 3415 packssdw m6, m14 3416 packssdw m13, m7, m15 3417 vpbroadcastd m7, [pixel_12bpc_max] 3418 vpbroadcastd m12, [pw_16384] 3419 call m(iidentity_8x16_internal_10bpc).pass2_end 3420 RET 3421ALIGN function_align 3422.pass2_main: 3423 mova [cq], m7 3424 vpbroadcastd m7, [clip_18b_min] 3425 REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ 3426 m8, m9, m10, m11, m12, m13, m14, m15 3427 pmaxsd m7, [cq] 3428 mova [cq], m15 3429 vpbroadcastd m15, [clip_18b_max] 3430 REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3431 m8, m9, m10, m11, m12, m13, m14 3432 pminsd m15, [cq] 3433 mova [cq], m7 3434 vpbroadcastd m7, [pd_5793] 3435 REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ 3436 m8, m9, m10, m11, m12, m13, m14, m15 3437 pmulld m7, [cq] 3438 mova [cq], m15 3439 vpbroadcastd m15, [pd_1024] 3440 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3441 m8, m9, m10, m11, m12, m13, m14 3442 paddd m15, [cq] 3443 REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3444 m8, m9, m10, m11, m12, m13, m14, m15 3445 ret 3446 3447%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth 3448 INV_TXFM_FN %1, %2, 0, 16x4, %3 3449%ifidn %1_%2, dct_dct 3450 vpbroadcastd m3, [dconly_%3bpc] 3451%if %3 = 10 3452.dconly: 3453 imul r6d, [cq], 181 3454 mov [cq], eobd ; 0 3455 or r3d, 4 3456.dconly2: 3457 add r6d, 384 3458 sar r6d, 9 3459.dconly3: 3460 imul r6d, 181 3461 add r6d, 2176 3462 sar r6d, 12 3463 movd xm0, r6d 3464 paddsw xm0, xm3 3465 vpbroadcastw m0, xm0 3466.dconly_loop: 3467 paddsw m1, m0, [dstq+strideq*0] 3468 paddsw m2, m0, [dstq+strideq*1] 3469 psubusw m1, m3 3470 psubusw m2, m3 3471 mova [dstq+strideq*0], m1 3472 mova [dstq+strideq*1], m2 3473 lea dstq, [dstq+strideq*2] 3474 sub r3d, 2 3475 jg .dconly_loop 3476 RET 3477%else 3478 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly 3479%endif 3480%endif 3481%endmacro 3482 3483INV_TXFM_16X4_FN dct, dct 3484INV_TXFM_16X4_FN dct, identity 3485INV_TXFM_16X4_FN dct, adst 3486INV_TXFM_16X4_FN dct, flipadst 3487 3488cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 3489 vpbroadcastd m8, [clip_18b_min] 3490 vpbroadcastd m9, [clip_18b_max] 3491.pass1: 3492 vbroadcasti128 m0, [cq+16* 0] 3493 vbroadcasti128 m4, [cq+16* 4] 3494 vbroadcasti128 m1, [cq+16* 2] 3495 vbroadcasti128 m7, [cq+16* 6] 3496 vbroadcasti128 m5, [cq+16*10] 3497 vbroadcasti128 m2, [cq+16* 8] 3498 vbroadcasti128 m6, [cq+16*12] 3499 vbroadcasti128 m3, [cq+16*14] 3500 shufpd m0, m4, 0x0c ; 0 4 3501 shufpd m1, m5, 0x0c ; 2 10 3502 shufpd m2, m6, 0x0c ; 8 12 3503 shufpd m3, m7, 0x0c ; 14 6 3504 call .pass1_main 3505 vbroadcasti128 m10, [cq+16* 1] 3506 vbroadcasti128 m4, [cq+16* 5] 3507 vbroadcasti128 m11, [cq+16*15] 3508 vbroadcasti128 m5, [cq+16*11] 3509 shufpd m10, m4, 0x0c ; 1 5 3510 shufpd m11, m5, 0x0c ; 15 11 3511 vbroadcasti128 m5, [cq+16* 9] 3512 vbroadcasti128 m4, [cq+16*13] 3513 shufpd m5, m4, 0x0c ; 9 13 3514 vbroadcasti128 m6, [cq+16* 7] 3515 vbroadcasti128 m4, [cq+16* 3] 3516 shufpd m6, m4, 0x0c ; 7 3 3517 call .pass1_main2 3518 pcmpeqd m4, m4 3519 REPX {psubd x, m4}, m0, m1, m2, m3 3520 call .pass1_main3 3521 REPX {psrad x, 1 }, m0, 
m1, m2, m3, m4, m5, m6, m7 3522 jmp tx2q 3523.pass2: 3524 call .transpose_4x16_packed 3525 lea r6, [deint_shuf+128] 3526 call m(idct_16x4_internal_8bpc).main 3527.end: 3528 vpbroadcastd m4, [pw_2048] 3529 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 3530 vpbroadcastd m5, [pixel_10bpc_max] 3531.end2: 3532 paddw m0, [dstq+strideq*0] 3533 paddw m1, [dstq+strideq*1] 3534.end3: 3535 lea r6, [dstq+strideq*2] 3536 paddw m2, [r6 +strideq*0] 3537 paddw m3, [r6 +strideq*1] 3538 pxor m4, m4 3539 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 3540 REPX {pmaxsw x, m4}, m0, m1, m2, m3 3541 REPX {pminsw x, m5}, m0, m1, m2, m3 3542 mova [dstq+strideq*0], m0 3543 mova [dstq+strideq*1], m1 3544 mova [r6 +strideq*0], m2 3545 mova [r6 +strideq*1], m3 3546 RET 3547ALIGN function_align 3548.pass1_main: 3549 vpbroadcastd m7, [pd_2048] 3550 call m(idct_8x4_internal_10bpc).main 3551 psubd m3, m0, m4 ; idct8 out7 out6 3552 paddd m0, m4 ; idct8 out0 out1 3553 paddd m1, m2, m5 ; idct8 out3 out2 3554 psubd m2, m5 ; idct8 out4 out5 3555 ret 3556ALIGN function_align 3557.pass1_main2: 3558 ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 3559 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 3560 vbroadcasti128 m12, [pd_3784_m3784] 3561 psubd m4, m10, m5 3562 paddd m10, m5 ; t8 t11 3563 psignd m4, m12 ; t9 t10 3564 psubd m5, m11, m6 3565 paddd m11, m6 ; t15 t12 3566 psignd m5, m12 ; t14 t13 3567 vpbroadcastd m6, [pd_1567] 3568 vpbroadcastd m13, [pd_3784] 3569 REPX {pmaxsd x, m8}, m5, m4 3570 REPX {pminsd x, m9}, m5, m4 3571 pmulld m12, m5 3572 pmulld m5, m6 3573 vbroadcasti128 m6, [pd_1567_m1567] 3574 pmulld m13, m4 3575 pmulld m4, m6 3576 REPX {pmaxsd x, m8}, m10, m11, m0, m1 3577 REPX {pminsd x, m9}, m10, m11, m0, m1 3578 paddd m12, m7 3579 paddd m5, m7 3580 paddd m4, m12 3581 psubd m5, m13 3582 psrad m4, 12 ; t14a t10a 3583 psrad m5, 12 ; t9a t13a 3584 vpbroadcastd m12, [pd_2896] 3585 punpckhqdq m6, m11, m5 3586 punpcklqdq m11, m4 3587 punpckhqdq m4, m10, m4 3588 punpcklqdq m10, m5 3589 psubd m5, m11, m6 ; t12a t13 3590 paddd m11, m6 ; t15a t14 3591 psubd m6, m10, m4 ; t11a t10 3592 paddd m10, m4 ; t8a t9 3593 REPX {pmaxsd x, m8}, m5, m6 3594 REPX {pminsd x, m9}, m5, m6 3595 pmulld m5, m12 3596 pmulld m6, m12 3597 REPX {pmaxsd x, m8}, m2, m3, m11, m10 3598 REPX {pminsd x, m9}, m2, m3, m11, m10 3599 ret 3600ALIGN function_align 3601.pass1_main3: 3602 paddd m5, m7 3603 psubd m4, m5, m6 3604 paddd m5, m6 3605 psrad m4, 12 ; t11 t10a 3606 psrad m5, 12 ; t12 t13a 3607 psubd m7, m0, m11 ; out15 out14 3608 paddd m0, m11 ; out0 out1 3609 psubd m6, m1, m5 ; out12 out13 3610 paddd m1, m5 ; out3 out2 3611 psubd m5, m2, m4 ; out11 out10 3612 paddd m2, m4 ; out4 out5 3613 psubd m4, m3, m10 ; out8 out9 3614 paddd m3, m10 ; out7 out6 3615 REPX {pshufd x, x, q1032}, m1, m3, m5, m7 3616 ret 3617ALIGN function_align 3618.transpose_4x16_packed: 3619 vbroadcasti128 m8, [deint_shuf] 3620 packssdw m0, m1 3621 packssdw m2, m3 3622 packssdw m4, m5 3623 packssdw m6, m7 3624 REPX {pshufb x, m8}, m0, m2, m4, m6 3625 punpckhqdq m1, m0, m2 3626 punpcklqdq m0, m2 3627 punpckhqdq m2, m4, m6 3628 punpcklqdq m4, m6 3629 vperm2i128 m3, m1, m2, 0x31 3630 vinserti128 m1, xm2, 1 3631 vperm2i128 m2, m0, m4, 0x31 3632 vinserti128 m0, xm4, 1 3633 ret 3634 3635INV_TXFM_16X4_FN adst, dct 3636INV_TXFM_16X4_FN adst, adst 3637INV_TXFM_16X4_FN adst, flipadst 3638INV_TXFM_16X4_FN adst, identity 3639 3640cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 3641 vpbroadcastd m12, [clip_18b_min] 3642 vpbroadcastd m13, [clip_18b_max] 
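    ; Pass 1 reuses the shared 4x16 ADST kernel. On return m11 still holds
    ; pd_2048 (see the "; pd_1" comment below), so "psrad m11, 11" turns it
    ; into the +1 rounding bias and the outputs become
    ;   out = (x + 1) >> 1
    ; before being handed to pass 2.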
3643.pass1: 3644 call m(iadst_4x16_internal_10bpc).main 3645 psrad m11, 11 ; pd_1 3646 REPX {paddd x, m11}, m0, m1, m2, m3 3647 paddd m4, m5, m11 3648 paddd m5, m6, m11 3649 paddd m6, m7, m11 3650 paddd m7, m8, m11 3651.pass1_end: 3652 REPX {pshufd x, x, q1032}, m0, m2, m4, m6 3653 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 3654 jmp tx2q 3655.pass2: 3656 call m(idct_16x4_internal_10bpc).transpose_4x16_packed 3657 lea r6, [deint_shuf+128] 3658 call m(iadst_16x4_internal_8bpc).main 3659 jmp m(idct_16x4_internal_10bpc).end 3660ALIGN function_align 3661.main: 3662 vpbroadcastd m6, [pd_1321] 3663 mova m0, [cq+32*0] 3664 mova m1, [cq+32*1] 3665 vpbroadcastd m7, [pd_2482] 3666 mova m2, [cq+32*6] 3667 mova m3, [cq+32*7] 3668 pmulld m4, m0, m6 3669 pmulld m5, m1, m6 ; 1321*in0 3670 pmulld m9, m2, m7 3671 pmulld m8, m3, m7 ; 2482*in3 3672 paddd m4, m9 3673 paddd m8, m5 ; 1321*in0 + 2482*in3 3674 pmulld m5, m0, m7 3675 pmulld m9, m1, m7 ; 2482*in0 3676 paddd m0, m2 3677 paddd m1, m3 ; in0 + in3 3678 paddd m7, m6 ; pd_3803 3679 pmulld m2, m7 3680 pmulld m3, m7 ; 3803*in3 3681 psubd m5, m2 3682 psubd m9, m3 ; 2482*in0 - 3803*in3 3683 mova m2, [cq+32*4] 3684 pmulld m10, m7, m2 3685 pmulld m3, m6, m2 3686 psubd m2, m0 3687 mova m0, [cq+32*5] 3688 pmulld m7, m0 ; 3803*in2 3689 pmulld m6, m0 ; 1321*in2 3690 psubd m0, m1 ; in2 - in0 - in3 3691 vpbroadcastd m1, [pd_m3344] 3692 paddd m4, m10 3693 paddd m7, m8 ; t0 3694 psubd m5, m3 3695 psubd m9, m6 ; t1 3696 pmulld m2, m1 3697 pmulld m0, m1 ; t2 3698 pmulld m3, m1, [cq+32*2] 3699 pmulld m1, [cq+32*3] ; -t3 3700 ret 3701ALIGN function_align 3702.main_end: 3703 ; expects: m6 = rnd 3704 paddd m5, m6 3705 paddd m9, m6 3706 paddd m10, m4, m5 3707 paddd m4, m6 3708 paddd m8, m7, m6 3709 paddd m7, m9 3710 psubd m4, m3 ; out0 (unshifted) 3711 psubd m5, m3 ; out1 (unshifted) 3712 paddd m2, m6 ; out2 (unshifted) 3713 paddd m3, m10 ; out3 (unshifted) 3714 psubd m8, m1 ; out4 (unshifted) 3715 psubd m9, m1 ; out5 (unshifted) 3716 paddd m6, m0 ; out6 (unshifted) 3717 paddd m7, m1 ; out7 (unshifted) 3718 ret 3719 3720INV_TXFM_16X4_FN flipadst, dct 3721INV_TXFM_16X4_FN flipadst, adst 3722INV_TXFM_16X4_FN flipadst, flipadst 3723INV_TXFM_16X4_FN flipadst, identity 3724 3725cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 3726 vpbroadcastd m12, [clip_18b_min] 3727 vpbroadcastd m13, [clip_18b_max] 3728.pass1: 3729 call m(iadst_4x16_internal_10bpc).main 3730 psrad m11, 11 ; pd_1 3731 paddd m4, m3, m11 3732 paddd m3, m5, m11 3733 paddd m5, m2, m11 3734 paddd m2, m6, m11 3735 paddd m6, m1, m11 3736 paddd m1, m7, m11 3737 paddd m7, m0, m11 3738 paddd m0, m8, m11 3739 jmp m(iadst_16x4_internal_10bpc).pass1_end 3740.pass2: 3741 call m(idct_16x4_internal_10bpc).transpose_4x16_packed 3742 lea r6, [deint_shuf+128] 3743 call m(iadst_16x4_internal_8bpc).main 3744 vpbroadcastd m4, [pw_2048] 3745 pmulhrsw m5, m3, m4 3746 pmulhrsw m6, m2, m4 3747 pmulhrsw m2, m1, m4 3748 pmulhrsw m3, m0, m4 3749 paddw m0, m5, [dstq+strideq*0] 3750 paddw m1, m6, [dstq+strideq*1] 3751 vpbroadcastd m5, [pixel_10bpc_max] 3752 jmp m(idct_16x4_internal_10bpc).end3 3753 3754INV_TXFM_16X4_FN identity, dct 3755INV_TXFM_16X4_FN identity, adst 3756INV_TXFM_16X4_FN identity, flipadst 3757INV_TXFM_16X4_FN identity, identity 3758 3759cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 3760 vpbroadcastd m8, [pd_5793] 3761 vpermq m0, [cq+32*0], q3120 ; 0 1 3762 vpermq m1, [cq+32*1], q3120 ; 2 3 3763 vpermq m2, [cq+32*2], q3120 ; 4 5 3764 vpermq m3, [cq+32*3], q3120 ; 6 7 
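    ; Pass 1 applies the identity4 scale, 5793 = round(sqrt(2)*4096):
    ;   out = (in * 5793 + 3072) >> 12      ; pd_3072 = 1024 + 2048
    ; the remaining identity16 factor and the inter-pass shift appear to be
    ; folded into the pw_1697x8 / pw_2048 multiplies of .pass2.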
3765 vpermq m4, [cq+32*4], q3120 ; 8 9 3766 vpermq m5, [cq+32*5], q3120 ; a b 3767 vpermq m6, [cq+32*6], q3120 ; c d 3768 vpermq m7, [cq+32*7], q3120 ; e f 3769 vpbroadcastd m9, [pd_3072] 3770 REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 3771 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 3772 REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 3773 jmp tx2q 3774.pass2: 3775 call m(idct_16x4_internal_10bpc).transpose_4x16_packed 3776 vpbroadcastd m7, [pw_1697x8] 3777 pmulhrsw m4, m7, m0 3778 pmulhrsw m5, m7, m1 3779 pmulhrsw m6, m7, m2 3780 pmulhrsw m7, m3 3781 paddsw m0, m4 3782 paddsw m1, m5 3783 paddsw m2, m6 3784 paddsw m3, m7 3785 jmp m(idct_16x4_internal_10bpc).end 3786 3787INV_TXFM_16X4_FN dct, dct, 12 3788INV_TXFM_16X4_FN dct, identity, 12 3789INV_TXFM_16X4_FN dct, adst, 12 3790INV_TXFM_16X4_FN dct, flipadst, 12 3791 3792cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 3793 vpbroadcastd m8, [clip_20b_min] 3794 vpbroadcastd m9, [clip_20b_max] 3795 jmp m(idct_16x4_internal_10bpc).pass1 3796.pass2: 3797 vpbroadcastd m12, [clip_18b_min] 3798 vpbroadcastd m13, [clip_18b_max] 3799 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 3800 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 3801 ; deinterleave 3802 REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 3803 ; transpose 3804 punpcklqdq m8, m0, m1 3805 punpckhqdq m0, m1 3806 punpcklqdq m9, m2, m3 3807 punpckhqdq m2, m3 3808 punpcklqdq m10, m4, m5 3809 punpckhqdq m4, m5 3810 punpcklqdq m11, m6, m7 3811 punpckhqdq m6, m7 3812 vperm2i128 m3, m0, m2, 0x31 ; out6 3813 vperm2i128 m1, m0, m2, 0x20 ; out2 3814 vperm2i128 m7, m4, m6, 0x31 ; out7 3815 vperm2i128 m5, m4, m6, 0x20 ; out3 3816 vperm2i128 m13, m10, m11, 0x31 ; out5 3817 vperm2i128 m12, m10, m11, 0x20 ; out1 3818 vperm2i128 m11, m8, m9, 0x31 ; out4 3819 vperm2i128 m10, m8, m9, 0x20 ; out0 3820 call m(idct_4x16_internal_10bpc).pass1_main 3821 pmulld m0, m6, m10 3822 pmulld m2, m6, m11 3823 pmulld m4, m6, m12 3824 pmulld m6, m13 3825 vpbroadcastd m10, [pd_17408] 3826 call m(idct_4x16_internal_10bpc).pass1_main2 3827 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 3828 packssdw m0, m4 3829 packssdw m1, m5 3830 packssdw m2, m6 3831 packssdw m3, m7 3832 vpbroadcastd m5, [pixel_12bpc_max] 3833 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 3834 jmp m(idct_16x4_internal_10bpc).end2 3835 3836INV_TXFM_16X4_FN adst, dct, 12 3837INV_TXFM_16X4_FN adst, adst, 12 3838INV_TXFM_16X4_FN adst, flipadst, 12 3839INV_TXFM_16X4_FN adst, identity, 12 3840 3841cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 3842 vpbroadcastd m12, [clip_20b_min] 3843 vpbroadcastd m13, [clip_20b_max] 3844 jmp m(iadst_16x4_internal_10bpc).pass1 3845.pass2: 3846 call .pass2_main 3847 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 3848 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 3849 jmp m(idct_16x4_internal_10bpc).end2 3850ALIGN function_align 3851.pass2_main: 3852 vpbroadcastd m12, [clip_18b_min] 3853 vpbroadcastd m13, [clip_18b_max] 3854 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 3855 pmaxsd m8, m4, m12 3856 pmaxsd m9, m5, m12 3857 REPX {pminsd x, m13}, m0, m1, m2, m3 3858 call m(iadst_8x4_internal_12bpc).transpose_4x8 3859 mova [cq+32*0], m0 3860 mova [cq+32*2], m1 3861 mova [cq+32*4], m2 3862 mova [cq+32*6], m3 3863 pminsd m0, m8, m13 3864 pminsd m1, m9, m13 3865 pminsd m2, m6, m13 3866 pminsd m3, m7, m13 3867 call m(iadst_8x4_internal_12bpc).transpose_4x8 3868 mova [cq+32*1], m0 3869 mova [cq+32*3], m1 3870 mova [cq+32*5], m2 3871 mova [cq+32*7], m3 3872 call 
m(iadst_16x4_internal_10bpc).main 3873 vpbroadcastd m6, [pd_2048] 3874 call m(iadst_16x4_internal_10bpc).main_end 3875 psrad m0, m4, 15 3876 psrad m1, m5, 15 3877 psrad m2, 15 3878 psrad m3, 15 3879 psrad m4, m8, 15 3880 psrad m5, m9, 15 3881 psrad m6, 15 3882 psrad m7, 15 3883 packssdw m0, m4 3884 packssdw m1, m5 3885 packssdw m2, m6 3886 packssdw m3, m7 3887 vpbroadcastd m4, [pw_16384] 3888 vpbroadcastd m5, [pixel_12bpc_max] 3889 ret 3890 3891INV_TXFM_16X4_FN flipadst, dct, 12 3892INV_TXFM_16X4_FN flipadst, adst, 12 3893INV_TXFM_16X4_FN flipadst, flipadst, 12 3894INV_TXFM_16X4_FN flipadst, identity, 12 3895 3896cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 3897 vpbroadcastd m12, [clip_20b_min] 3898 vpbroadcastd m13, [clip_20b_max] 3899 jmp m(iflipadst_16x4_internal_10bpc).pass1 3900.pass2: 3901 call m(iadst_16x4_internal_12bpc).pass2_main 3902 vpermq m7, m0, q3120 3903 vpermq m6, m1, q3120 3904 vpermq m1, m2, q3120 3905 vpermq m0, m3, q3120 3906 pmulhrsw m0, m4 3907 pmulhrsw m1, m4 3908 pmulhrsw m2, m6, m4 3909 pmulhrsw m3, m7, m4 3910 jmp m(idct_16x4_internal_10bpc).end2 3911 3912INV_TXFM_16X4_FN identity, dct, 12 3913INV_TXFM_16X4_FN identity, adst, 12 3914INV_TXFM_16X4_FN identity, flipadst, 12 3915INV_TXFM_16X4_FN identity, identity, 12 3916 3917cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 3918 vpbroadcastd m8, [pd_1697] 3919 vpermq m0, [cq+32*0], q3120 ; 0 1 3920 vpermq m1, [cq+32*1], q3120 ; 2 3 3921 vpermq m2, [cq+32*2], q3120 ; 4 5 3922 vpermq m3, [cq+32*3], q3120 ; 6 7 3923 vpbroadcastd m9, [pd_3072] 3924 pmulld m4, m8, m0 3925 pmulld m5, m8, m1 3926 pmulld m6, m8, m2 3927 pmulld m7, m8, m3 3928 vpermq m10, [cq+32*4], q3120 ; 8 9 3929 vpermq m11, [cq+32*5], q3120 ; a b 3930 vpermq m12, [cq+32*6], q3120 ; c d 3931 vpermq m13, [cq+32*7], q3120 ; e f 3932 REPX {paddd x, m9}, m4, m5, m6, m7 3933 REPX {psrad x, 12}, m4, m5, m6, m7 3934 paddd m0, m4 3935 pmulld m4, m8, m10 3936 paddd m1, m5 3937 pmulld m5, m8, m11 3938 paddd m2, m6 3939 pmulld m6, m8, m12 3940 paddd m3, m7 3941 pmulld m7, m8, m13 3942 REPX {paddd x, m9}, m4, m5, m6, m7 3943 REPX {psrad x, 12}, m4, m5, m6, m7 3944 paddd m4, m10 3945 paddd m5, m11 3946 paddd m6, m12 3947 paddd m7, m13 3948 jmp tx2q 3949.pass2: 3950 vpbroadcastd m12, [clip_18b_min] 3951 vpbroadcastd m13, [clip_18b_max] 3952 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 3953 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 3954 vpbroadcastd m8, [pd_5793] 3955 vpbroadcastd m9, [pd_2048] 3956 REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 3957 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 3958 REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 3959 call m(idct_16x4_internal_10bpc).transpose_4x16_packed 3960 vpbroadcastd m4, [pw_16384] 3961 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 3962 vpbroadcastd m5, [pixel_12bpc_max] 3963 jmp m(idct_16x4_internal_10bpc).end2 3964 3965%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth 3966 INV_TXFM_FN %1, %2, 0, 16x8, %3 3967%ifidn %1_%2, dct_dct 3968 imul r6d, [cq], 181 3969 vpbroadcastd m3, [dconly_%3bpc] 3970 mov [cq], eobd ; 0 3971 or r3d, 8 3972 add r6d, 128 3973 sar r6d, 8 3974 imul r6d, 181 3975 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 3976%endif 3977%endmacro 3978 3979INV_TXFM_16X8_FN dct, dct 3980INV_TXFM_16X8_FN dct, identity 3981INV_TXFM_16X8_FN dct, adst 3982INV_TXFM_16X8_FN dct, flipadst 3983 3984cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 3985 vpbroadcastd m12, [clip_18b_min] 3986 vpbroadcastd 
m13, [clip_18b_max] 3987.pass1: 3988 vpbroadcastd m14, [pd_2896] 3989 pmulld m0, m14, [cq+32* 1] 3990 pmulld m1, m14, [cq+32* 3] 3991 pmulld m2, m14, [cq+32* 5] 3992 pmulld m3, m14, [cq+32* 7] 3993 pmulld m4, m14, [cq+32* 9] 3994 pmulld m5, m14, [cq+32*11] 3995 pmulld m6, m14, [cq+32*13] 3996 pmulld m7, m14, [cq+32*15] 3997 vpbroadcastd m11, [pd_2048] 3998 lea r6, [rsp+32*4] 3999 call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 4000 pmulld m0, m14, [cq+32* 0] 4001 pmulld m1, m14, [cq+32* 2] 4002 pmulld m2, m14, [cq+32* 4] 4003 pmulld m3, m14, [cq+32* 6] 4004 pmulld m4, m14, [cq+32* 8] 4005 pmulld m5, m14, [cq+32*10] 4006 pmulld m6, m14, [cq+32*12] 4007 pmulld m7, m14, [cq+32*14] 4008 call m(idct_8x8_internal_10bpc).main_rect2 4009 call m(idct_8x16_internal_10bpc).main_evenhalf 4010 psrld m11, 11 ; pd_1 4011 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 4012 call .pass1_rotations 4013 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ 4014 m8, m9, m10, m11, m12, m13, m14, m15 4015 jmp tx2q 4016.pass2: 4017 call .transpose 4018 call m(idct_16x8_internal_8bpc).main 4019 vpbroadcastd m10, [pw_2048] 4020.end: 4021 pmulhrsw m0, m10 4022 pmulhrsw m1, m10 4023 pmulhrsw m2, m10 4024 pmulhrsw m3, m10 4025 call .write_16x4_start 4026.end2: 4027 pmulhrsw m0, m4, m10 4028 pmulhrsw m1, m5, m10 4029 pmulhrsw m2, m6, m10 4030 pmulhrsw m3, m7, m10 4031 call .write_16x4_zero 4032 RET 4033ALIGN function_align 4034.pass1_rotations: 4035 mova m14, [r6-32*4] 4036 mova m13, [r6-32*3] 4037 mova m12, [r6-32*2] 4038 mova m11, [r6-32*1] 4039 mova m10, [r6+32*0] 4040 mova m9, [r6+32*1] 4041 mova m8, [r6+32*2] 4042 psubd m15, m0, m14 ; out15 4043 paddd m0, m14 ; out0 4044 psubd m14, m1, m13 ; out14 4045 paddd m1, m13 ; out1 4046 psubd m13, m2, m12 ; out13 4047 paddd m2, m12 ; out2 4048 psubd m12, m3, m11 ; out12 4049 paddd m3, m11 ; out3 4050 psubd m11, m4, m10 ; out11 4051 paddd m4, m10 ; out4 4052 psubd m10, m5, m9 ; out10 4053 paddd m5, m9 ; out5 4054 psubd m9, m6, m8 ; out9 4055 paddd m6, m8 ; out6 4056 psubd m8, m7, [r6+32*3] ; out8 4057 paddd m7, [r6+32*3] ; out7 4058 ret 4059ALIGN function_align 4060.transpose: 4061 lea r6, [deint_shuf+128] 4062.transpose2: 4063 packssdw m0, m8 4064 packssdw m1, m9 4065 packssdw m2, m10 4066 packssdw m3, m11 4067 packssdw m4, m12 4068 packssdw m5, m13 4069 packssdw m6, m14 4070 packssdw m7, m15 4071.transpose3: 4072 punpckhwd m8, m0, m1 4073 punpcklwd m0, m1 4074 punpcklwd m1, m2, m3 4075 punpckhwd m2, m3 4076 punpckhwd m3, m4, m5 4077 punpcklwd m4, m5 4078 punpckhwd m5, m6, m7 4079 punpcklwd m6, m7 4080 punpckhdq m7, m4, m6 4081 punpckldq m4, m6 4082 punpckldq m6, m8, m2 4083 punpckhdq m8, m2 4084 punpckhdq m2, m0, m1 4085 punpckldq m0, m1 4086 punpckhdq m1, m3, m5 4087 punpckldq m3, m5 4088 punpcklqdq m5, m6, m3 4089 punpckhqdq m6, m3 4090 punpckhqdq m3, m2, m7 4091 punpcklqdq m2, m7 4092 punpcklqdq m7, m8, m1 4093 punpckhqdq m8, m1 4094 punpckhqdq m1, m0, m4 4095 punpcklqdq m0, m4 4096 vperm2i128 m4, m0, m5, 0x31 4097 vinserti128 m0, xm5, 1 4098 vperm2i128 m5, m1, m6, 0x31 4099 vinserti128 m1, xm6, 1 4100 vperm2i128 m6, m2, m7, 0x31 4101 vinserti128 m2, xm7, 1 4102 vperm2i128 m7, m3, m8, 0x31 4103 vinserti128 m3, xm8, 1 4104 ret 4105ALIGN function_align 4106.write_16x4_start: 4107 vpbroadcastd m9, [pixel_10bpc_max] 4108 lea r3, [strideq*3] 4109 pxor m8, m8 4110.write_16x4_zero: 4111 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 4112 add cq, 32*8 4113.write_16x4: 4114 paddw m0, [dstq+strideq*0] 4115 paddw m1, [dstq+strideq*1] 4116 paddw m2, [dstq+strideq*2] 
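    ; r3 = strideq*3 was set up in .write_16x4_start; each call adds one
    ; 16-pixel row of residual to dst rows 0-3, then clamps to
    ; [0, pixel_10bpc_max] (m8 = 0, m9 = max) before storing.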
4117 paddw m3, [dstq+r3 ] 4118 REPX {pmaxsw x, m8}, m0, m1, m2, m3 4119 REPX {pminsw x, m9}, m0, m1, m2, m3 4120 mova [dstq+strideq*0], m0 4121 mova [dstq+strideq*1], m1 4122 mova [dstq+strideq*2], m2 4123 mova [dstq+r3 ], m3 4124 lea dstq, [dstq+strideq*4] 4125 ret 4126 4127INV_TXFM_16X8_FN adst, dct 4128INV_TXFM_16X8_FN adst, adst 4129INV_TXFM_16X8_FN adst, flipadst 4130INV_TXFM_16X8_FN adst, identity 4131 4132cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4133 vpbroadcastd m13, [clip_18b_min] 4134 vpbroadcastd m14, [clip_18b_max] 4135.pass1: 4136 lea r6, [rsp+32*4] 4137 call .main 4138 vpbroadcastd m14, [pd_3072] 4139 psrld m15, 11 ; pd_1 4140 psubd m13, m14, m15 ; pd_3071 4141 call .pass1_rotations 4142.pass1_end: 4143 REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 4144 REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 4145 jmp tx2q 4146.pass2: 4147 call m(idct_16x8_internal_10bpc).transpose 4148 call m(iadst_16x8_internal_8bpc).main 4149 call m(iadst_16x8_internal_8bpc).main_pass2_end 4150 vpbroadcastd m10, [pw_2048] 4151 pxor m11, m11 4152 psubw m11, m10 4153 pmulhrsw m0, m10 4154 pmulhrsw m1, m11 4155 pmulhrsw m2, m10 4156 pmulhrsw m3, m11 4157 call m(idct_16x8_internal_10bpc).write_16x4_start 4158 pmulhrsw m0, m4, m10 4159 pmulhrsw m1, m5, m11 4160 pmulhrsw m2, m6, m10 4161 pmulhrsw m3, m7, m11 4162 call m(idct_16x8_internal_10bpc).write_16x4_zero 4163 RET 4164ALIGN function_align 4165.pass1_rotations: 4166 paddd m0, m15 4167 psubd m1, m15, m1 4168 paddd m2, m15 4169 psubd m3, m15, m3 4170 paddd m4, m14 4171 psubd m5, m13, m5 4172 paddd m6, m14 4173 psubd m7, m13, m7 4174 paddd m8, m14, m9 4175 psubd m9, m13, m10 4176 paddd m10, m14, m11 4177 psubd m11, m13, m12 4178 paddd m12, m15, [r6-32*1] 4179 psubd m13, m15, [r6-32*2] 4180 paddd m14, m15, [r6-32*3] 4181 psubd m15, [r6-32*4] 4182 ret 4183ALIGN function_align 4184.main: 4185 ; expects: m13 = clip_min m14 = clip_max 4186 vpbroadcastd m15, [pd_2896] 4187 pmulld m0, m15, [cq+32* 2] 4188 pmulld m1, m15, [cq+32*13] 4189 pmulld m2, m15, [cq+32* 6] 4190 pmulld m3, m15, [cq+32* 9] 4191 pmulld m4, m15, [cq+32*10] 4192 pmulld m5, m15, [cq+32* 5] 4193 pmulld m6, m15, [cq+32*14] 4194 pmulld m7, m15, [cq+32* 1] 4195 vpbroadcastd m12, [pd_2048] 4196 REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 4197 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 4198 call .main_part1 4199 pmulld m0, m15, [cq+32* 0] 4200 pmulld m1, m15, [cq+32*15] 4201 pmulld m2, m15, [cq+32* 4] 4202 pmulld m3, m15, [cq+32*11] 4203 pmulld m4, m15, [cq+32* 8] 4204 pmulld m5, m15, [cq+32* 7] 4205 pmulld m6, m15, [cq+32*12] 4206 pmulld m7, m15, [cq+32* 3] 4207 REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 4208 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 4209.main_part2: 4210 ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 4211 ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 4212 ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 4213 ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 4214 psubd m8, m0, m4 ; t8a 4215 paddd m0, m4 ; t0a 4216 psubd m4, m1, m5 ; t9a 4217 paddd m1, m5 ; t1a 4218 psubd m5, m2, m6 ; t12a 4219 paddd m2, m6 ; t4a 4220 psubd m6, m3, m7 ; t13a 4221 paddd m7, m3 ; t5a 4222 REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 4223 REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 4224 vpbroadcastd m11, [pd_4017] 4225 vpbroadcastd m10, [pd_799] 4226 ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 4227 ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 4228 psubd m3, m0, m2 ; t4 4229 paddd m0, m2 ; t0 4230 psubd m2, m1, m7 ; t5 
4231 paddd m1, m7 ; t1 4232 psubd m7, m4, m6 ; t12a 4233 paddd m4, m6 ; t8a 4234 psubd m6, m8, m5 ; t13a 4235 paddd m5, m8 ; t9a 4236 REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 4237 REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 4238 vpbroadcastd m11, [pd_3784] 4239 vpbroadcastd m10, [pd_1567] 4240 ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11 4241 ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11 4242 pminsd m10, m14, [r6-32*4] ; t2 4243 pminsd m8, m14, [r6-32*3] ; t3 4244 psubd m9, m0, m10 ; t2a 4245 paddd m0, m10 ; out0 4246 psubd m10, m1, m8 ; t3a 4247 paddd m1, m8 ; -out15 4248 pmaxsd m9, m13 4249 pmaxsd m10, m13 4250 pminsd m9, m14 4251 pminsd m10, m14 4252 mova [r6-32*4], m1 4253 mova m11, [r6-32*1] ; t7a 4254 mova m1, [r6-32*2] ; t6a 4255 psubd m8, m3, m11 ; t7 4256 paddd m11, m3 ; out12 4257 paddd m3, m2, m1 ; -out3 4258 psubd m2, m1 ; t6 4259 pmaxsd m8, m13 4260 pmaxsd m2, m13 4261 pminsd m8, m14 4262 pminsd m2, m14 4263 mova [r6-32*1], m11 4264 mova [r6-32*3], m2 4265 mova m1, [r6+32*3] ; t15 4266 mova m2, [r6+32*2] ; t14 4267 paddd m12, m7, m1 ; -out13 4268 psubd m7, m1 ; t15a 4269 psubd m11, m6, m2 ; t14a 4270 paddd m2, m6 ; out2 4271 pmaxsd m7, m13 4272 pmaxsd m11, m13 4273 pminsd m7, m14 4274 pminsd m11, m14 4275 mova [r6-32*2], m12 4276 pminsd m1, m14, [r6+32*0] ; t10a 4277 pminsd m12, m14, [r6+32*1] ; t11a 4278 psubd m6, m4, m1 ; t10 4279 paddd m1, m4 ; -out1 4280 psubd m4, m5, m12 ; t11 4281 paddd m5, m12 ; out14 4282 vpbroadcastd m12, [pd_1448] 4283 pmaxsd m6, m13 4284 pmaxsd m4, m13 4285 pminsd m6, m14 4286 pminsd m4, m14 4287 REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 4288 pmulld m12, [r6-32*3] ; t6 4289 mova [r6-32*3], m5 4290 paddd m5, m11, m7 ; -out5 (unshifted) 4291 psubd m11, m7 ; out10 (unshifted) 4292 paddd m7, m9, m10 ; -out7 (unshifted) 4293 psubd m9, m10 ; out8 (unshifted) 4294 psubd m10, m6, m4 ; -out9 (unshifted) 4295 paddd m6, m4 ; out6 (unshifted) 4296 paddd m4, m12, m8 ; out4 (unshifted) 4297 psubd m12, m8 ; -out11 (unshifted) 4298 ret 4299.main_part1: 4300 ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 4301 ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 4302 ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 4303 ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 4304 psubd m8, m0, m4 ; t10a 4305 paddd m0, m4 ; t2a 4306 psubd m4, m1, m5 ; t11a 4307 paddd m1, m5 ; t3a 4308 psubd m5, m2, m6 ; t14a 4309 paddd m2, m6 ; t6a 4310 psubd m6, m3, m7 ; t15a 4311 paddd m7, m3 ; t7a 4312 REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 4313 REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 4314 vpbroadcastd m11, [pd_2276] 4315 vpbroadcastd m10, [pd_3406] 4316 ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 4317 ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 4318 psubd m3, m0, m2 ; t6 4319 paddd m0, m2 ; t2 4320 psubd m2, m1, m7 ; t7 4321 paddd m1, m7 ; t3 4322 psubd m7, m4, m6 ; t14a 4323 paddd m4, m6 ; t10a 4324 psubd m6, m8, m5 ; t15a 4325 paddd m5, m8 ; t11a 4326 REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 4327 REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later 4328 vpbroadcastd m11, [pd_1567] 4329 vpbroadcastd m10, [pd_3784] 4330 ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11 4331 ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11 4332 mova [r6-32*4], m0 4333 mova [r6-32*3], m1 4334 mova [r6+32*0], m4 4335 mova [r6+32*1], m5 4336 mova [r6-32*2], m2 4337 mova [r6-32*1], m3 4338 mova [r6+32*2], m6 4339 mova [r6+32*3], m7 4340 ret 4341 4342INV_TXFM_16X8_FN flipadst, dct 4343INV_TXFM_16X8_FN flipadst, adst 4344INV_TXFM_16X8_FN flipadst, flipadst 4345INV_TXFM_16X8_FN flipadst, 
identity 4346 4347cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4348 vpbroadcastd m13, [clip_18b_min] 4349 vpbroadcastd m14, [clip_18b_max] 4350.pass1: 4351 lea r6, [rsp+32*4] 4352 call m(iadst_16x8_internal_10bpc).main 4353 vpbroadcastd m14, [pd_3072] 4354 psrld m15, 11 4355 psubd m13, m14, m15 4356 call .pass1_rotations 4357 jmp m(iadst_16x8_internal_10bpc).pass1_end 4358.pass2: 4359 call m(idct_16x8_internal_10bpc).transpose 4360 call m(iadst_16x8_internal_8bpc).main 4361 call m(iadst_16x8_internal_8bpc).main_pass2_end 4362 vpbroadcastd m10, [pw_2048] 4363 pxor m11, m11 4364 psubw m11, m10 4365 mova m12, m0 4366 pmulhrsw m0, m7, m11 4367 mova m7, m1 4368 pmulhrsw m1, m6, m10 4369 mova m6, m2 4370 pmulhrsw m2, m5, m11 4371 mova m5, m3 4372 pmulhrsw m3, m4, m10 4373 call m(idct_16x8_internal_10bpc).write_16x4_start 4374 pmulhrsw m0, m5, m11 4375 pmulhrsw m1, m6, m10 4376 pmulhrsw m2, m7, m11 4377 pmulhrsw m3, m12, m10 4378 call m(idct_16x8_internal_10bpc).write_16x4_zero 4379 RET 4380ALIGN function_align 4381.pass1_rotations: 4382 psubd m8, m13, m7 4383 paddd m7, m14, m9 4384 paddd m9, m14, m6 4385 psubd m6, m13, m10 4386 psubd m10, m13, m5 4387 paddd m5, m14, m11 4388 paddd m11, m14, m4 4389 psubd m4, m13, m12 4390 psubd m12, m15, m3 4391 paddd m3, m15, [r6-32*1] 4392 paddd m13, m15, m2 4393 psubd m2, m15, [r6-32*2] 4394 psubd m14, m15, m1 4395 mova m1, m15 4396 paddd m15, m0 4397 psubd m0, m1, [r6-32*4] 4398 paddd m1, [r6-32*3] 4399 ret 4400 4401INV_TXFM_16X8_FN identity, dct 4402INV_TXFM_16X8_FN identity, adst 4403INV_TXFM_16X8_FN identity, flipadst 4404INV_TXFM_16X8_FN identity, identity 4405 4406cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4407.pass1: 4408 vpbroadcastd m15, [pd_2896] 4409 pmulld m0, m15, [cq+32* 0] 4410 pmulld m1, m15, [cq+32* 1] 4411 pmulld m2, m15, [cq+32* 2] 4412 pmulld m3, m15, [cq+32* 3] 4413 pmulld m4, m15, [cq+32* 4] 4414 pmulld m5, m15, [cq+32* 5] 4415 pmulld m6, m15, [cq+32* 6] 4416 pmulld m7, m15, [cq+32* 7] 4417 pmulld m8, m15, [cq+32* 8] 4418 pmulld m9, m15, [cq+32* 9] 4419 pmulld m10, m15, [cq+32*10] 4420 pmulld m11, m15, [cq+32*11] 4421 pmulld m12, m15, [cq+32*12] 4422 pmulld m13, m15, [cq+32*13] 4423 pmulld m14, m15, [cq+32*14] 4424 pmulld m15, [cq+32*15] 4425 mova [rsp], m7 4426 vpbroadcastd m7, [pd_2048] 4427 REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ 4428 m8, m9, m10, m11, m12, m13, m14, m15 4429 paddd m7, [rsp] 4430 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ 4431 m8, m9, m10, m11, m12, m13, m14, m15 4432 mova [rsp], m15 4433 vpbroadcastd m15, [pd_5793] 4434 REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ 4435 m8, m9, m10, m11, m12, m13, m14 4436 pmulld m15, [rsp] 4437 mova [rsp], m7 4438 vpbroadcastd m7, [pd_3072] 4439 REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ 4440 m8, m9, m10, m11, m12, m13, m14, m15 4441 paddd m7, [rsp] 4442 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ 4443 m8, m9, m10, m11, m12, m13, m14, m15 4444 jmp tx2q 4445.pass2: 4446 call m(idct_16x8_internal_10bpc).transpose 4447 vpbroadcastd m10, [pw_4096] 4448 jmp m(idct_16x8_internal_10bpc).end 4449 4450INV_TXFM_16X8_FN dct, dct, 12 4451INV_TXFM_16X8_FN dct, identity, 12 4452INV_TXFM_16X8_FN dct, adst, 12 4453INV_TXFM_16X8_FN dct, flipadst, 12 4454 4455cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4456 vpbroadcastd m12, [clip_20b_min] 4457 vpbroadcastd m13, [clip_20b_max] 4458 jmp m(idct_16x8_internal_10bpc).pass1 4459.pass2: 4460 call 
.pass2_main 4461 RET 4462ALIGN function_align 4463.pass2_main: 4464 call m(idct_8x16_internal_12bpc).transpose 4465 vpbroadcastd m12, [clip_18b_min] 4466 vpbroadcastd m13, [clip_18b_max] 4467 vpbroadcastd m11, [pd_2048] 4468 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 4469 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 4470 call m(idct_8x8_internal_10bpc).main 4471 call m(idct_8x8_internal_12bpc).round_shift4 4472 mova [cq+32* 8], m0 4473 mova [cq+32* 9], m1 4474 mova [cq+32*10], m2 4475 mova [cq+32*11], m3 4476 mova [cq+32*12], m4 4477 mova [cq+32*13], m5 4478 mova [cq+32*14], m6 4479 mova [cq+32*15], m7 4480 pmaxsd m0, m12, [cq+32*0] 4481 pmaxsd m1, m12, [cq+32*1] 4482 pmaxsd m2, m12, [cq+32*2] 4483 pmaxsd m3, m12, [cq+32*3] 4484 pmaxsd m4, m12, [cq+32*4] 4485 pmaxsd m5, m12, [cq+32*5] 4486 pmaxsd m6, m12, [cq+32*6] 4487 pmaxsd m7, m12, [cq+32*7] 4488 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 4489 call m(idct_8x8_internal_10bpc).main 4490 call m(idct_8x8_internal_12bpc).round_shift4 4491.end: 4492 packssdw m0, [cq+32* 8] 4493 packssdw m1, [cq+32* 9] 4494 packssdw m2, [cq+32*10] 4495 packssdw m3, [cq+32*11] 4496 packssdw m4, [cq+32*12] 4497 packssdw m5, [cq+32*13] 4498 packssdw m6, [cq+32*14] 4499 packssdw m7, [cq+32*15] 4500 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 4501 call .write_16x4_start 4502 call m(idct_16x8_internal_10bpc).write_16x4_zero 4503 vpermq m0, m4, q3120 4504 vpermq m1, m5, q3120 4505 vpermq m2, m6, q3120 4506 vpermq m3, m7, q3120 4507 jmp m(idct_16x8_internal_10bpc).write_16x4_zero 4508ALIGN function_align 4509.write_16x4_start: 4510 vpbroadcastd m9, [pixel_12bpc_max] 4511 lea r3, [strideq*3] 4512 pxor m8, m8 4513 ret 4514 4515INV_TXFM_16X8_FN adst, dct, 12 4516INV_TXFM_16X8_FN adst, adst, 12 4517INV_TXFM_16X8_FN adst, flipadst, 12 4518INV_TXFM_16X8_FN adst, identity, 12 4519 4520cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4521 vpbroadcastd m13, [clip_20b_min] 4522 vpbroadcastd m14, [clip_20b_max] 4523 jmp m(iadst_16x8_internal_10bpc).pass1 4524.pass2: 4525 call .pass2_main 4526 call m(idct_16x8_internal_12bpc).end 4527 RET 4528ALIGN function_align 4529.pass2_main: 4530 call m(idct_8x16_internal_12bpc).transpose 4531 vpbroadcastd m12, [clip_18b_min] 4532 vpbroadcastd m13, [clip_18b_max] 4533 vpbroadcastd m11, [pd_2048] 4534 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 4535 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 4536 call m(iadst_8x8_internal_12bpc).pass2_main2 4537 mova [cq+32* 8], m0 4538 mova [cq+32* 9], m1 4539 mova [cq+32*10], m2 4540 mova [cq+32*11], m3 4541 mova [cq+32*12], m4 4542 mova [cq+32*13], m5 4543 mova [cq+32*14], m6 4544 mova [cq+32*15], m7 4545 pmaxsd m0, m12, [cq+32*0] 4546 pmaxsd m1, m12, [cq+32*1] 4547 pmaxsd m2, m12, [cq+32*2] 4548 pmaxsd m3, m12, [cq+32*3] 4549 pmaxsd m4, m12, [cq+32*4] 4550 pmaxsd m5, m12, [cq+32*5] 4551 pmaxsd m6, m12, [cq+32*6] 4552 pmaxsd m7, m12, [cq+32*7] 4553 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 4554 call m(iadst_8x8_internal_12bpc).pass2_main2 4555 ret 4556 4557INV_TXFM_16X8_FN flipadst, dct, 12 4558INV_TXFM_16X8_FN flipadst, adst, 12 4559INV_TXFM_16X8_FN flipadst, flipadst, 12 4560INV_TXFM_16X8_FN flipadst, identity, 12 4561 4562cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4563 vpbroadcastd m13, [clip_20b_min] 4564 vpbroadcastd m14, [clip_20b_max] 4565 jmp m(iflipadst_16x8_internal_10bpc).pass1 4566.pass2: 4567 call m(iadst_16x8_internal_12bpc).pass2_main 4568 packssdw m13, m0, [cq+32* 8] 4569 
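    ; (relative to the adst 16x8 path, the pack destinations in this
    ; sequence run in reverse row order, which produces the vertical flip)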
packssdw m12, m1, [cq+32* 9] 4570 packssdw m11, m2, [cq+32*10] 4571 packssdw m10, m3, [cq+32*11] 4572 packssdw m3, m4, [cq+32*12] 4573 packssdw m2, m5, [cq+32*13] 4574 packssdw m1, m6, [cq+32*14] 4575 packssdw m0, m7, [cq+32*15] 4576 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 4577 call m(idct_16x8_internal_12bpc).write_16x4_start 4578 call m(idct_16x8_internal_10bpc).write_16x4_zero 4579 vpermq m0, m10, q3120 4580 vpermq m1, m11, q3120 4581 vpermq m2, m12, q3120 4582 vpermq m3, m13, q3120 4583 call m(idct_16x8_internal_10bpc).write_16x4_zero 4584 RET 4585 4586INV_TXFM_16X8_FN identity, dct, 12 4587INV_TXFM_16X8_FN identity, adst, 12 4588INV_TXFM_16X8_FN identity, flipadst, 12 4589INV_TXFM_16X8_FN identity, identity, 12 4590 4591cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 4592 jmp m(iidentity_16x8_internal_10bpc).pass1 4593.pass2: 4594 call m(idct_16x8_internal_10bpc).transpose2 4595 vpbroadcastd m10, [pw_4096] 4596 pmulhrsw m0, m10 4597 pmulhrsw m1, m10 4598 pmulhrsw m2, m10 4599 pmulhrsw m3, m10 4600 call m(idct_16x8_internal_12bpc).write_16x4_start 4601 call m(idct_16x8_internal_10bpc).write_16x4_zero 4602 jmp m(idct_16x8_internal_10bpc).end2 4603 4604%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth 4605 INV_TXFM_FN %1, %2, %3, 16x16, %4 4606%ifidn %1_%2, dct_dct 4607 imul r6d, [cq], 181 4608 vpbroadcastd m3, [dconly_%4bpc] 4609 mov [cq], eobd ; 0 4610 or r3d, 16 4611 add r6d, 640 4612 sar r6d, 10 4613 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 4614%endif 4615%endmacro 4616 4617INV_TXFM_16X16_FN dct, dct 4618INV_TXFM_16X16_FN dct, identity, 28 4619INV_TXFM_16X16_FN dct, adst 4620INV_TXFM_16X16_FN dct, flipadst 4621 4622cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 4623 vpbroadcastd m12, [clip_18b_min] 4624 vpbroadcastd m13, [clip_18b_max] 4625.pass1: 4626 vpbroadcastd m11, [pd_2048] 4627 vpbroadcastd m14, [pd_2896] 4628 lea r6, [rsp+32*4] 4629 sub eobd, 36 4630 jl .fast 4631 add cq, 32 4632 call .main 4633 sub cq, 32 4634 mova m10, [r6-32*4] 4635 mova m9, [r6-32*3] 4636 mova m8, [r6-32*2] 4637 psubd m15, m0, m10 ; out15 4638 paddd m0, m10 ; out0 4639 psubd m10, m1, m9 ; out14 4640 paddd m1, m9 ; out1 4641 psubd m9, m2, m8 ; out13 4642 paddd m2, m8 ; out2 4643 REPX {psrad x, 2}, m0, m1, m2 4644 mova [r6-32*4], m0 4645 mova [r6-32*3], m1 4646 mova [r6-32*2], m2 4647 mova m2, [r6-32*1] 4648 mova m1, [r6+32*0] 4649 mova m0, [r6+32*1] 4650 REPX {psrad x, 2}, m9, m10, m15 4651 psubd m8, m3, m2 ; out12 4652 paddd m3, m2 ; out3 4653 psubd m2, m4, m1 ; out11 4654 paddd m4, m1 ; out4 4655 psubd m1, m5, m0 ; out10 4656 paddd m5, m0 ; out5 4657 REPX {psrad x, 2}, m3, m4, m5 4658 mova [r6-32*1], m3 4659 mova [r6+32*0], m4 4660 mova [r6+32*1], m5 4661 mova m4, [r6+32*2] 4662 mova m3, [r6+32*3] 4663 REPX {psrad x, 2}, m1, m2, m8 4664 psubd m5, m6, m4 ; out9 4665 paddd m6, m4 ; out6 4666 psubd m4, m7, m3 ; out8 4667 paddd m7, m3 ; out7 4668 REPX {psrad x, 2}, m6, m7, m4, m5 4669 mova [r6+32*2], m6 4670 mova [r6+32*3], m7 4671 add r6, 32*8 4672 mova [r6-32*4], m4 4673 mova [r6-32*3], m5 4674 mova [r6-32*2], m1 4675 mova [r6-32*1], m2 4676 mova [r6+32*0], m8 4677 mova [r6+32*1], m9 4678 mova [r6+32*2], m10 4679 mova [r6+32*3], m15 4680.fast: 4681 add r6, 32*8 4682 call .main 4683 mova m14, [r6-32*4] 4684 mova m13, [r6-32*3] 4685 mova m12, [r6-32*2] 4686 mova m11, [r6-32*1] 4687 mova m10, [r6+32*0] 4688 mova m9, [r6+32*1] 4689 mova m8, [r6+32*2] 4690 psubd m15, m0, m14 ; out15 4691 paddd m0, m14 ; out0 4692 psubd 
m14, m1, m13 ; out14 4693 paddd m1, m13 ; out1 4694 psubd m13, m2, m12 ; out13 4695 paddd m2, m12 ; out2 4696 psubd m12, m3, m11 ; out12 4697 paddd m3, m11 ; out3 4698 psubd m11, m4, m10 ; out11 4699 paddd m4, m10 ; out4 4700 psubd m10, m5, m9 ; out10 4701 paddd m5, m9 ; out5 4702 psubd m9, m6, m8 ; out9 4703 paddd m6, m8 ; out6 4704 psubd m8, m7, [r6+32*3] ; out8 4705 paddd m7, [r6+32*3] ; out7 4706 sub r6, 32*8 4707 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ 4708 m8, m9, m10, m11, m12, m13, m14, m15 4709 jmp tx2q 4710.pass2: 4711 call .transpose 4712 lea r6, [pw_5+128] 4713 mova [rsp], m15 4714 call m(idct_16x16_internal_8bpc).main 4715 mova m1, [rsp+32*1] 4716.end: 4717 call .write_16x16 4718 RET 4719ALIGN function_align 4720.write_16x16: 4721 mova [rsp+gprsize+32*0], m8 4722 mova [rsp+gprsize+32*1], m9 4723 mova [rsp+gprsize+32*2], m12 4724 vpbroadcastd m12, [pw_2048] 4725 pmulhrsw m0, m12 4726 pmulhrsw m1, m12 4727 pmulhrsw m2, m12 4728 pmulhrsw m3, m12 4729 call m(idct_16x8_internal_10bpc).write_16x4_start 4730.write_16x16_2: 4731 pmulhrsw m0, m12, m4 4732 pmulhrsw m1, m12, m5 4733 pmulhrsw m2, m12, m6 4734 pmulhrsw m3, m12, m7 4735 call m(idct_16x8_internal_10bpc).write_16x4_zero 4736 pmulhrsw m0, m12, [rsp+gprsize+32*0] 4737 pmulhrsw m1, m12, [rsp+gprsize+32*1] 4738 pmulhrsw m2, m12, m10 4739 pmulhrsw m3, m12, m11 4740 call m(idct_16x8_internal_10bpc).write_16x4_zero 4741 pmulhrsw m0, m12, [rsp+gprsize+32*2] 4742 pmulhrsw m1, m12, m13 4743 pmulhrsw m2, m12, m14 4744 pmulhrsw m3, m12, m15 4745 jmp m(idct_16x8_internal_10bpc).write_16x4_zero 4746ALIGN function_align 4747.transpose: 4748 test eobd, eobd 4749 jl .transpose_fast 4750 packssdw m8, [r6-32*4] 4751 packssdw m9, [r6-32*3] 4752 packssdw m10, [r6-32*2] 4753 packssdw m11, [r6-32*1] 4754 packssdw m12, [r6+32*0] 4755 packssdw m13, [r6+32*1] 4756 packssdw m14, [r6+32*2] 4757 packssdw m15, [r6+32*3] 4758 sub r6, 32*8 4759 packssdw m0, [r6-32*4] 4760 packssdw m1, [r6-32*3] 4761 packssdw m2, [r6-32*2] 4762 packssdw m3, [r6-32*1] 4763 packssdw m4, [r6+32*0] 4764 packssdw m5, [r6+32*1] 4765 packssdw m6, [r6+32*2] 4766 packssdw m7, [r6+32*3] 4767 mova [r6], m8 4768 punpckhwd m8, m0, m1 4769 punpcklwd m0, m1 4770 punpcklwd m1, m2, m3 4771 punpckhwd m2, m3 4772 punpckhwd m3, m6, m7 4773 punpcklwd m6, m7 4774 punpcklwd m7, m4, m5 4775 punpckhwd m4, m5 4776 punpckldq m5, m8, m2 4777 punpckhdq m8, m2 4778 punpckhdq m2, m0, m1 4779 punpckldq m0, m1 4780 punpckhdq m1, m7, m6 4781 punpckldq m7, m6 4782 punpckhdq m6, m4, m3 4783 punpckldq m4, m3 4784 punpckhqdq m3, m2, m1 4785 punpcklqdq m2, m1 4786 punpckhqdq m1, m0, m7 4787 punpcklqdq m0, m7 4788 punpcklqdq m7, m8, m6 4789 punpckhqdq m8, m6 4790 punpckhqdq m6, m5, m4 4791 punpcklqdq m5, m4 4792 mova m4, [r6] 4793 mova [r6], m8 4794 punpcklwd m8, m4, m9 4795 punpckhwd m4, m9 4796 punpcklwd m9, m10, m11 4797 punpckhwd m10, m11 4798 punpckhwd m11, m14, m15 4799 punpcklwd m14, m15 4800 punpckhwd m15, m12, m13 4801 punpcklwd m12, m13 4802 punpckldq m13, m4, m10 4803 punpckhdq m4, m10 4804 punpckhdq m10, m8, m9 4805 punpckldq m8, m9 4806 punpckhdq m9, m12, m14 4807 punpckldq m12, m14 4808 punpckhdq m14, m15, m11 4809 punpckldq m15, m11 4810 punpckhqdq m11, m10, m9 4811 punpcklqdq m10, m9 4812 punpckhqdq m9, m8, m12 4813 punpcklqdq m8, m12 4814 punpcklqdq m12, m13, m15 4815 punpckhqdq m13, m15 4816 punpckhqdq m15, m4, m14 4817 punpcklqdq m14, m4, m14 4818 vperm2i128 m4, m0, m8, 0x31 4819 vinserti128 m0, xm8, 1 4820 vinserti128 m8, m5, xm12, 1 4821 vperm2i128 m12, m5, 0x13 4822 vperm2i128 
m5, m1, m9, 0x31 4823 vinserti128 m1, xm9, 1 4824 vinserti128 m9, m6, xm13, 1 4825 vperm2i128 m13, m6, 0x13 4826 vperm2i128 m6, m2, m10, 0x31 4827 vinserti128 m2, xm10, 1 4828 vinserti128 m10, m7, xm14, 1 4829 vperm2i128 m14, m7, 0x13 4830 vperm2i128 m7, m3, m11, 0x31 4831 vinserti128 m3, xm11, 1 4832 mova xm11, [r6] 4833 vinserti128 m11, xm15, 1 4834 vinserti128 m15, [r6+16], 0 4835 ret 4836.transpose_fast: 4837 call m(idct_16x8_internal_10bpc).transpose2 4838 pxor m8, m8 4839 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 4840 ret 4841ALIGN function_align 4842.main: 4843 mova m0, [cq+64* 1] 4844 mova m1, [cq+64* 3] 4845 mova m2, [cq+64* 5] 4846 mova m3, [cq+64* 7] 4847 mova m4, [cq+64* 9] 4848 mova m5, [cq+64*11] 4849 mova m6, [cq+64*13] 4850 mova m7, [cq+64*15] 4851 call m(idct_8x16_internal_10bpc).main_oddhalf 4852 mova m0, [cq+64* 0] 4853 mova m1, [cq+64* 2] 4854 mova m2, [cq+64* 4] 4855 mova m3, [cq+64* 6] 4856 mova m4, [cq+64* 8] 4857 mova m5, [cq+64*10] 4858 mova m6, [cq+64*12] 4859 mova m7, [cq+64*14] 4860 call m(idct_8x8_internal_10bpc).main 4861 call m(idct_8x16_internal_10bpc).main_evenhalf 4862 psrld m10, m11, 10 ; pd_2 4863 REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 4864 ret 4865 4866INV_TXFM_16X16_FN adst, dct 4867INV_TXFM_16X16_FN adst, adst 4868INV_TXFM_16X16_FN adst, flipadst 4869 4870cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 4871 vpbroadcastd m13, [clip_18b_min] 4872 vpbroadcastd m14, [clip_18b_max] 4873.pass1: 4874 vpbroadcastd m15, [pd_2896] 4875 lea r6, [rsp+32*4] 4876 sub eobd, 36 4877 jl .fast 4878 add cq, 32 4879 call .main 4880 sub cq, 32 4881 vpbroadcastd m8, [pd_5120] 4882 paddd m4, m8 4883 paddd m6, m8 4884 paddd m9, m8 4885 paddd m11, m8 4886 vpbroadcastd m8, [pd_5119] 4887 psubd m5, m8, m5 4888 psubd m7, m8, m7 4889 psubd m10, m8, m10 4890 psubd m12, m8, m12 4891 REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 4892 mova [r6+32*0], m4 4893 mova [r6+32*1], m5 4894 mova [r6+32*2], m6 4895 mova [r6+32*3], m7 4896 psrld m4, m15, 10 ; pd_2 4897 paddd m0, m4 4898 psubd m1, m4, m1 4899 paddd m2, m4 4900 psubd m3, m4, m3 4901 psubd m7, m4, [r6-32*4] 4902 paddd m6, m4, [r6-32*3] 4903 psubd m5, m4, [r6-32*2] 4904 paddd m4, [r6-32*1] 4905 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 4906 mova [r6-32*4], m0 4907 mova [r6-32*3], m1 4908 mova [r6-32*2], m2 4909 mova [r6-32*1], m3 4910 add r6, 32*8 4911 mova [r6-32*4], m9 4912 mova [r6-32*3], m10 4913 mova [r6-32*2], m11 4914 mova [r6-32*1], m12 4915 mova [r6+32*0], m4 4916 mova [r6+32*1], m5 4917 mova [r6+32*2], m6 4918 mova [r6+32*3], m7 4919.fast: 4920 add r6, 32*8 4921 call .main 4922 vpbroadcastd m14, [pd_5120] 4923 vpbroadcastd m13, [pd_5119] 4924 psrld m15, 10 ; pd_2 4925 paddd m0, m15 4926 psubd m1, m15, m1 4927 paddd m2, m15 4928 psubd m3, m15, m3 4929 paddd m4, m14 4930 psubd m5, m13, m5 4931 paddd m6, m14 4932 psubd m7, m13, m7 4933 paddd m8, m14, m9 4934 psubd m9, m13, m10 4935 paddd m10, m14, m11 4936 psubd m11, m13, m12 4937 paddd m12, m15, [r6-32*1] 4938 psubd m13, m15, [r6-32*2] 4939 paddd m14, m15, [r6-32*3] 4940 psubd m15, [r6-32*4] 4941.pass1_end: 4942 REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 4943 REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 4944 sub r6, 32*8 4945 jmp tx2q 4946.pass2: 4947 call m(idct_16x16_internal_10bpc).transpose 4948 lea r6, [pw_5+128] 4949 mova [rsp], m15 4950 call m(iadst_16x16_internal_8bpc).main 4951 call m(iadst_16x16_internal_8bpc).main_pass2_end 4952 mova [rsp+32*0], m8 4953 mova [rsp+32*2], m12 
4954 mova [rsp+32*3], m13 4955 vpbroadcastd m12, [pw_2048] 4956 pxor m13, m13 4957 psubw m13, m12 4958 pmulhrsw m0, m12 4959 pmulhrsw m1, m13, [rsp+32*1] 4960 mova [rsp+32*1], m9 4961 pmulhrsw m2, m12 4962 pmulhrsw m3, m13 4963 call m(idct_16x8_internal_10bpc).write_16x4_start 4964 pmulhrsw m0, m12, m4 4965 pmulhrsw m1, m13, m5 4966 pmulhrsw m2, m12, m6 4967 pmulhrsw m3, m13, m7 4968 call m(idct_16x8_internal_10bpc).write_16x4_zero 4969 pmulhrsw m0, m12, [rsp+32*0] 4970 pmulhrsw m1, m13, [rsp+32*1] 4971 pmulhrsw m2, m12, m10 4972 pmulhrsw m3, m13, m11 4973 call m(idct_16x8_internal_10bpc).write_16x4_zero 4974 pmulhrsw m0, m12, [rsp+32*2] 4975 pmulhrsw m1, m13, [rsp+32*3] 4976 pmulhrsw m2, m12, m14 4977 pmulhrsw m3, m13, m15 4978 call m(idct_16x8_internal_10bpc).write_16x4_zero 4979 RET 4980ALIGN function_align 4981.main: 4982 mova m0, [cq+64* 2] 4983 mova m1, [cq+64*13] 4984 mova m2, [cq+64* 6] 4985 mova m3, [cq+64* 9] 4986 mova m4, [cq+64*10] 4987 mova m5, [cq+64* 5] 4988 mova m6, [cq+64*14] 4989 mova m7, [cq+64* 1] 4990 vpbroadcastd m12, [pd_2048] 4991 call m(iadst_16x8_internal_10bpc).main_part1 4992 mova m0, [cq+64* 0] 4993 mova m1, [cq+64*15] 4994 mova m2, [cq+64* 4] 4995 mova m3, [cq+64*11] 4996 mova m4, [cq+64* 8] 4997 mova m5, [cq+64* 7] 4998 mova m6, [cq+64*12] 4999 mova m7, [cq+64* 3] 5000 jmp m(iadst_16x8_internal_10bpc).main_part2 5001 5002INV_TXFM_16X16_FN flipadst, dct 5003INV_TXFM_16X16_FN flipadst, adst 5004INV_TXFM_16X16_FN flipadst, flipadst 5005 5006cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 5007 vpbroadcastd m13, [clip_18b_min] 5008 vpbroadcastd m14, [clip_18b_max] 5009.pass1: 5010 vpbroadcastd m15, [pd_2896] 5011 lea r6, [rsp+32*4] 5012 sub eobd, 36 5013 jl .fast 5014 add cq, 32 5015 call m(iadst_16x16_internal_10bpc).main 5016 sub cq, 32 5017 vpbroadcastd m8, [pd_5120] 5018 paddd m11, m8 5019 paddd m9, m8 5020 paddd m6, m8 5021 paddd m4, m8 5022 vpbroadcastd m8, [pd_5119] 5023 psubd m12, m8, m12 5024 psubd m10, m8, m10 5025 psubd m7, m8, m7 5026 psubd m5, m8, m5 5027 REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 5028 mova [r6+32*0], m12 5029 mova [r6+32*1], m11 5030 mova [r6+32*2], m10 5031 mova [r6+32*3], m9 5032 psrld m9, m15, 10 ; pd_2 5033 psubd m3, m9, m3 5034 paddd m2, m9 5035 psubd m1, m9, m1 5036 paddd m0, m9 5037 psubd m12, m9, [r6-32*4] 5038 paddd m11, m9, [r6-32*3] 5039 psubd m10, m9, [r6-32*2] 5040 paddd m9, [r6-32*1] 5041 REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 5042 mova [r6-32*4], m12 5043 mova [r6-32*3], m11 5044 mova [r6-32*2], m10 5045 mova [r6-32*1], m9 5046 add r6, 32*8 5047 mova [r6-32*4], m7 5048 mova [r6-32*3], m6 5049 mova [r6-32*2], m5 5050 mova [r6-32*1], m4 5051 mova [r6+32*0], m3 5052 mova [r6+32*1], m2 5053 mova [r6+32*2], m1 5054 mova [r6+32*3], m0 5055.fast: 5056 add r6, 32*8 5057 call m(iadst_16x16_internal_10bpc).main 5058 vpbroadcastd m14, [pd_5120] 5059 vpbroadcastd m13, [pd_5119] 5060 psrld m15, 10 ; pd_2 5061 psubd m8, m13, m7 5062 paddd m7, m14, m9 5063 paddd m9, m14, m6 5064 psubd m6, m13, m10 5065 psubd m10, m13, m5 5066 paddd m5, m14, m11 5067 paddd m11, m14, m4 5068 psubd m4, m13, m12 5069 psubd m12, m15, m3 5070 paddd m3, m15, [r6-32*1] 5071 paddd m13, m15, m2 5072 psubd m2, m15, [r6-32*2] 5073 psubd m14, m15, m1 5074 mova m1, m15 5075 paddd m15, m0 5076 psubd m0, m1, [r6-32*4] 5077 paddd m1, [r6-32*3] 5078 jmp m(iadst_16x16_internal_10bpc).pass1_end 5079.pass2: 5080 call m(idct_16x16_internal_10bpc).transpose 5081 lea r6, [pw_5+128] 5082 mova [rsp], m15 5083 call 
m(iadst_16x16_internal_8bpc).main 5084 call m(iadst_16x16_internal_8bpc).main_pass2_end 5085 mova [rsp+32*3], m3 5086 mova [rsp+32*2], m2 5087 mova [rsp+32*0], m0 5088 mova m2, m13 5089 mova m3, m12 5090 vpbroadcastd m12, [pw_2048] 5091 pxor m13, m13 5092 psubw m13, m12 5093 pmulhrsw m0, m13, m15 5094 pmulhrsw m1, m12, m14 5095 pmulhrsw m2, m13 5096 pmulhrsw m3, m12 5097 mova m14, m8 5098 mova m15, m9 5099 call m(idct_16x8_internal_10bpc).write_16x4_start 5100 pmulhrsw m0, m13, m11 5101 pmulhrsw m1, m12, m10 5102 pmulhrsw m2, m13, m15 5103 pmulhrsw m3, m12, m14 5104 call m(idct_16x8_internal_10bpc).write_16x4_zero 5105 pmulhrsw m0, m13, m7 5106 pmulhrsw m1, m12, m6 5107 pmulhrsw m2, m13, m5 5108 pmulhrsw m3, m12, m4 5109 call m(idct_16x8_internal_10bpc).write_16x4_zero 5110 pmulhrsw m0, m13, [rsp+32*3] 5111 pmulhrsw m1, m12, [rsp+32*2] 5112 pmulhrsw m2, m13, [rsp+32*1] 5113 pmulhrsw m3, m12, [rsp+32*0] 5114 call m(idct_16x8_internal_10bpc).write_16x4_zero 5115 RET 5116 5117INV_TXFM_16X16_FN identity, dct, -92 5118INV_TXFM_16X16_FN identity, identity 5119 5120cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 5121 vpbroadcastd m15, [pd_5793] 5122 vpbroadcastd m7, [pd_5120] 5123 lea r6, [rsp+32*4] 5124 sub eobd, 36 5125 jl .fast 5126 mov r3, -32*8*4 5127.righthalf: 5128 pmulld m0, m15, [cq+r3+32*33] 5129 pmulld m1, m15, [cq+r3+32*35] 5130 pmulld m2, m15, [cq+r3+32*37] 5131 pmulld m3, m15, [cq+r3+32*39] 5132 add r6, 32*4 5133 REPX {paddd x, m7}, m0, m1, m2, m3 5134 REPX {psrad x, 13}, m0, m1, m2, m3 5135 mova [r6+32*0], m0 5136 mova [r6+32*1], m1 5137 mova [r6+32*2], m2 5138 mova [r6+32*3], m3 5139 add r3, 32*8 5140 jl .righthalf 5141.fast: 5142 pmulld m0, m15, [cq+64* 0] 5143 pmulld m1, m15, [cq+64* 1] 5144 pmulld m2, m15, [cq+64* 2] 5145 pmulld m3, m15, [cq+64* 3] 5146 pmulld m4, m15, [cq+64* 4] 5147 pmulld m5, m15, [cq+64* 5] 5148 pmulld m6, m15, [cq+64* 6] 5149 pmulld m8, m15, [cq+64* 7] 5150 mova [cq], m8 5151 pmulld m8, m15, [cq+64* 8] 5152 pmulld m9, m15, [cq+64* 9] 5153 pmulld m10, m15, [cq+64*10] 5154 pmulld m11, m15, [cq+64*11] 5155 pmulld m12, m15, [cq+64*12] 5156 pmulld m13, m15, [cq+64*13] 5157 pmulld m14, m15, [cq+64*14] 5158 pmulld m15, [cq+64*15] 5159 REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ 5160 m8, m9, m10, m11, m12, m13, m14, m15 5161 paddd m7, [cq] 5162 REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ 5163 m8, m9, m10, m11, m12, m13, m14, m15 5164 jmp tx2q 5165.pass2: 5166 call m(idct_16x16_internal_10bpc).transpose 5167 5168 mova [cq+32*0], m15 5169 mova [cq+32*1], m0 5170 vpbroadcastd m15, [pw_1697x16] 5171 5172 REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ 5173 8, 9, 10, 11, 12, 13, 14 5174 mova m0, [cq+32*1] 5175 mova [cq+32*1], m1 5176 IDTX16 0, 1, 15 5177 mova m1, [cq+32*0] 5178 pmulhrsw m15, m1 5179 paddsw m1, m1 5180 paddsw m15, m1 5181 mova m1, [cq+32*1] 5182 jmp m(idct_16x16_internal_10bpc).end 5183 5184INV_TXFM_16X16_FN dct, dct, 0, 12 5185INV_TXFM_16X16_FN dct, identity, 28, 12 5186INV_TXFM_16X16_FN dct, adst, 0, 12 5187INV_TXFM_16X16_FN dct, flipadst, 0, 12 5188 5189cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 5190 vpbroadcastd m12, [clip_20b_min] 5191 vpbroadcastd m13, [clip_20b_max] 5192 jmp m(idct_16x16_internal_10bpc).pass1 5193.pass2: 5194 mova [cq+32* 8], m8 5195 mova [cq+32* 9], m9 5196 mova [cq+32*10], m10 5197 mova [cq+32*11], m11 5198 mova [cq+32*12], m12 5199 mova [cq+32*13], m13 5200 mova [cq+32*14], m14 5201 mova [cq+32*15], m15 5202 call .pass2_main 5203 packssdw m0, m1 
5204 packssdw m1, m2, m3 5205 packssdw m2, m4, m5 5206 packssdw m3, m6, m7 5207 packssdw m4, m8, m9 5208 packssdw m5, m10, m11 5209 packssdw m6, m12, m13 5210 packssdw m7, m14, m15 5211 mova [r6-32*4], m0 5212 mova [r6-32*3], m1 5213 mova [r6-32*2], m2 5214 mova [r6-32*1], m3 5215 mova [r6+32*0], m4 5216 mova [r6+32*1], m5 5217 mova [r6+32*2], m6 5218 mova [r6+32*3], m7 5219 mova m0, [cq+32* 8] 5220 mova m1, [cq+32* 9] 5221 mova m2, [cq+32*10] 5222 mova m3, [cq+32*11] 5223 mova m4, [cq+32*12] 5224 mova m5, [cq+32*13] 5225 mova m6, [cq+32*14] 5226 mova m7, [cq+32*15] 5227 mov r5, r6 5228 add r6, 32*16 5229 call .pass2_main 5230 jmp m(iadst_16x16_internal_12bpc).end 5231ALIGN function_align 5232.write_16x16: 5233 mova [rsp+gprsize+32*0], m8 5234 mova [rsp+gprsize+32*1], m9 5235 mova [rsp+gprsize+32*2], m12 5236 vpbroadcastd m12, [pw_16384] 5237 pmulhrsw m0, m12 5238 pmulhrsw m1, m12 5239 pmulhrsw m2, m12 5240 pmulhrsw m3, m12 5241 call m(idct_16x8_internal_12bpc).write_16x4_start 5242 call m(idct_16x8_internal_10bpc).write_16x4_zero 5243 jmp m(idct_16x16_internal_10bpc).write_16x16_2 5244ALIGN function_align 5245.pass2_main: 5246 call m(idct_8x8_internal_12bpc).transpose_8x8 5247 mova [cq+32* 0], m0 5248 mova [cq+32* 1], m2 5249 mova [cq+32* 2], m4 5250 mova [cq+32* 3], m6 5251 vpbroadcastd m12, [clip_18b_min] 5252 vpbroadcastd m13, [clip_18b_max] 5253 pmaxsd m0, m12, m1 5254 pmaxsd m1, m12, m3 5255 pmaxsd m2, m12, m5 5256 pmaxsd m3, m12, m7 5257 REPX {pminsd x, m13}, m0, m1, m2, m3 5258 test eobd, eobd 5259 jge .pass2_slow 5260 pxor m4, m4 5261 REPX {mova x, m4}, m5, m6, m7 5262 jmp .pass2_fast 5263.pass2_slow: 5264 sub r6, 32*8 5265 mova m8, [r6-32*4] 5266 mova m4, [r6-32*3] 5267 mova m10, [r6-32*2] 5268 mova m5, [r6-32*1] 5269 mova m12, [r6+32*0] 5270 mova m6, [r6+32*1] 5271 mova m14, [r6+32*2] 5272 mova m7, [r6+32*3] 5273 TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 5274 mova [cq+32* 4], m8 5275 mova [cq+32* 5], m10 5276 mova [cq+32* 6], m12 5277 mova [cq+32* 7], m14 5278 vpbroadcastd m12, [clip_18b_min] 5279 vpbroadcastd m13, [clip_18b_max] 5280 REPX {pmaxsd x, m12}, m4, m5, m6, m7 5281 REPX {pminsd x, m13}, m4, m5, m6, m7 5282.pass2_fast: 5283 vpbroadcastd m11, [pd_2048] 5284 vpbroadcastd m14, [pd_2896] 5285 call m(idct_8x16_internal_10bpc).main_oddhalf 5286 pmaxsd m0, m12, [cq+32* 0] 5287 pmaxsd m1, m12, [cq+32* 1] 5288 pmaxsd m2, m12, [cq+32* 2] 5289 pmaxsd m3, m12, [cq+32* 3] 5290 REPX {pminsd x, m13}, m0, m1, m2, m3 5291 test eobd, eobd 5292 jge .pass2_slow2 5293 pxor m4, m4 5294 REPX {mova x, m4}, m5, m6, m7 5295 jmp .pass2_fast2 5296.pass2_slow2: 5297 pmaxsd m4, m12, [cq+32* 4] 5298 pmaxsd m5, m12, [cq+32* 5] 5299 pmaxsd m6, m12, [cq+32* 6] 5300 pmaxsd m7, m12, [cq+32* 7] 5301 REPX {pminsd x, m13}, m4, m5, m6, m7 5302.pass2_fast2: 5303 call m(idct_8x8_internal_10bpc).main 5304 call m(idct_8x16_internal_10bpc).main_evenhalf 5305 psrad m11, 8 ; pd_8 5306 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 5307 call m(idct_16x8_internal_10bpc).pass1_rotations 5308 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ 5309 m8, m9, m10, m11, m12, m13, m14, m15 5310 ret 5311 5312INV_TXFM_16X16_FN adst, dct, 0, 12 5313INV_TXFM_16X16_FN adst, adst, 0, 12 5314INV_TXFM_16X16_FN adst, flipadst, 0, 12 5315 5316cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 5317 vpbroadcastd m13, [clip_20b_min] 5318 vpbroadcastd m14, [clip_20b_max] 5319 jmp m(iadst_16x16_internal_10bpc).pass1 5320.pass2: 5321 call .pass2_part1 5322 call 
m(iadst_16x8_internal_10bpc).pass1_rotations 5323 call .pass2_part2 5324 call m(iadst_16x8_internal_10bpc).pass1_rotations 5325.pass2_part3: 5326 REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 5327 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 5328.end: 5329 packssdw m15, m14 5330 packssdw m14, m13, m12 5331 packssdw m13, m11, m10 5332 packssdw m12, m9, m8 5333 packssdw m11, m7, m6 5334 packssdw m10, m5, m4 5335 packssdw m7, m3, m2 5336 packssdw m6, m1, m0 5337 vpblendd m0, m6, [r5-32*4], 0x33 5338 vpblendd m1, m6, [r5-32*4], 0xcc 5339 vpblendd m2, m7, [r5-32*3], 0x33 5340 vpblendd m3, m7, [r5-32*3], 0xcc 5341 vpermq m0, m0, q3120 5342 vpermq m1, m1, q2031 5343 vpermq m2, m2, q3120 5344 vpermq m3, m3, q2031 5345 call m(idct_16x8_internal_12bpc).write_16x4_start 5346 call m(idct_16x8_internal_10bpc).write_16x4_zero 5347 vpblendd m0, m10, [r5-32*2], 0x33 5348 vpblendd m1, m10, [r5-32*2], 0xcc 5349 vpblendd m2, m11, [r5-32*1], 0x33 5350 vpblendd m3, m11, [r5-32*1], 0xcc 5351 vpermq m0, m0, q3120 5352 vpermq m1, m1, q2031 5353 vpermq m2, m2, q3120 5354 vpermq m3, m3, q2031 5355 call m(idct_16x8_internal_10bpc).write_16x4_zero 5356 vpblendd m0, m12, [r5+32*0], 0x33 5357 vpblendd m1, m12, [r5+32*0], 0xcc 5358 vpblendd m2, m13, [r5+32*1], 0x33 5359 vpblendd m3, m13, [r5+32*1], 0xcc 5360 vpermq m0, m0, q3120 5361 vpermq m1, m1, q2031 5362 vpermq m2, m2, q3120 5363 vpermq m3, m3, q2031 5364 call m(idct_16x8_internal_10bpc).write_16x4_zero 5365 vpblendd m0, m14, [r5+32*2], 0x33 5366 vpblendd m1, m14, [r5+32*2], 0xcc 5367 vpblendd m2, m15, [r5+32*3], 0x33 5368 vpblendd m3, m15, [r5+32*3], 0xcc 5369 vpermq m0, m0, q3120 5370 vpermq m1, m1, q2031 5371 vpermq m2, m2, q3120 5372 vpermq m3, m3, q2031 5373 call m(idct_16x8_internal_10bpc).write_16x4_zero 5374 RET 5375ALIGN function_align 5376.pass2_part1: 5377 mova [cq+32* 8], m8 5378 mova [cq+32* 9], m9 5379 mova [cq+32*10], m10 5380 mova [cq+32*11], m11 5381 mova [cq+32*12], m12 5382 mova [cq+32*13], m13 5383 mova [cq+32*14], m14 5384 mova [cq+32*15], m15 5385.pass2_main: 5386 call m(idct_8x8_internal_12bpc).transpose_8x8 5387 mova [cq+32* 0], m0 5388 mova [cq+32* 1], m3 5389 mova [cq+32* 2], m4 5390 mova [cq+32* 3], m7 5391 vpbroadcastd m13, [clip_18b_min] 5392 vpbroadcastd m14, [clip_18b_max] 5393 pmaxsd m0, m13, m2 5394 pmaxsd m2, m13, m6 5395 pmaxsd m5, m13, m5 5396 pmaxsd m7, m13, m1 5397 REPX {pminsd x, m14}, m0, m2, m5, m7 5398 test eobd, eobd 5399 jge .pass2_slow 5400 pxor m1, m1 5401 REPX {mova x, m1}, m3, m4, m6 5402 jmp .pass2_fast 5403.pass2_slow: 5404 sub r6, 32*8 5405 mova m8, [r6-32*4] 5406 mova m3, [r6-32*3] 5407 mova m4, [r6-32*2] 5408 mova m11, [r6-32*1] 5409 mova m12, [r6+32*0] 5410 mova m1, [r6+32*1] 5411 mova m6, [r6+32*2] 5412 mova m15, [r6+32*3] 5413 TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 5414 mova [cq+32* 4], m8 5415 mova [cq+32* 5], m11 5416 mova [cq+32* 6], m12 5417 mova [cq+32* 7], m15 5418 vpbroadcastd m13, [clip_18b_min] 5419 vpbroadcastd m14, [clip_18b_max] 5420 REPX {pmaxsd x, m13}, m1, m3, m4, m6 5421 REPX {pminsd x, m14}, m1, m3, m4, m6 5422.pass2_fast: 5423 vpbroadcastd m12, [pd_2048] 5424 vpbroadcastd m15, [pd_2896] 5425 call m(iadst_16x8_internal_10bpc).main_part1 5426 pmaxsd m0, m13, [cq+32* 0] ; 0 5427 pmaxsd m7, m13, [cq+32* 1] ; 3 5428 pmaxsd m2, m13, [cq+32* 2] ; 4 5429 pmaxsd m5, m13, [cq+32* 3] ; 7 5430 REPX {pminsd x, m14}, m0, m2, m5, m7 5431 test eobd, eobd 5432 jge .pass2_slow2 5433 pxor m1, m1 5434 REPX {mova x, m1}, m3, m4, m6 5435 jmp .pass2_fast2 5436.pass2_slow2: 
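    ; eob signals a non-zero second half: load the remaining transposed rows
    ; (stashed in the coefficient buffer by .pass2_slow) and clamp them to the
    ; 18-bit intermediate range before the second part of the adst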
5437 pmaxsd m4, m13, [cq+32* 4] ; 8 5438 pmaxsd m3, m13, [cq+32* 5] ; 11 5439 pmaxsd m6, m13, [cq+32* 6] ; 12 5440 pmaxsd m1, m13, [cq+32* 7] ; 15 5441 REPX {pminsd x, m14}, m1, m3, m4, m6 5442.pass2_fast2: 5443 call m(iadst_16x8_internal_10bpc).main_part2 5444 vpbroadcastd m14, [pd_17408] 5445 psrld m15, 11 ; pd_1 5446 psubd m13, m14, m15 ; pd_17407 5447 pslld m15, 3 ; pd_8 5448 ret 5449ALIGN function_align 5450.pass2_part2: 5451 REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 5452 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 5453 packssdw m0, m1 5454 packssdw m1, m2, m3 5455 packssdw m2, m4, m5 5456 packssdw m3, m6, m7 5457 packssdw m4, m8, m9 5458 packssdw m5, m10, m11 5459 packssdw m6, m12, m13 5460 packssdw m7, m14, m15 5461 mova [r6-32*4], m0 5462 mova [r6-32*3], m1 5463 mova [r6-32*2], m2 5464 mova [r6-32*1], m3 5465 mova [r6+32*0], m4 5466 mova [r6+32*1], m5 5467 mova [r6+32*2], m6 5468 mova [r6+32*3], m7 5469 mova m0, [cq+32* 8] 5470 mova m1, [cq+32* 9] 5471 mova m2, [cq+32*10] 5472 mova m3, [cq+32*11] 5473 mova m4, [cq+32*12] 5474 mova m5, [cq+32*13] 5475 mova m6, [cq+32*14] 5476 mova m7, [cq+32*15] 5477 mov r5, r6 5478 add r6, 32*16 5479 jmp .pass2_main 5480 5481INV_TXFM_16X16_FN flipadst, dct, 0, 12 5482INV_TXFM_16X16_FN flipadst, adst, 0, 12 5483INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 5484 5485cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 5486 vpbroadcastd m13, [clip_20b_min] 5487 vpbroadcastd m14, [clip_20b_max] 5488 jmp m(iflipadst_16x16_internal_10bpc).pass1 5489.pass2: 5490 call m(iadst_16x16_internal_12bpc).pass2_part1 5491 call m(iflipadst_16x8_internal_10bpc).pass1_rotations 5492 call m(iadst_16x16_internal_12bpc).pass2_part2 5493 call m(iflipadst_16x8_internal_10bpc).pass1_rotations 5494 jmp m(iadst_16x16_internal_12bpc).pass2_part3 5495 5496INV_TXFM_16X16_FN identity, dct, -92, 12 5497INV_TXFM_16X16_FN identity, identity, 0, 12 5498 5499%macro IDTX16_12BPC 1 ; src 5500 pmulld m6, m7, m%1 5501 paddd m6, m15 5502 psrad m6, 12 5503 paddd m6, m%1 5504 psrad m%1, m6, 1 5505%endmacro 5506 5507cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 5508 vpbroadcastd m7, [pd_1697] 5509 vpbroadcastd m15, [pd_5120] 5510 lea r6, [rsp+32*4] 5511 sub eobd, 36 5512 jl .fast 5513 mov r3, -32*8*4 5514.righthalf: 5515 mova m10, [cq+r3+32*33] 5516 mova m11, [cq+r3+32*35] 5517 mova m12, [cq+r3+32*37] 5518 mova m13, [cq+r3+32*39] 5519 add r6, 32*4 5520 pmulld m0, m7, m10 5521 pmulld m1, m7, m11 5522 pmulld m2, m7, m12 5523 pmulld m3, m7, m13 5524 REPX {paddd x, m15}, m0, m1, m2, m3 5525 REPX {psrad x, 12 }, m0, m1, m2, m3 5526 paddd m0, m10 5527 paddd m1, m11 5528 paddd m2, m12 5529 paddd m3, m13 5530 REPX {psrad x, 1 }, m0, m1, m2, m3 5531 mova [r6+32*0], m0 5532 mova [r6+32*1], m1 5533 mova [r6+32*2], m2 5534 mova [r6+32*3], m3 5535 add r3, 32*8 5536 jl .righthalf 5537.fast: 5538 mova m0, [cq+64* 0] 5539 mova m1, [cq+64* 1] 5540 mova m2, [cq+64* 2] 5541 mova m3, [cq+64* 3] 5542 mova m4, [cq+64* 4] 5543 mova m5, [cq+64* 5] 5544 mova m8, [cq+64* 6] 5545 mova m9, [cq+64* 7] 5546 REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 5547 mova [cq+64*0], m8 5548 mova [cq+64*1], m9 5549 mova m8, [cq+64* 8] 5550 mova m9, [cq+64* 9] 5551 mova m10, [cq+64*10] 5552 mova m11, [cq+64*11] 5553 mova m12, [cq+64*12] 5554 mova m13, [cq+64*13] 5555 mova m14, [cq+64*14] 5556 REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 5557 mova m6, [cq+64*15] 5558 pmulld m7, m6 5559 paddd m7, m15 5560 psrad m7, 12 5561 paddd m7, m6 5562 mova 
m6, [cq+64*0] 5563 psrad m15, m7, 1 5564 mova m7, [cq+64*1] 5565 jmp tx2q 5566.pass2: 5567 call m(iidentity_8x16_internal_12bpc).pass2_main 5568 call m(idct_16x16_internal_10bpc).transpose_fast 5569 test eobd, eobd 5570 jl .pass2_fast 5571 mova [cq+32* 8], m0 5572 mova [cq+32* 9], m1 5573 mova [cq+32*10], m2 5574 mova [cq+32*11], m3 5575 mova [cq+32*12], m4 5576 mova [cq+32*13], m5 5577 mova [cq+32*14], m6 5578 mova [cq+32*15], m7 5579 mova m8, [r6-32*4] 5580 mova m9, [r6-32*3] 5581 mova m10, [r6-32*2] 5582 mova m11, [r6-32*1] 5583 mova m12, [r6+32*0] 5584 mova m13, [r6+32*1] 5585 mova m14, [r6+32*2] 5586 mova m15, [r6+32*3] 5587 sub r6, 32*8 5588 mova m0, [r6-32*4] 5589 mova m1, [r6-32*3] 5590 mova m2, [r6-32*2] 5591 mova m3, [r6-32*1] 5592 mova m4, [r6+32*0] 5593 mova m5, [r6+32*1] 5594 mova m6, [r6+32*2] 5595 mova m7, [r6+32*3] 5596 call m(iidentity_8x16_internal_12bpc).pass2_main 5597 call m(idct_16x8_internal_10bpc).transpose2 5598 mova m8, m0 5599 mova m9, m1 5600 mova m10, m2 5601 mova m11, m3 5602 mova m12, m4 5603 mova m13, m5 5604 mova m14, m6 5605 mova m15, m7 5606 mova m0, [cq+32* 8] 5607 mova m1, [cq+32* 9] 5608 mova m2, [cq+32*10] 5609 mova m3, [cq+32*11] 5610 mova m4, [cq+32*12] 5611 mova m5, [cq+32*13] 5612 mova m6, [cq+32*14] 5613 mova m7, [cq+32*15] 5614.pass2_fast: 5615 call m(idct_16x16_internal_12bpc).write_16x16 5616 RET 5617 5618%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack 5619 mova m%4, [r6+32*(%1-4)] 5620 mova m%2, [r5+32*(3-%1)] 5621 mova m%5, [r4+32*(%1-4)] 5622 psubd m%3, m%1, m%4 ; idct16 out15 - n 5623 paddd m%1, m%4 ; idct16 out0 + n 5624 pmaxsd m%1, m12 5625 pmaxsd m%3, m12 5626 pminsd m%1, m13 5627 pminsd m%3, m13 5628 paddd m%1, m11 5629 paddd m%3, m11 5630 psubd m%4, m%1, m%2 ; out31 - n 5631 paddd m%1, m%2 ; out0 + n 5632 paddd m%2, m%3, m%5 ; out15 - n 5633 psubd m%3, m%5 ; out16 + n 5634 REPX {psrad x, %6}, m%1, m%3, m%2, m%4 5635%if %7 & 1 5636 packssdw m%1, m%3 ; out0 + n, out16 + n 5637 packssdw m%2, m%4 ; out15 - n, out31 - n 5638%endif 5639%endmacro 5640 5641cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob 5642 test eobd, eobd 5643 jz .dconly 5644 PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob 5645%undef cmp 5646 vpbroadcastd m11, [pd_2048] 5647 vpbroadcastd m12, [clip_18b_min] 5648 vpbroadcastd m13, [clip_18b_max] 5649 vbroadcasti128 m14, [idct32_shuf] 5650 mov r4, cq 5651 call .pass1_main 5652 mova [rsp+32*0], m2 5653 mova [rsp+32*1], m3 5654 cmp eobd, 43 5655 jge .eob43 5656 pxor m4, m4 5657 REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 5658 jmp .pass1_end_fast 5659.eob43: 5660 lea r6, [rsp+32*8] 5661 mova [r6-32*4], m0 5662 mova [r6-32*3], m1 5663 call .pass1_main 5664 mova [rsp+32*2], m2 5665 cmp eobd, 107 5666 jge .eob107 5667 mova m11, m3 5668 mova m2, m0 5669 mova m3, m1 5670 mova m0, [r6-32*4] 5671 mova m1, [r6-32*3] 5672 pxor m4, m4 5673.pass1_end_fast: 5674 vpbroadcastd m10, [pw_2048] 5675 lea r6, [deint_shuf+128] 5676 REPX {mova x, m4}, m5, m6, m7 5677 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast 5678 jmp .end 5679.eob107: 5680 mova [rsp+32*3], m3 5681 mova [r6-32*2], m0 5682 mova [r6-32*1], m1 5683 call .pass1_main 5684 cmp eobd, 171 5685 jge .eob171 5686 pshufd m12, m2, q1032 5687 pshufd m13, m3, q1032 5688 mova m4, m0 5689 mova m5, m1 5690 pxor m6, m6 5691 REPX {mova x, m6}, m7, m14, m15 5692 jmp .pass1_end 5693.eob171: 5694 mova [r6+32*0], m0 5695 mova [r6+32*1], m1 5696 mova [r6+32*2], m2 5697 mova [r6+32*3], m3 5698 call .pass1_main 5699 pshufd m12, [r6+32*2], q1032 ; out19 out17 5700 pshufd 
m13, [r6+32*3], q1032 ; out23 out21 5701 mova m4, [r6+32*0] ; out16 out18 5702 mova m5, [r6+32*1] ; out20 out22 5703 pshufd m14, m2, q1032 ; out27 out25 5704 pshufd m15, m3, q1032 ; out31 out29 5705 mova m6, m0 ; out24 out26 5706 mova m7, m1 ; out28 out30 5707.pass1_end: 5708 mova m0, [r6-32*4] ; out0 out2 5709 mova m1, [r6-32*3] ; out4 out6 5710 mova m2, [r6-32*2] ; out8 out10 5711 mova m3, [r6-32*1] ; out12 out14 5712 lea r6, [deint_shuf+128] 5713 mova m11, [rsp+32*3] ; out13 out15 5714 vpbroadcastd m10, [pw_2048] 5715 call m(inv_txfm_add_dct_dct_8x32_8bpc).main 5716.end: ; [rsp+0*32] = m12 5717 vpbroadcastd m12, [pw_2048] 5718 mov cq, r4 5719 mova [rsp+32*1], m8 5720 mova [rsp+32*2], m9 5721 mova [rsp+32*3], m10 5722 mova [rsp+32*4], m11 5723 vpermq m0, m0, q3120 5724 vpermq m1, m1, q2031 5725 pmulhrsw m0, m12 5726 pmulhrsw m1, m12 5727 call m(idct_8x8_internal_10bpc).write_8x4_start 5728 vpermq m0, m2, q3120 5729 vpermq m1, m3, q2031 5730 pmulhrsw m0, m12 5731 pmulhrsw m1, m12 5732 call m(idct_8x8_internal_10bpc).write_8x4 5733 vpermq m0, m4, q3120 5734 vpermq m1, m5, q2031 5735 pmulhrsw m0, m12 5736 pmulhrsw m1, m12 5737 call m(idct_8x8_internal_10bpc).write_8x4 5738 vpermq m0, m6, q3120 5739 vpermq m1, m7, q2031 5740 pmulhrsw m0, m12 5741 pmulhrsw m1, m12 5742 call m(idct_8x8_internal_10bpc).write_8x4 5743 vpermq m0, [rsp+32*1], q3120 5744 vpermq m1, [rsp+32*2], q2031 5745 pmulhrsw m0, m12 5746 pmulhrsw m1, m12 5747 call m(idct_8x8_internal_10bpc).write_8x4 5748 vpermq m0, [rsp+32*3], q3120 5749 vpermq m1, [rsp+32*4], q2031 5750 pmulhrsw m0, m12 5751 pmulhrsw m1, m12 5752 call m(idct_8x8_internal_10bpc).write_8x4 5753 vpermq m0, [rsp+32*0], q3120 5754 vpermq m1, m13, q2031 5755 pmulhrsw m0, m12 5756 pmulhrsw m1, m12 5757 call m(idct_8x8_internal_10bpc).write_8x4 5758 vpermq m0, m14, q3120 5759 vpermq m1, m15, q2031 5760 pmulhrsw m0, m12 5761 pmulhrsw m1, m12 5762 call m(idct_8x8_internal_10bpc).write_8x4 5763 RET 5764.dconly: 5765 imul r6d, [cq], 181 5766 vpbroadcastd m2, [dconly_10bpc] 5767 mov [cq], eobd ; 0 5768 or r3d, 32 5769 add r6d, 640 5770 sar r6d, 10 5771 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 5772ALIGN function_align 5773.pass1_main_part1: 5774 mova m0, [cq+128*0] 5775 mova m1, [cq+128*1] 5776 mova m2, [cq+128*2] 5777 mova m3, [cq+128*3] 5778 mova m4, [cq+128*4] 5779 mova m5, [cq+128*5] 5780 mova m6, [cq+128*6] 5781 mova m7, [cq+128*7] 5782 call m(idct_8x8_internal_10bpc).main 5783 psrld m1, m11, 10 ; pd_2 5784 REPX {paddd x, m1}, m0, m6, m5, m3 5785 paddd m1, m6, m7 ; out1 5786 psubd m6, m7 ; out6 5787 psubd m7, m0, m9 ; out7 5788 paddd m0, m9 ; out0 5789 paddd m2, m5, m4 ; out2 5790 psubd m5, m4 ; out5 5791 psubd m4, m3, m8 ; out4 5792 paddd m3, m8 ; out3 5793 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 5794 ret 5795ALIGN function_align 5796.pass1_main: 5797 call .pass1_main_part1 5798 add cq, 32 5799 packssdw m0, m1 5800 packssdw m2, m3 5801 packssdw m4, m5 5802 packssdw m6, m7 5803 pshufb m0, m14 5804 pshufb m2, m14 5805 pshufb m4, m14 5806 pshufb m6, m14 5807 punpckhdq m3, m0, m2 5808 punpckldq m0, m2 5809 punpckldq m2, m4, m6 5810 punpckhdq m4, m6 5811 vperm2i128 m1, m0, m2, 0x31 ; 4 6 5812 vinserti128 m0, xm2, 1 ; 0 2 5813 vinserti128 m2, m3, xm4, 1 ; 1 3 5814 vperm2i128 m3, m4, 0x31 ; 5 7 5815 ret 5816.main_oddhalf_part1_fast_rect2: 5817 REPX {paddd x, m11}, m0, m1, m2, m3 5818 REPX {psrad x, 12 }, m0, m1, m2, m3 5819.main_oddhalf_part1_fast: ; lower half zero 5820 vpbroadcastd m7, [pd_4091] 5821 vpbroadcastd m8, [pd_201] 5822 vpbroadcastd m6, 
[pd_m1380] 5823 vpbroadcastd m9, [pd_3857] 5824 vpbroadcastd m5, [pd_3703] 5825 vpbroadcastd m10, [pd_1751] 5826 vpbroadcastd m4, [pd_m2751] 5827 vpbroadcastd m15, [pd_3035] 5828 pmulld m7, m0 5829 pmulld m0, m8 5830 pmulld m6, m1 5831 pmulld m1, m9 5832 pmulld m5, m2 5833 pmulld m2, m10 5834 pmulld m4, m3 5835 pmulld m3, m15 5836 jmp .main_oddhalf_part1_fast2 5837.main_oddhalf_part1_rect2: 5838 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 5839 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 5840.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 5841 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a 5842 ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a 5843 ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a 5844 ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a 5845.main_oddhalf_part1_fast2: 5846 REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 5847 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 5848 psubd m8, m0, m4 ; t17 5849 paddd m0, m4 ; t16 5850 psubd m4, m6, m2 ; t18 5851 paddd m6, m2 ; t19 5852 psubd m2, m1, m5 ; t29 5853 paddd m1, m5 ; t28 5854 psubd m5, m7, m3 ; t30 5855 paddd m7, m3 ; t31 5856 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 5857 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 5858 vpbroadcastd m15, [pd_4017] 5859 vpbroadcastd m10, [pd_799] 5860 ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a 5861 ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a 5862 psubd m3, m0, m6 ; t19a 5863 paddd m0, m6 ; t16a 5864 psubd m6, m7, m1 ; t28a 5865 paddd m7, m1 ; t31a 5866 psubd m1, m5, m4 ; t18 5867 paddd m5, m4 ; t17 5868 psubd m4, m8, m2 ; t29 5869 paddd m8, m2 ; t30 5870 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 5871 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 5872 vpbroadcastd m15, [pd_3784] 5873 vpbroadcastd m10, [pd_1567] 5874 ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a 5875 ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 5876 mova [r6-32*4], m0 5877 mova [r6-32*3], m5 5878 mova [r6-32*2], m4 5879 mova [r6-32*1], m6 5880 mova [r6+32*0], m3 5881 mova [r6+32*1], m1 5882 mova [r6+32*2], m8 5883 mova [r6+32*3], m7 5884 ret 5885.main_oddhalf_part2_fast_rect2: 5886 REPX {paddd x, m11}, m0, m1, m2, m3 5887 REPX {psrad x, 12 }, m0, m1, m2, m3 5888.main_oddhalf_part2_fast: ; lower half zero 5889 vpbroadcastd m7, [pd_m601] 5890 vpbroadcastd m8, [pd_4052] 5891 vpbroadcastd m6, [pd_3973] 5892 vpbroadcastd m9, [pd_995] 5893 vpbroadcastd m5, [pd_m2106] 5894 vpbroadcastd m10, [pd_3513] 5895 vpbroadcastd m4, [pd_3290] 5896 vpbroadcastd m15, [pd_2440] 5897 pmulld m7, m0 5898 pmulld m0, m8 5899 pmulld m6, m1 5900 pmulld m1, m9 5901 pmulld m5, m2 5902 pmulld m2, m10 5903 pmulld m4, m3 5904 pmulld m3, m15 5905 jmp .main_oddhalf_part2_fast2 5906.main_oddhalf_part2_rect2: 5907 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 5908 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 5909.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 5910 ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a 5911 ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a 5912 ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a 5913 ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a 5914.main_oddhalf_part2_fast2: 5915 REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 5916 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 5917 psubd m8, m0, m4 ; t25 5918 paddd m0, m4 ; t24 5919 psubd m4, m6, m2 ; t26 5920 paddd m6, m2 ; t27 5921 psubd m2, m1, m5 ; t21 5922 paddd 
m1, m5 ; t20 5923 psubd m5, m7, m3 ; t22 5924 paddd m7, m3 ; t23 5925 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 5926 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 5927 vpbroadcastd m15, [pd_2276] 5928 vpbroadcastd m10, [pd_3406] 5929 ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a 5930 ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a 5931 psubd m3, m0, m6 ; t27a 5932 paddd m0, m6 ; t24a 5933 psubd m6, m7, m1 ; t20a 5934 paddd m7, m1 ; t23a 5935 psubd m1, m5, m4 ; t21 5936 paddd m5, m4 ; t22 5937 psubd m4, m8, m2 ; t26 5938 paddd m8, m2 ; t25 5939 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 5940 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 5941 vpbroadcastd m15, [pd_3784] 5942 vpbroadcastd m10, [pd_1567] 5943 ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a 5944 ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 5945 mova m9, [r6-32*4] ; t16a 5946 mova m10, [r6-32*3] ; t17 5947 psubd m2, m9, m7 ; t23 5948 paddd m9, m7 ; t16 5949 psubd m7, m10, m5 ; t22a 5950 paddd m10, m5 ; t17a 5951 REPX {pmaxsd x, m12}, m9, m10, m2, m7 5952 REPX {pminsd x, m13}, m9, m10, m2, m7 5953 mova [r6-32*4], m9 5954 mova [r6-32*3], m10 5955 mova m9, [r6-32*2] ; t18a 5956 mova m10, [r6-32*1] ; t19 5957 psubd m5, m9, m1 ; t21 5958 paddd m9, m1 ; t18 5959 psubd m1, m10, m6 ; t20a 5960 paddd m10, m6 ; t19a 5961 REPX {pmaxsd x, m12}, m9, m10, m5, m1 5962 REPX {pminsd x, m13}, m9, m10, m5, m1 5963 mova [r6-32*2], m9 5964 mova [r6-32*1], m10 5965 mova m9, [r6+32*0] ; t28 5966 mova m10, [r6+32*1] ; t29a 5967 psubd m6, m9, m3 ; t27a 5968 paddd m9, m3 ; t28a 5969 psubd m3, m10, m4 ; t26 5970 paddd m10, m4 ; t29 5971 REPX {pmaxsd x, m12}, m9, m10, m6, m3 5972 REPX {pminsd x, m13}, m9, m10, m6, m3 5973 REPX {pmulld x, m14}, m6, m3, m1, m5 5974 paddd m6, m11 5975 paddd m3, m11 5976 psubd m4, m6, m1 ; t20 5977 paddd m6, m1 ; t27 5978 psubd m1, m3, m5 ; t21a 5979 paddd m3, m5 ; t26a 5980 REPX {psrad x, 12 }, m4, m1, m3, m6 5981 mova [r6+32*0], m4 5982 mova [r6+32*1], m1 5983 mova m4, [r6+32*2] ; t30 5984 mova m1, [r6+32*3] ; t31a 5985 psubd m5, m4, m8 ; t25a 5986 paddd m4, m8 ; t30a 5987 psubd m8, m1, m0 ; t24 5988 paddd m1, m0 ; t31 5989 REPX {pmaxsd x, m12}, m8, m5, m4, m1 5990 REPX {pminsd x, m13}, m8, m5, m4, m1 5991 REPX {pmulld x, m14}, m5, m8, m7, m2 5992 paddd m5, m11 5993 paddd m8, m11 5994 psubd m0, m5, m7 ; t22 5995 paddd m5, m7 ; t25 5996 psubd m7, m8, m2 ; t23a 5997 paddd m2, m8 ; t24a 5998 REPX {psrad x, 12 }, m0, m7, m2, m5 5999 mova [r6+32*2], m0 6000 mova [r6+32*3], m7 6001 mov r4, r6 6002 add r6, 32*8 6003 mova [r6-32*4], m2 6004 mova [r6-32*3], m5 6005 mova [r6-32*2], m3 6006 mova [r6-32*1], m6 6007 mova [r6+32*0], m9 6008 mova [r6+32*1], m10 6009 mova [r6+32*2], m4 6010 mova [r6+32*3], m1 6011 mov r5, r6 6012 add r6, 32*8 6013 ret 6014ALIGN function_align 6015.main_end: 6016 psrld m11, 10 ; pd_2 6017 IDCT32_END 0, 15, 8, 9, 10, 2 6018 IDCT32_END 1, 14, 8, 9, 10, 2 6019 punpckhwd m8, m0, m1 ; 16 17 6020 punpcklwd m0, m1 ; 0 1 6021 punpcklwd m1, m14, m15 ; 14 15 6022 punpckhwd m14, m15 ; 30 31 6023 mova [r5+32*3], m8 6024 mova [r5+32*2], m14 6025 IDCT32_END 2, 15, 8, 9, 10, 2 6026 IDCT32_END 3, 14, 8, 9, 10, 2 6027 punpckhwd m8, m2, m3 ; 18 19 6028 punpcklwd m2, m3 ; 2 3 6029 punpcklwd m3, m14, m15 ; 12 13 6030 punpckhwd m14, m15 ; 28 29 6031 mova [r5+32*1], m8 6032 mova [r5+32*0], m14 6033 IDCT32_END 4, 15, 8, 9, 10, 2 6034 IDCT32_END 5, 14, 8, 9, 10, 2 6035 punpckhwd m8, m4, m5 ; 20 21 6036 punpcklwd m4, m5 ; 4 5 6037 punpcklwd m5, m14, m15 ; 10 11 6038 
punpckhwd m14, m15 ; 26 27 6039 mova [r5-32*1], m8 6040 mova [r5-32*2], m14 6041 IDCT32_END 6, 15, 8, 9, 10, 2 6042 IDCT32_END 7, 14, 8, 9, 10, 2 6043 punpckhwd m8, m6, m7 ; 22 23 6044 punpcklwd m6, m7 ; 6 7 6045 punpcklwd m7, m14, m15 ; 8 9 6046 punpckhwd m14, m15 ; 24 25 6047 mova [r5-32*3], m8 6048 mova [r5-32*4], m14 6049.transpose: 6050 punpckhdq m15, m3, m1 6051 punpckldq m3, m1 6052 punpckhdq m1, m4, m6 6053 punpckldq m4, m6 6054 punpckhdq m6, m0, m2 6055 punpckldq m0, m2 6056 punpckhdq m2, m7, m5 6057 punpckldq m7, m5 6058 punpcklqdq m5, m2, m15 6059 punpckhqdq m2, m15 6060 punpckhqdq m15, m7, m3 6061 punpcklqdq m7, m3 6062 punpckhqdq m3, m6, m1 6063 punpcklqdq m6, m1 6064 punpckhqdq m1, m0, m4 6065 punpcklqdq m0, m4 6066 vperm2i128 m4, m0, m7, 0x31 6067 vinserti128 m0, xm7, 1 6068 vperm2i128 m7, m3, m2, 0x31 6069 vinserti128 m3, xm2, 1 6070 vinserti128 m2, m6, xm5, 1 6071 vperm2i128 m6, m5, 0x31 6072 vperm2i128 m5, m1, m15, 0x31 6073 vinserti128 m1, xm15, 1 6074 ret 6075 6076cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob 6077 vpbroadcastd m7, [pixel_10bpc_max] 6078.pass1: 6079 vpbroadcastd m5, [pw_5] 6080 pxor m6, m6 6081 mov r6d, eobd 6082 add eobb, 21 6083 cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 6084 lea r6, [strideq*3] 6085 lea r5, [strideq*5] 6086 lea r4, [strideq+r6*2] ; strideq*7 6087.loop: 6088 mova m0, [cq+128*0] 6089 packssdw m0, [cq+128*1] 6090 mova m1, [cq+128*2] 6091 packssdw m1, [cq+128*3] 6092 mova m2, [cq+128*4] 6093 packssdw m2, [cq+128*5] 6094 mova m3, [cq+128*6] 6095 packssdw m3, [cq+128*7] 6096 REPX {paddsw x, m5}, m0, m1, m2, m3 6097 REPX {psraw x, 3 }, m0, m1, m2, m3 6098 call .main_zero 6099 add cq, 32 6100 lea dstq, [dstq+strideq*8] 6101 sub eobd, 64 6102 jge .loop 6103 RET 6104ALIGN function_align 6105.main_zero: 6106 REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 6107.main: 6108 punpckhwd m4, m0, m1 6109 punpcklwd m0, m1 6110 punpckhwd m1, m2, m3 6111 punpcklwd m2, m3 6112 punpckhwd m3, m0, m4 6113 punpcklwd m0, m4 6114 punpckhwd m4, m2, m1 6115 punpcklwd m2, m1 6116 punpckhqdq m1, m0, m2 6117 punpcklqdq m0, m2 6118 punpcklqdq m2, m3, m4 6119 punpckhqdq m3, m4 6120 mova xm4, [dstq+strideq*0] 6121 vinserti128 m4, [dstq+strideq*4], 1 6122 paddw m0, m4 6123 mova xm4, [dstq+strideq*1] 6124 vinserti128 m4, [dstq+r5 ], 1 6125 paddw m1, m4 6126 mova xm4, [dstq+strideq*2] 6127 vinserti128 m4, [dstq+r6*2 ], 1 6128 paddw m2, m4 6129 mova xm4, [dstq+r6 ] 6130 vinserti128 m4, [dstq+r4 ], 1 6131 paddw m3, m4 6132 REPX {pmaxsw x, m6}, m0, m1, m2, m3 6133 REPX {pminsw x, m7}, m0, m1, m2, m3 6134 mova [dstq+strideq*0], xm0 6135 vextracti128 [dstq+strideq*4], m0, 1 6136 mova [dstq+strideq*1], xm1 6137 vextracti128 [dstq+r5 ], m1, 1 6138 mova [dstq+strideq*2], xm2 6139 vextracti128 [dstq+r6*2 ], m2, 1 6140 mova [dstq+r6 ], xm3 6141 vextracti128 [dstq+r4 ], m3, 1 6142 ret 6143 6144cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob 6145 test eobd, eobd 6146 jz .dconly 6147 PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob 6148%undef cmp 6149 vpbroadcastd m11, [pd_2048] 6150 vpbroadcastd m12, [clip_20b_min] 6151 vpbroadcastd m13, [clip_20b_max] 6152 mov r4, cq 6153 lea r6, [rsp+32*4] 6154 call .pass1_main 6155 cmp eobd, 43 6156 jge .eob43 6157 jmp .pass2_fast 6158.eob43: 6159 call .pass1_main 6160 cmp eobd, 107 6161 jge .eob107 6162.pass2_fast: 6163 mov cq, r4 6164 vpbroadcastd m12, [clip_18b_min] 6165 vpbroadcastd m13, [clip_18b_max] 6166 pmaxsd m0, m12, [cq+128*1+ 0] 6167 pmaxsd m1, m12, [cq+128*7+ 0] 6168 pmaxsd m2, 
m12, [cq+128*1+32] 6169 pmaxsd m3, m12, [cq+128*7+32] 6170 REPX {pminsd x, m13}, m0, m1, m2, m3 6171 vpbroadcastd m14, [pd_2896] 6172 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast 6173 pmaxsd m0, m12, [cq+128*3+ 0] 6174 pmaxsd m1, m12, [cq+128*5+ 0] 6175 pmaxsd m2, m12, [cq+128*3+32] 6176 pmaxsd m3, m12, [cq+128*5+32] 6177 REPX {pminsd x, m13}, m0, m1, m2, m3 6178 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast 6179 pmaxsd m0, m12, [cq+128*2+ 0] 6180 pmaxsd m1, m12, [cq+128*6+ 0] 6181 pmaxsd m2, m12, [cq+128*2+32] 6182 pmaxsd m3, m12, [cq+128*6+32] 6183 REPX {pminsd x, m13}, m0, m1, m2, m3 6184 call m(idct_8x16_internal_10bpc).main_oddhalf_fast 6185 pmaxsd m0, m12, [cq+128*0+ 0] 6186 pmaxsd m1, m12, [cq+128*4+ 0] 6187 pmaxsd m2, m12, [cq+128*0+32] 6188 pmaxsd m3, m12, [cq+128*4+32] 6189 REPX {pminsd x, m13}, m0, m1, m2, m3 6190 pxor m4, m4 6191 REPX {mova x, m4}, m5, m6, m7 6192 call m(idct_8x8_internal_10bpc).main 6193 call m(idct_8x16_internal_10bpc).main_evenhalf 6194 jmp .pass2_end 6195.eob107: 6196 call .pass1_main 6197 cmp eobd, 171 6198 jge .eob171 6199 jmp .pass2 6200.eob171: 6201 call .pass1_main 6202.pass2: 6203 mov cq, r4 6204 vpbroadcastd m12, [clip_18b_min] 6205 vpbroadcastd m13, [clip_18b_max] 6206 pmaxsd m0, m12, [cq+128*1+ 0] 6207 pmaxsd m1, m12, [cq+128*7+ 0] 6208 pmaxsd m2, m12, [cq+128*1+32] 6209 pmaxsd m3, m12, [cq+128*7+32] 6210 pmaxsd m4, m12, [cq+128*1+64] 6211 pmaxsd m5, m12, [cq+128*7+64] 6212 pmaxsd m6, m12, [cq+128*1+96] 6213 pmaxsd m7, m12, [cq+128*7+96] 6214 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 6215 vpbroadcastd m14, [pd_2896] 6216 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 6217 pmaxsd m0, m12, [cq+128*3+ 0] 6218 pmaxsd m1, m12, [cq+128*5+ 0] 6219 pmaxsd m2, m12, [cq+128*3+32] 6220 pmaxsd m3, m12, [cq+128*5+32] 6221 pmaxsd m4, m12, [cq+128*3+64] 6222 pmaxsd m5, m12, [cq+128*5+64] 6223 pmaxsd m6, m12, [cq+128*3+96] 6224 pmaxsd m7, m12, [cq+128*5+96] 6225 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 6226 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 6227 pmaxsd m0, m12, [cq+128*2+ 0] 6228 pmaxsd m1, m12, [cq+128*6+ 0] 6229 pmaxsd m2, m12, [cq+128*2+32] 6230 pmaxsd m3, m12, [cq+128*6+32] 6231 pmaxsd m4, m12, [cq+128*2+64] 6232 pmaxsd m5, m12, [cq+128*6+64] 6233 pmaxsd m6, m12, [cq+128*2+96] 6234 pmaxsd m7, m12, [cq+128*6+96] 6235 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 6236 call m(idct_8x16_internal_10bpc).main_oddhalf 6237 pmaxsd m0, m12, [cq+128*0+ 0] 6238 pmaxsd m1, m12, [cq+128*4+ 0] 6239 pmaxsd m2, m12, [cq+128*0+32] 6240 pmaxsd m3, m12, [cq+128*4+32] 6241 pmaxsd m4, m12, [cq+128*0+64] 6242 pmaxsd m5, m12, [cq+128*4+64] 6243 pmaxsd m6, m12, [cq+128*0+96] 6244 pmaxsd m7, m12, [cq+128*4+96] 6245 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 6246 call m(idct_8x8_internal_10bpc).main 6247 call m(idct_8x16_internal_10bpc).main_evenhalf 6248.pass2_end: 6249 psrld m11, 8 ; pd_8 6250 IDCT32_END 0, 15, 8, 9, 10, 4 6251 IDCT32_END 1, 14, 8, 9, 10, 4 6252 punpckhqdq m8, m0, m1 ; 16 17 (interleaved) 6253 punpcklqdq m0, m1 ; 0 1 (interleaved) 6254 punpcklqdq m1, m14, m15 ; 14 15 (interleaved) 6255 punpckhqdq m14, m15 ; 30 31 (interleaved) 6256 mova [r5+32*3], m8 6257 mova [r5+32*2], m14 6258 IDCT32_END 2, 15, 8, 9, 10, 4 6259 IDCT32_END 3, 14, 8, 9, 10, 4 6260 punpckhqdq m8, m2, m3 ; 18 19 (interleaved) 6261 punpcklqdq m2, m3 ; 2 3 (interleaved) 6262 punpcklqdq m3, m14, m15 ; 12 13 (interleaved) 6263 punpckhqdq m14, m15 ; 28 29 (interleaved) 6264 mova [r5+32*1], m8 
6265 mova [r5+32*0], m14 6266 IDCT32_END 4, 15, 8, 9, 10, 4 6267 IDCT32_END 5, 14, 8, 9, 10, 4 6268 punpckhqdq m8, m4, m5 ; 20 21 (interleaved) 6269 punpcklqdq m4, m5 ; 4 5 (interleaved) 6270 punpcklqdq m5, m14, m15 ; 10 11 (interleaved) 6271 punpckhqdq m14, m15 ; 26 27 (interleaved) 6272 mova [r5-32*1], m8 6273 mova [r5-32*2], m14 6274 IDCT32_END 6, 15, 8, 9, 10, 4 6275 IDCT32_END 7, 14, 8, 9, 10, 4 6276 punpckhqdq m8, m6, m7 ; 22 23 (interleaved) 6277 punpcklqdq m6, m7 ; 6 7 (interleaved) 6278 punpcklqdq m7, m14, m15 ; 8 9 (interleaved) 6279 punpckhqdq m14, m15 ; 24 25 (interleaved) 6280 mova [r5-32*3], m8 6281 mova [r5-32*4], m14 6282 mova m15, m1 6283.end: 6284 vpermq m0, m0, q3120 6285 vpermq m1, m2, q3120 6286 call m(idct_8x8_internal_12bpc).write_8x4_start 6287 call m(idct_8x8_internal_10bpc).write_8x4 6288 vpermq m0, m4, q3120 6289 vpermq m1, m6, q3120 6290 call m(idct_8x8_internal_10bpc).write_8x4 6291 vpermq m0, m7, q3120 6292 vpermq m1, m5, q3120 6293 call m(idct_8x8_internal_10bpc).write_8x4 6294 vpermq m0, m3, q3120 6295 vpermq m1, m15, q3120 6296 call m(idct_8x8_internal_10bpc).write_8x4 6297 vpermq m0, [r5+32*3], q3120 6298 vpermq m1, [r5+32*1], q3120 6299 call m(idct_8x8_internal_10bpc).write_8x4 6300 vpermq m0, [r5-32*1], q3120 6301 vpermq m1, [r5-32*3], q3120 6302 call m(idct_8x8_internal_10bpc).write_8x4 6303 vpermq m0, [r5-32*4], q3120 6304 vpermq m1, [r5-32*2], q3120 6305 call m(idct_8x8_internal_10bpc).write_8x4 6306 vpermq m0, [r5+32*0], q3120 6307 vpermq m1, [r5+32*2], q3120 6308 call m(idct_8x8_internal_10bpc).write_8x4 6309 RET 6310.dconly: 6311 imul r6d, [cq], 181 6312 vpbroadcastd m2, [dconly_12bpc] 6313 mov [cq], eobd ; 0 6314 or r3d, 32 6315 add r6d, 640 6316 sar r6d, 10 6317 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 6318ALIGN function_align 6319.pass1_main: 6320 call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 6321 TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 6322 mova [cq+128*0], m0 6323 mova [cq+128*1], m1 6324 mova [cq+128*2], m2 6325 mova [cq+128*3], m3 6326 mova [cq+128*4], m4 6327 mova [cq+128*5], m5 6328 mova [cq+128*6], m6 6329 mova [cq+128*7], m7 6330 add cq, 32 6331 ret 6332ALIGN function_align 6333.main_end: 6334 psrld m11, 10 ; pd_2 6335 IDCT32_END 0, 15, 8, 9, 10, 2, 0 6336 mova [cq+32*16], m8 6337 mova [cq+32*31], m9 6338 IDCT32_END 1, 14, 8, 9, 10, 2, 0 6339 mova [cq+32*17], m8 6340 mova [cq+32*30], m9 6341 mova [cq+32*14], m14 6342 IDCT32_END 2, 14, 8, 9, 10, 2, 0 6343 mova [cq+32*18], m8 6344 mova [cq+32*29], m9 6345 mova [cq+32*13], m14 6346 IDCT32_END 3, 14, 8, 9, 10, 2, 0 6347 mova [cq+32*19], m8 6348 mova [cq+32*28], m9 6349 mova [cq+32*12], m14 6350 IDCT32_END 4, 14, 8, 9, 10, 2, 0 6351 mova [cq+32*20], m8 6352 mova [cq+32*27], m9 6353 mova [cq+32* 0], m0 6354 mova [cq+32* 1], m1 6355 mova [cq+32* 2], m2 6356 IDCT32_END 5, 10, 0, 1, 2, 2, 0 6357 mova [cq+32*21], m0 6358 mova [cq+32*26], m1 6359 IDCT32_END 6, 9, 0, 1, 2, 2, 0 6360 mova [cq+32*22], m0 6361 mova [cq+32*25], m1 6362 IDCT32_END 7, 8, 0, 1, 2, 2, 0 6363 mova [cq+32*23], m0 6364 mova [cq+32*24], m1 6365 mova m0, [cq+32* 0] 6366 mova m1, [cq+32* 1] 6367 mova m2, [cq+32* 2] 6368 mova m11, m14 6369 mova m12, [cq+32*12] 6370 mova m13, [cq+32*13] 6371 mova m14, [cq+32*14] 6372 ret 6373 6374cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob 6375 vpbroadcastd m7, [pixel_12bpc_max] 6376 jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 6377 6378cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob 6379 test eobd, 
eobd 6380 jnz .full 6381 imul r6d, [cq], 181 6382 vpbroadcastd m3, [dconly_10bpc] 6383 mov [cq], eobd ; 0 6384 or r3d, 8 6385.dconly: 6386 add r6d, 640 6387 sar r6d, 10 6388.dconly2: 6389 imul r6d, 181 6390 add r6d, 2176 6391 sar r6d, 12 6392 movd xm0, r6d 6393 paddsw xm0, xm3 6394 vpbroadcastw m0, xm0 6395.dconly_loop: 6396 paddsw m1, m0, [dstq+32*0] 6397 paddsw m2, m0, [dstq+32*1] 6398 psubusw m1, m3 6399 psubusw m2, m3 6400 mova [dstq+32*0], m1 6401 mova [dstq+32*1], m2 6402 add dstq, strideq 6403 dec r3d 6404 jg .dconly_loop 6405 RET 6406.full: 6407 PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob 6408 lea r6, [rsp+32*4] 6409 vpbroadcastd m12, [clip_18b_min] 6410 vpbroadcastd m13, [clip_18b_max] 6411 call .pass1 6412 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end 6413 lea r6, [deint_shuf+128] 6414 vpbroadcastd m11, [pw_2048] 6415 mov r4, dstq 6416 call .pass2 6417 mova m0, [r5+32*3] ; 16 17 6418 mova m1, [r5+32*2] ; 30 31 6419 mova m2, [r5+32*1] ; 18 19 6420 mova m3, [r5+32*0] ; 28 29 6421 mova m4, [r5-32*1] ; 20 21 6422 mova m5, [r5-32*2] ; 26 27 6423 mova m6, [r5-32*3] ; 22 23 6424 mova m7, [r5-32*4] ; 24 25 6425 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 6426 lea dstq, [r4+32] 6427 call .pass2 6428 RET 6429ALIGN function_align 6430.pass2: 6431 call m(idct_16x8_internal_8bpc).main 6432 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 6433 call m(idct_16x8_internal_10bpc).write_16x4_start 6434 pmulhrsw m0, m11, m4 6435 pmulhrsw m1, m11, m5 6436 pmulhrsw m2, m11, m6 6437 pmulhrsw m3, m11, m7 6438 jmp m(idct_16x8_internal_10bpc).write_16x4_zero 6439ALIGN function_align 6440.pass1: 6441 mova m0, [cq+32* 1] 6442 mova m1, [cq+32* 7] 6443 mova m2, [cq+32* 9] 6444 mova m3, [cq+32*15] 6445 mova m4, [cq+32*17] 6446 mova m5, [cq+32*23] 6447 mova m6, [cq+32*25] 6448 mova m7, [cq+32*31] 6449 vpbroadcastd m11, [pd_2048] 6450 vpbroadcastd m14, [pd_2896] 6451 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 6452 mova m0, [cq+32* 3] 6453 mova m1, [cq+32* 5] 6454 mova m2, [cq+32*11] 6455 mova m3, [cq+32*13] 6456 mova m4, [cq+32*19] 6457 mova m5, [cq+32*21] 6458 mova m6, [cq+32*27] 6459 mova m7, [cq+32*29] 6460 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 6461 mova m0, [cq+32* 2] 6462 mova m1, [cq+32* 6] 6463 mova m2, [cq+32*10] 6464 mova m3, [cq+32*14] 6465 mova m4, [cq+32*18] 6466 mova m5, [cq+32*22] 6467 mova m6, [cq+32*26] 6468 mova m7, [cq+32*30] 6469 call m(idct_8x16_internal_10bpc).main_oddhalf 6470 mova m0, [cq+32* 0] 6471 mova m1, [cq+32* 4] 6472 mova m2, [cq+32* 8] 6473 mova m3, [cq+32*12] 6474 mova m4, [cq+32*16] 6475 mova m5, [cq+32*20] 6476 mova m6, [cq+32*24] 6477 mova m7, [cq+32*28] 6478 call m(idct_8x8_internal_10bpc).main 6479 call m(idct_8x16_internal_10bpc).main_evenhalf 6480 ret 6481 6482cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob 6483 vpbroadcastd m7, [pixel_10bpc_max] 6484.pass1: 6485 vpbroadcastd m5, [pw_4096] 6486 pxor m6, m6 6487 mov r6d, eobd 6488 add eobb, 21 6489 cmovc eobd, r6d 6490 lea r6, [strideq*3] 6491 lea r5, [strideq*5] 6492 lea r4, [strideq+r6*2] ; strideq*7 6493.loop: 6494 mova m0, [cq+32*0] 6495 packssdw m0, [cq+32*1] 6496 mova m1, [cq+32*2] 6497 packssdw m1, [cq+32*3] 6498 REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 6499 add cq, 32*8 6500 mova m2, [cq-32*4] 6501 packssdw m2, [cq-32*3] 6502 mova m3, [cq-32*2] 6503 packssdw m3, [cq-32*1] 6504 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 6505 REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 6506 call m(inv_txfm_add_identity_identity_8x32_10bpc).main 6507 add dstq, 16 6508 sub 
eobd, 64 6509 jge .loop 6510 RET 6511 6512cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob 6513 test eobd, eobd 6514 jnz .full 6515 imul r6d, [cq], 181 6516 vpbroadcastd m3, [dconly_12bpc] 6517 mov [cq], eobd ; 0 6518 or r3d, 8 6519 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly 6520.full: 6521 PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob 6522 lea r6, [rsp+32*4] 6523 vpbroadcastd m12, [clip_20b_min] 6524 vpbroadcastd m13, [clip_20b_max] 6525 call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 6526 call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end 6527 mov r4, dstq 6528 call m(idct_16x8_internal_12bpc).pass2_main 6529 mova m0, [cq+32* 0] ; 16 6530 mova m1, [cq+32* 1] ; 17 6531 mova m2, [cq+32* 2] ; 18 6532 mova m3, [cq+32* 3] ; 19 6533 mova m4, [cq+32* 4] ; 20 6534 mova m5, [cq+32* 5] ; 21 6535 mova m6, [cq+32* 6] ; 22 6536 mova m7, [cq+32* 7] ; 23 6537 mova m8, [cq+32* 8] ; 24 6538 mova m9, [cq+32* 9] ; 25 6539 mova m10, [cq+32*10] ; 26 6540 mova m11, [cq+32*11] ; 27 6541 mova m12, [cq+32*12] ; 28 6542 mova m13, [cq+32*13] ; 29 6543 mova m14, [cq+32*14] ; 30 6544 mova m15, [cq+32*15] ; 31 6545 lea dstq, [r4+32] 6546 call m(idct_16x8_internal_12bpc).pass2_main 6547 RET 6548 6549cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob 6550 vpbroadcastd m7, [pixel_12bpc_max] 6551 jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 6552 6553%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] 6554 mova m%4, [%2] 6555 paddsw m%3, m%1, m%4 6556 psubsw m%1, m%4 6557%if %1 == 0 6558 pxor m6, m6 6559%endif 6560 pmulhrsw m%3, m15 6561 pmulhrsw m%1, m15 6562 paddw m%3, [dstq+%5] 6563 paddw m%1, [r2+%6] 6564 pmaxsw m%3, m6 6565 pmaxsw m%1, m6 6566 pminsw m%3, m7 6567 pminsw m%1, m7 6568 mova [dstq+%5], m%3 6569 mova [r2+%6], m%1 6570%endmacro 6571 6572cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob 6573 test eobd, eobd 6574 jz .dconly 6575 PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob 6576%undef cmp 6577 vpbroadcastd m11, [pd_2048] 6578 vpbroadcastd m12, [clip_18b_min] 6579 vpbroadcastd m13, [clip_18b_max] 6580 vpbroadcastd m14, [pd_2896] 6581 lea r6, [rsp+32*16] 6582 lea r4, [r6+32*8] 6583 lea r5, [r6+32*16] 6584 call .main 6585 sub eobd, 44 6586 jge .eob44 6587 vperm2i128 m2, m0, m3, 0x31 ; 5 6588 vinserti128 m0, xm3, 1 ; 1 6589 vperm2i128 m3, m1, m4, 0x31 ; 7 6590 vinserti128 m1, xm4, 1 ; 3 6591 pxor m4, m4 6592 REPX {mova x, m4}, m5, m6, m7 6593 REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 6594 jmp .fast 6595.dconly: 6596 imul r6d, [cq], 181 6597 vpbroadcastd m3, [dconly_10bpc] 6598 mov [cq], eobd ; 0 6599 or r3d, 32 6600 add r6d, 128 6601 sar r6d, 8 6602 imul r6d, 181 6603 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 6604.eob44: 6605 mova [r4+16*0], xm0 6606 mova [r4+16*1], xm3 6607 mova [r4+16*2], xm1 6608 mova [r4+16*3], xm4 6609 vextracti128 [r4+16*4], m0, 1 6610 vextracti128 [r4+16*5], m3, 1 6611 vextracti128 [r4+16*6], m1, 1 6612 vextracti128 [r4+16*7], m4, 1 6613 call .main 6614 sub eobd, 107 6615 jge .eob151 6616 vperm2i128 m7, m1, m4, 0x31 ; 15 6617 vinserti128 m5, m1, xm4, 1 ; 11 6618 vperm2i128 m6, m0, m3, 0x31 ; 13 6619 vinserti128 m4, m0, xm3, 1 ; 9 6620 mova m0, [r4+32*0] 6621 mova m1, [r4+32*1] 6622 mova m2, [r4+32*2] 6623 mova m3, [r4+32*3] 6624.fast: 6625 lea r6, [pw_5+128] 6626 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 6627 pxor m8, m8 6628 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 6629 jmp .idct16 6630.eob151: 6631 mova [r4-16*8], xm0 6632 mova [r4-16*7], xm3 6633 mova [r4-16*6], xm1 
6634 mova [r4-16*5], xm4 6635 vextracti128 [r4-16*4], m0, 1 6636 vextracti128 [r4-16*3], m3, 1 6637 vextracti128 [r4-16*2], m1, 1 6638 vextracti128 [r4-16*1], m4, 1 6639 call .main 6640 sub eobd, 128 6641 jge .eob279 6642 vperm2i128 m10, m0, m3, 0x31 ; 21 6643 vinserti128 m8, m0, xm3, 1 ; 17 6644 vperm2i128 m11, m1, m4, 0x31 ; 23 6645 vinserti128 m9, m1, xm4, 1 ; 19 6646 pxor m12, m12 6647 REPX {mova x, m12}, m13, m14, m15 6648 REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 6649 jmp .full 6650.eob279: 6651 mova [r5+16*0], xm0 6652 mova [r5+16*1], xm3 6653 mova [r5+16*2], xm1 6654 mova [r5+16*3], xm4 6655 vextracti128 [r5+16*4], m0, 1 6656 vextracti128 [r5+16*5], m3, 1 6657 vextracti128 [r5+16*6], m1, 1 6658 vextracti128 [r5+16*7], m4, 1 6659 call .main 6660 vperm2i128 m14, m0, m3, 0x31 ; 29 6661 vinserti128 m12, m0, xm3, 1 ; 25 6662 vperm2i128 m15, m1, m4, 0x31 ; 31 6663 vinserti128 m13, m1, xm4, 1 ; 27 6664 mova m8, [r5+32*0] 6665 mova m9, [r5+32*1] 6666 mova m10, [r5+32*2] 6667 mova m11, [r5+32*3] 6668.full: 6669 mova m0, [r4+32*0] 6670 mova m1, [r4+32*1] 6671 mova m2, [r4+32*2] 6672 mova m3, [r4+32*3] 6673 mova m4, [r4-32*4] 6674 mova m5, [r4-32*3] 6675 mova m6, [r4-32*2] 6676 mova m7, [r4-32*1] 6677 lea r6, [pw_5 + 128] 6678 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 6679 lea r3, [rsp+32*8] 6680 mova m8, [r3+32*0] 6681 mova m9, [r3+32*1] 6682 mova m10, [r3+32*2] 6683 mova m11, [r3+32*3] 6684 mova m12, [r3-32*4] 6685 mova m13, [r3-32*3] 6686 mova m14, [r3-32*2] 6687 mova m15, [r3-32*1] 6688.idct16: 6689 lea r3, [rsp+32*16] 6690 mova m0, [r3+32*0] 6691 mova m1, [r3+32*1] 6692 mova m2, [r3+32*2] 6693 mova m3, [r3+32*3] 6694 mova m4, [r3-32*4] 6695 mova m5, [r3-32*3] 6696 mova m6, [r3-32*2] 6697 mova m7, [r3-32*1] 6698 mova [rsp], m15 6699 call m(idct_16x16_internal_8bpc).main 6700 imul r2, strideq, 19 6701 lea r3, [strideq*3] 6702 add r2, dstq 6703 call .pass2_end 6704 RET 6705ALIGN function_align 6706.main: 6707 pmulld m0, m14, [cq+128* 1] 6708 pmulld m1, m14, [cq+128* 3] 6709 pmulld m2, m14, [cq+128* 5] 6710 pmulld m3, m14, [cq+128* 7] 6711 pmulld m4, m14, [cq+128* 9] 6712 pmulld m5, m14, [cq+128*11] 6713 pmulld m6, m14, [cq+128*13] 6714 pmulld m7, m14, [cq+128*15] 6715 call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 6716 pmulld m0, m14, [cq+128* 0] 6717 pmulld m1, m14, [cq+128* 2] 6718 pmulld m2, m14, [cq+128* 4] 6719 pmulld m3, m14, [cq+128* 6] 6720 pmulld m4, m14, [cq+128* 8] 6721 pmulld m5, m14, [cq+128*10] 6722 pmulld m6, m14, [cq+128*12] 6723 pmulld m7, m14, [cq+128*14] 6724 call m(idct_8x8_internal_10bpc).main_rect2 6725 call m(idct_8x16_internal_10bpc).main_evenhalf 6726 psrld m15, m11, 11 ; pd_1 6727 mova m8, [r6-32*4] 6728 mova m9, [r6-32*3] 6729 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 6730 psubd m10, m0, m8 ; out15 6731 paddd m0, m8 ; out0 6732 mova m8, [r6-32*2] 6733 paddd m15, m1, m9 ; out1 6734 psubd m1, m9 ; out14 6735 mova m9, [r6-32*1] 6736 REPX {psrad x, 1}, m0, m15, m10, m1 6737 packssdw m0, m15 6738 packssdw m1, m10 6739 psubd m10, m2, m8 ; out13 6740 paddd m2, m8 ; out2 6741 mova m8, [r6+32*0] 6742 paddd m15, m3, m9 ; out3 6743 psubd m3, m9 ; out12 6744 mova m9, [r6+32*1] 6745 REPX {psrad x, 1}, m2, m15, m10, m3 6746 packssdw m2, m15 6747 packssdw m3, m10 6748 psubd m10, m4, m8 ; out11 6749 paddd m4, m8 ; out4 6750 mova m8, [r6+32*2] 6751 paddd m15, m5, m9 ; out5 6752 psubd m5, m9 ; out10 6753 mova m9, [r6+32*3] 6754 REPX {psrad x, 1}, m4, m10, m15, m5 6755 packssdw m4, m15 6756 packssdw m5, m10 6757 psubd m10, m6, m8 ; out9 6758 paddd m6, m8 ; 
out6 6759 paddd m15, m7, m9 ; out7 6760 psubd m7, m9 ; out8 6761 REPX {psrad x, 1}, m6, m10, m15, m7 6762 packssdw m6, m15 6763 packssdw m7, m10 6764 punpckhwd m8, m0, m2 6765 punpcklwd m0, m2 6766 punpckhwd m2, m3, m1 6767 punpcklwd m3, m1 6768 punpckhwd m1, m4, m6 6769 punpcklwd m4, m6 6770 punpcklwd m6, m7, m5 6771 punpckhwd m7, m5 6772 pxor m5, m5 6773 mov r7d, 128*13 6774.main_zero_loop: 6775 mova [cq+r7-128*1], m5 6776 mova [cq+r7+128*0], m5 6777 mova [cq+r7+128*1], m5 6778 mova [cq+r7+128*2], m5 6779 sub r7d, 128*4 6780 jg .main_zero_loop 6781 add cq, 32 6782 punpcklwd m5, m3, m2 6783 punpckhwd m3, m2 6784 punpcklwd m2, m4, m1 6785 punpckhwd m4, m1 6786 punpckhwd m1, m0, m8 6787 punpcklwd m0, m8 6788 punpckhwd m8, m6, m7 6789 punpcklwd m6, m7 6790 punpcklqdq m7, m1, m4 6791 punpckhqdq m1, m4 6792 punpckhqdq m4, m8, m3 6793 punpcklqdq m8, m3 6794 punpckhqdq m3, m6, m5 6795 punpcklqdq m6, m5 6796 punpcklqdq m5, m0, m2 6797 punpckhqdq m0, m2 6798 mova [r6+16*0], xm5 6799 mova [r6+16*1], xm6 6800 mova [r6+16*2], xm7 6801 mova [r6+16*3], xm8 6802 vextracti128 [r6+16*4], m5, 1 6803 vextracti128 [r6+16*5], m6, 1 6804 vextracti128 [r6+16*6], m7, 1 6805 vextracti128 [r6+16*7], m8, 1 6806 sub r6, 32*4 6807 ret 6808ALIGN function_align 6809.pass2_end: 6810 mova [rsp+gprsize+32*0], m6 6811 mova [rsp+gprsize+32*2], m7 6812 mova [rsp+gprsize+32*3], m15 6813 vpbroadcastd m15, [pw_2048] 6814 vpbroadcastd m7, [pixel_10bpc_max] 6815 IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 6816 IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 6817 IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 6818 IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 6819 add dstq, strideq 6820 sub r2, strideq 6821 mova m1, [rsp+gprsize+32*1] 6822 IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 6823 IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 6824 IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 6825 IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 6826 add dstq, strideq 6827 sub r2, strideq 6828 mova m1, [rsp+gprsize+32*0] 6829 IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 6830 IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 6831 IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 6832 IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 6833 add dstq, strideq 6834 sub r2, strideq 6835 mova m1, [rsp+gprsize+32*2] 6836 mova m2, [rsp+gprsize+32*3] 6837 IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 6838 IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 6839 IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 6840 IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 6841 ret 6842 6843cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob 6844 vpbroadcastd m7, [pixel_10bpc_max] 6845.pass1: 6846 vpbroadcastd m8, [pw_2896x8] 6847 vpbroadcastd m9, [pw_1697x16] 6848 vpbroadcastd m11, [pw_8192] 6849 lea r6, [strideq*5] 6850 pxor m6, m6 6851 paddw m10, m11, m11 ; pw_16384 6852 mov r5, dstq 6853 call .main 6854 sub eobd, 36 6855 jl .ret 6856 add cq, 128*8 6857 lea dstq, [r5+16] 6858 call .main 6859 sub cq, 128*8-32 6860 lea dstq, [r5+strideq*8] 6861 mov r5, dstq 6862 call .main 6863 sub eobd, 107 ; eob < 143 6864 jl .ret 6865 add cq, 128*8 6866 lea dstq, [r5+16] 6867 call .main 6868 sub cq, 128*8-32 6869 lea dstq, [r5+strideq*8] 6870 mov r5, dstq 6871 call .main 6872 sub eobd, 128 ; eob < 271 6873 jl .ret 6874 add cq, 128*8 6875 lea dstq, [r5+16] 6876 call .main 6877 sub cq, 128*8-32 6878 lea dstq, [r5+strideq*8] 6879 mov r5, dstq 6880 
call .main 6881 sub eobd, 128 ; eob < 399 6882 jl .ret 6883 add cq, 128*8 6884 lea dstq, [r5+16] 6885 call .main 6886.ret: 6887 RET 6888ALIGN function_align 6889.main: 6890 mova m0, [cq+128*0] 6891 packssdw m0, [cq+128*1] 6892 mova m1, [cq+128*2] 6893 packssdw m1, [cq+128*3] 6894 mova m2, [cq+128*4] 6895 packssdw m2, [cq+128*5] 6896 mova m3, [cq+128*6] 6897 packssdw m3, [cq+128*7] 6898 REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 6899 REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 6900 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 6901 REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 6902.main2: 6903 punpckhwd m4, m0, m1 6904 punpcklwd m0, m1 6905 punpckhwd m1, m2, m3 6906 punpcklwd m2, m3 6907 punpckhwd m3, m0, m4 6908 punpcklwd m0, m4 6909 punpcklwd m4, m2, m1 6910 punpckhwd m2, m1 6911 punpckhqdq m1, m0, m4 6912 punpcklqdq m0, m4 6913 call m(iidentity_8x8_internal_10bpc).write_2x8x2 6914 punpcklqdq m0, m3, m2 6915 punpckhqdq m1, m3, m2 6916 jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 6917 6918cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob 6919 vpbroadcastd m7, [pixel_12bpc_max] 6920 jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1 6921 6922cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob 6923 test eobd, eobd 6924 jz .dconly 6925 PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob 6926%undef cmp 6927 vpbroadcastd m12, [clip_18b_min] 6928 vpbroadcastd m13, [clip_18b_max] 6929 lea r6, [rsp+32*4] 6930 call .main 6931 cmp eobd, 36 6932 jge .full 6933 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 6934 pxor m8, m8 6935 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] 6936 lea r6, [pw_5+128] 6937 mov r7, dstq 6938 call m(idct_16x16_internal_8bpc).main 6939 call .write_16x16 6940 mova m0, [r5+32*3] 6941 mova m1, [r5+32*2] 6942 mova m2, [r5+32*1] 6943 mova m3, [r5+32*0] 6944 mova m4, [r5-32*1] 6945 mova m5, [r5-32*2] 6946 mova m6, [r5-32*3] 6947 mova m7, [r5-32*4] 6948 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 6949 pxor m8, m8 6950 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] 6951 jmp .end 6952.dconly: 6953 imul r6d, [cq], 181 6954 vpbroadcastd m3, [dconly_10bpc] 6955 mov [cq], eobd ; 0 6956 or r3d, 16 6957 add r6d, 128 6958 sar r6d, 8 6959 imul r6d, 181 6960 add r6d, 384 6961 sar r6d, 9 6962 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 6963.full: 6964 add cq, 32 6965 mova [r4+32*3], m0 6966 mova [r4+32*2], m1 6967 mova [r4+32*1], m2 6968 mova [r4+32*0], m3 6969 mova [r4-32*1], m4 6970 mova [r4-32*2], m5 6971 mova [r4-32*3], m6 6972 mova [r4-32*4], m7 6973 call .main 6974 sub r4, 32*16 ; topleft 16x8 6975 call .transpose_16x16 6976 lea r6, [pw_5+128] 6977 mov r7, dstq 6978 call m(idct_16x16_internal_8bpc).main 6979 call .write_16x16 6980 mova m0, [r5+32*3] 6981 mova m1, [r5+32*2] 6982 mova m2, [r5+32*1] 6983 mova m3, [r5+32*0] 6984 mova m4, [r5-32*1] 6985 mova m5, [r5-32*2] 6986 mova m6, [r5-32*3] 6987 mova m7, [r5-32*4] 6988 add r4, 32*8 ; bottomleft 16x8 6989 call .transpose_16x16 6990.end: 6991 lea dstq, [r7+32] 6992 call m(idct_16x16_internal_8bpc).main 6993 call .write_16x16 6994 RET 6995ALIGN function_align 6996.transpose_16x16: 6997 punpckhdq m8, m3, m1 6998 punpckldq m3, m1 6999 punpckhdq m1, m0, m2 7000 punpckldq m0, m2 7001 punpckhdq m2, m7, m5 7002 punpckldq m7, m5 7003 punpckhdq m5, m4, m6 7004 punpckldq m4, m6 7005 punpckhqdq m6, m0, m4 7006 punpcklqdq m0, m4 7007 punpckhqdq m4, m1, m5 7008 punpcklqdq m1, m5 7009 punpckhqdq m5, m7, m3 7010 punpcklqdq m7, m3 7011 punpckhqdq m3, m2, m8 7012 punpcklqdq m2, m8 7013 
vinserti128 m8, m0, xm7, 1 7014 vperm2i128 m12, m0, m7, 0x31 7015 vinserti128 m9, m6, xm5, 1 7016 vperm2i128 m13, m6, m5, 0x31 7017 vinserti128 m10, m1, xm2, 1 7018 vperm2i128 m14, m1, m2, 0x31 7019 vinserti128 m11, m4, xm3, 1 7020 vperm2i128 m15, m4, m3, 0x31 7021 mova m0, [r4+32*3] 7022 mova m1, [r4+32*2] 7023 mova m2, [r4+32*1] 7024 mova m3, [r4+32*0] 7025 mova m4, [r4-32*1] 7026 mova m5, [r4-32*2] 7027 mova m6, [r4-32*3] 7028 mova m7, [r4-32*4] 7029 mova [rsp+gprsize], m15 7030 jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 7031ALIGN function_align 7032.main: 7033 vpbroadcastd m14, [pd_2896] 7034 vpbroadcastd m11, [pd_2048] 7035 pmulld m0, m14, [cq+64* 1] 7036 pmulld m1, m14, [cq+64* 7] 7037 pmulld m2, m14, [cq+64* 9] 7038 pmulld m3, m14, [cq+64*15] 7039 pmulld m4, m14, [cq+64*17] 7040 pmulld m5, m14, [cq+64*23] 7041 pmulld m6, m14, [cq+64*25] 7042 pmulld m7, m14, [cq+64*31] 7043 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 7044 pmulld m0, m14, [cq+64* 3] 7045 pmulld m1, m14, [cq+64* 5] 7046 pmulld m2, m14, [cq+64*11] 7047 pmulld m3, m14, [cq+64*13] 7048 pmulld m4, m14, [cq+64*19] 7049 pmulld m5, m14, [cq+64*21] 7050 pmulld m6, m14, [cq+64*27] 7051 pmulld m7, m14, [cq+64*29] 7052 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 7053 pmulld m0, m14, [cq+64* 2] 7054 pmulld m1, m14, [cq+64* 6] 7055 pmulld m2, m14, [cq+64*10] 7056 pmulld m3, m14, [cq+64*14] 7057 pmulld m4, m14, [cq+64*18] 7058 pmulld m5, m14, [cq+64*22] 7059 pmulld m6, m14, [cq+64*26] 7060 pmulld m7, m14, [cq+64*30] 7061 call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 7062 pmulld m0, m14, [cq+64* 0] 7063 pmulld m1, m14, [cq+64* 4] 7064 pmulld m2, m14, [cq+64* 8] 7065 pmulld m3, m14, [cq+64*12] 7066 pmulld m4, m14, [cq+64*16] 7067 pmulld m5, m14, [cq+64*20] 7068 pmulld m6, m14, [cq+64*24] 7069 pmulld m7, m14, [cq+64*28] 7070 call m(idct_8x8_internal_10bpc).main_rect2 7071 call m(idct_8x16_internal_10bpc).main_evenhalf 7072 pxor m8, m8 7073 mov r7d, 64*30 7074.main_zero_loop: 7075 mova [cq+r7-64*2], m8 7076 mova [cq+r7-64*1], m8 7077 mova [cq+r7+64*0], m8 7078 mova [cq+r7+64*1], m8 7079 sub r7d, 64*4 7080 jg .main_zero_loop 7081.main_end: 7082 psrld m11, 11 ; pd_1 7083 IDCT32_END 0, 15, 8, 9, 10, 1 7084 IDCT32_END 1, 14, 8, 9, 10, 1 7085 punpckhwd m8, m0, m1 ; 16 17 7086 punpcklwd m0, m1 ; 0 1 7087 punpcklwd m1, m14, m15 ; 14 15 7088 punpckhwd m14, m15 ; 30 31 7089 mova [r5+32*3], m8 7090 mova [r5+32*2], m14 7091 IDCT32_END 2, 15, 8, 9, 10, 1 7092 IDCT32_END 3, 14, 8, 9, 10, 1 7093 punpckhwd m8, m2, m3 ; 18 19 7094 punpcklwd m2, m3 ; 2 3 7095 punpcklwd m3, m14, m15 ; 12 13 7096 punpckhwd m14, m15 ; 28 29 7097 mova [r5+32*1], m8 7098 mova [r5+32*0], m14 7099 IDCT32_END 4, 15, 8, 9, 10, 1 7100 IDCT32_END 5, 14, 8, 9, 10, 1 7101 punpckhwd m8, m4, m5 ; 20 21 7102 punpcklwd m4, m5 ; 4 5 7103 punpcklwd m5, m14, m15 ; 10 11 7104 punpckhwd m14, m15 ; 26 27 7105 mova [r5-32*1], m8 7106 mova [r5-32*2], m14 7107 IDCT32_END 6, 15, 8, 9, 10, 1 7108 IDCT32_END 7, 14, 8, 9, 10, 1 7109 punpckhwd m8, m6, m7 ; 22 23 7110 punpcklwd m6, m7 ; 6 7 7111 punpcklwd m7, m14, m15 ; 8 9 7112 punpckhwd m14, m15 ; 24 25 7113 mova [r5-32*3], m8 7114 mova [r5-32*4], m14 7115 ret 7116ALIGN function_align 7117.write_16x16: 7118 mova m1, [rsp+gprsize+32*1] 7119 mova [rsp+gprsize+32*0], m8 7120 mova [rsp+gprsize+32*1], m9 7121 mova [rsp+gprsize+32*2], m12 7122 vpbroadcastd m12, [pw_2048] 7123 vpbroadcastd m9, [pixel_10bpc_max] 7124 lea r3, [strideq*3] 7125 pxor m8, m8 7126 pmulhrsw m0, m12 7127 pmulhrsw m1, m12 7128 
pmulhrsw m2, m12 7129 pmulhrsw m3, m12 7130 call m(idct_16x8_internal_10bpc).write_16x4 7131 pmulhrsw m0, m12, m4 7132 pmulhrsw m1, m12, m5 7133 pmulhrsw m2, m12, m6 7134 pmulhrsw m3, m12, m7 7135 call m(idct_16x8_internal_10bpc).write_16x4 7136 pmulhrsw m0, m12, [rsp+gprsize+32*0] 7137 pmulhrsw m1, m12, [rsp+gprsize+32*1] 7138 pmulhrsw m2, m12, m10 7139 pmulhrsw m3, m12, m11 7140 call m(idct_16x8_internal_10bpc).write_16x4 7141 pmulhrsw m0, m12, [rsp+gprsize+32*2] 7142 pmulhrsw m1, m12, m13 7143 pmulhrsw m2, m12, m14 7144 pmulhrsw m3, m12, m15 7145 jmp m(idct_16x8_internal_10bpc).write_16x4 7146 7147cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob 7148 vpbroadcastd m7, [pixel_10bpc_max] 7149.pass1: 7150 vpbroadcastd m8, [pw_2896x8] 7151 vpbroadcastd m9, [pw_1697x16] 7152 vpbroadcastd m10, [pw_4096] 7153 lea r6, [strideq*5] 7154 pxor m6, m6 7155 mov r5, dstq 7156 call .main 7157 sub eobd, 36 7158 jl .ret 7159 add cq, 32 7160 lea dstq, [dstq+strideq*4] 7161 call .main 7162 add cq, 64*8-32 7163 lea dstq, [r5+16*1] 7164 call .main 7165 sub eobd, 107 ; eob < 143 7166 jl .ret 7167 add cq, 32 7168 lea dstq, [dstq+strideq*4] 7169 call .main 7170 add cq, 64*8-32 7171 lea dstq, [r5+16*2] 7172 call .main 7173 sub eobd, 128 ; eob < 271 7174 jl .ret 7175 add cq, 32 7176 lea dstq, [dstq+strideq*4] 7177 call .main 7178 add cq, 64*8-32 7179 lea dstq, [r5+16*3] 7180 call .main 7181 sub eobd, 128 ; eob < 399 7182 jl .ret 7183 add cq, 32 7184 lea dstq, [dstq+strideq*4] 7185 call .main 7186.ret: 7187 RET 7188ALIGN function_align 7189.main: 7190 mova m0, [cq+64*0] 7191 packssdw m0, [cq+64*1] 7192 mova m1, [cq+64*2] 7193 packssdw m1, [cq+64*3] 7194 mova m2, [cq+64*4] 7195 packssdw m2, [cq+64*5] 7196 mova m3, [cq+64*6] 7197 packssdw m3, [cq+64*7] 7198 REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 7199 REPX {paddsw x, x }, m0, m1, m2, m3 7200 REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 7201 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 7202 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 7203 jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 7204 7205cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob 7206 vpbroadcastd m7, [pixel_12bpc_max] 7207 jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1 7208 7209cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob 7210 test eobd, eobd 7211 jz .dconly 7212 PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob 7213%undef cmp 7214 vpbroadcastd m12, [clip_18b_min] 7215 vpbroadcastd m13, [clip_18b_max] 7216 lea r6, [rsp+32*7] 7217 call .main 7218 cmp eobd, 36 7219 jl .fast 7220 call .main 7221 cmp eobd, 136 7222 jl .fast 7223 call .main 7224 cmp eobd, 300 7225 jl .fast 7226 call .main 7227 jmp .pass2 7228.dconly: 7229 imul r6d, [cq], 181 7230 vpbroadcastd m3, [dconly_10bpc] 7231 mov [cq], eobd ; 0 7232 or r3d, 32 7233 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly 7234.fast: 7235 lea r4, [rsp+32*71] 7236 pxor m0, m0 7237.fast_loop: 7238 REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 7239 add r6, 32*8 7240 cmp r6, r4 7241 jl .fast_loop 7242.pass2: 7243 lea r3, [rsp+32*3] 7244 mov r4, r6 7245 lea r5, [r6+32*8] 7246 lea r6, [pw_5+128] 7247 call .pass2_oddhalf 7248 call .pass2_evenhalf 7249 imul r2, strideq, 19 7250 lea r3, [strideq*3] 7251 add r2, dstq 7252 call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end 7253 sub dstq, r3 7254 lea r2, [r2+r3+32] 7255 add dstq, 32 7256 lea r3, [rsp+32*11] 7257 call .pass2_oddhalf 7258 call .pass2_evenhalf 7259 lea r3, [strideq*3] 7260 call 
m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end 7261 RET 7262ALIGN function_align 7263.main: 7264 mova m0, [cq+128* 1] 7265 mova m1, [cq+128* 7] 7266 mova m2, [cq+128* 9] 7267 mova m3, [cq+128*15] 7268 mova m4, [cq+128*17] 7269 mova m5, [cq+128*23] 7270 mova m6, [cq+128*25] 7271 mova m7, [cq+128*31] 7272 vpbroadcastd m11, [pd_2048] 7273 vpbroadcastd m14, [pd_2896] 7274 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 7275 mova m0, [cq+128* 3] 7276 mova m1, [cq+128* 5] 7277 mova m2, [cq+128*11] 7278 mova m3, [cq+128*13] 7279 mova m4, [cq+128*19] 7280 mova m5, [cq+128*21] 7281 mova m6, [cq+128*27] 7282 mova m7, [cq+128*29] 7283 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 7284 mova m0, [cq+128* 2] 7285 mova m1, [cq+128* 6] 7286 mova m2, [cq+128*10] 7287 mova m3, [cq+128*14] 7288 mova m4, [cq+128*18] 7289 mova m5, [cq+128*22] 7290 mova m6, [cq+128*26] 7291 mova m7, [cq+128*30] 7292 call m(idct_8x16_internal_10bpc).main_oddhalf 7293 mova m0, [cq+128* 0] 7294 mova m1, [cq+128* 4] 7295 mova m2, [cq+128* 8] 7296 mova m3, [cq+128*12] 7297 mova m4, [cq+128*16] 7298 mova m5, [cq+128*20] 7299 mova m6, [cq+128*24] 7300 mova m7, [cq+128*28] 7301 call m(idct_8x8_internal_10bpc).main 7302 call m(idct_8x16_internal_10bpc).main_evenhalf 7303 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end 7304 pxor m15, m15 7305 mov r7d, 128*29 7306.main_zero_loop: 7307 mova [cq+r7-128*1], m15 7308 mova [cq+r7+128*0], m15 7309 mova [cq+r7+128*1], m15 7310 mova [cq+r7+128*2], m15 7311 sub r7d, 128*4 7312 jg .main_zero_loop 7313 add cq, 32 7314 mova [r4-32*4], m0 7315 mova [r4-32*3], m1 7316 mova [r4-32*2], m2 7317 mova [r4-32*1], m3 7318 mova [r4+32*0], m4 7319 mova [r4+32*1], m5 7320 mova [r4+32*2], m6 7321 mova [r4+32*3], m7 7322 mova m0, [r5+32*3] 7323 mova m1, [r5+32*2] 7324 mova m2, [r5+32*1] 7325 mova m3, [r5+32*0] 7326 mova m4, [r5-32*1] 7327 mova m5, [r5-32*2] 7328 mova m6, [r5-32*3] 7329 mova m7, [r5-32*4] 7330 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 7331 mova [r5-32*4], m0 7332 mova [r5-32*3], m1 7333 mova [r5-32*2], m2 7334 mova [r5-32*1], m3 7335 mova [r5+32*0], m4 7336 mova [r5+32*1], m5 7337 mova [r5+32*2], m6 7338 mova [r5+32*3], m7 7339 ret 7340ALIGN function_align 7341.pass2_oddhalf: 7342 mova m0, [r3+32* 1] ; 1 7343 mova m1, [r3+32* 3] ; 3 7344 mova m2, [r3+32* 5] ; 5 7345 mova m3, [r3+32* 7] ; 7 7346 mova m4, [r3+32*17] ; 9 7347 mova m5, [r3+32*19] ; 11 7348 mova m6, [r3+32*21] ; 13 7349 mova m7, [r3+32*23] ; 15 7350 mova m8, [r3+32*33] ; 17 7351 mova m9, [r3+32*35] ; 19 7352 mova m10, [r3+32*37] ; 21 7353 mova m11, [r3+32*39] ; 23 7354 mova m12, [r3+32*49] ; 25 7355 mova m13, [r3+32*51] ; 27 7356 mova m14, [r3+32*53] ; 29 7357 mova m15, [r3+32*55] ; 31 7358 jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 7359ALIGN function_align 7360.pass2_evenhalf: 7361 mova m0, [r3+32* 0] ; 0 7362 mova m1, [r3+32* 2] ; 2 7363 mova m2, [r3+32* 4] ; 4 7364 mova m3, [r3+32* 6] ; 6 7365 mova m4, [r3+32*16] ; 8 7366 mova m5, [r3+32*18] ; 10 7367 mova m6, [r3+32*20] ; 12 7368 mova m7, [r3+32*22] ; 14 7369 mova m8, [r3+32*32] ; 16 7370 mova m9, [r3+32*34] ; 18 7371 mova m10, [r3+32*36] ; 20 7372 mova m11, [r3+32*38] ; 22 7373 mova m12, [r3+32*48] ; 24 7374 mova m13, [r3+32*50] ; 26 7375 mova m14, [r3+32*52] ; 28 7376 mova m15, [r3+32*54] ; 30 7377 mova [rsp+gprsize], m15 7378 jmp m(idct_16x16_internal_8bpc).main 7379 7380cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob 7381%undef cmp 7382 vpbroadcastd m7, [pixel_10bpc_max] 7383.pass1: 7384 vpbroadcastd m5, 
[pw_8192] 7385 pxor m6, m6 7386 lea r6, [strideq*3] 7387 lea r5, [strideq*5] 7388 lea r4, [strideq+r6*2] ; strideq*7 7389 call .main ; 0 7390 cmp eobd, 36 7391 jl .ret 7392 add cq, 128*8 ; 0 1 7393 mov r7, dstq ; 1 7394 add dstq, 16 7395 call .main 7396 call .main2 7397 cmp eobd, 136 7398 jl .ret 7399 add cq, 128*16-32 ; 0 1 2 7400 lea dstq, [r7+16*2] ; 1 2 7401 call .main ; 2 7402 call .main2 7403 call .main2 7404 cmp eobd, 300 7405 jl .ret 7406 add cq, 128*24-64 ; 0 1 2 3 7407 add r7, 16*3 ; 1 2 3 7408 mov dstq, r7 ; 2 3 7409 call .main ; 3 7410 call .main2 7411 call .main2 7412 call .main2 7413 cmp eobd, 535 7414 jl .ret 7415 add cq, 128*24-64 ; 0 1 2 3 7416 lea dstq, [r7+strideq*8] ; 1 2 3 4 7417 mov r7, dstq ; 2 3 4 7418 call .main ; 3 4 7419 call .main2 7420 call .main2 7421 cmp eobd, 755 7422 jl .ret 7423 add cq, 128*16-32 ; 0 1 2 3 7424 lea dstq, [r7+strideq*8] ; 1 2 3 4 7425 call .main ; 2 3 4 5 7426 call .main2 ; 3 4 5 7427 cmp eobd, 911 7428 jl .ret 7429 add cq, 128*8 ; 0 1 2 3 7430 add dstq, 16 ; 1 2 3 4 7431 call .main ; 2 3 4 5 7432.ret: ; 3 4 5 6 7433 RET 7434ALIGN function_align 7435.main2: 7436 sub cq, 128*8-32 7437 lea dstq, [dstq+strideq*8-16] 7438.main: 7439 mova m0, [cq+128*0] 7440 packssdw m0, [cq+128*1] 7441 mova m1, [cq+128*2] 7442 packssdw m1, [cq+128*3] 7443 mova m2, [cq+128*4] 7444 packssdw m2, [cq+128*5] 7445 mova m3, [cq+128*6] 7446 packssdw m3, [cq+128*7] 7447 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 7448 jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero 7449 7450cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob 7451 vpbroadcastd m7, [pixel_12bpc_max] 7452 jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1 7453 7454%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) 7455%if %1 & 1 7456 mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n 7457 mova m%4, [r4-32*(14+%1)] ; idct32 out31-n 7458%else 7459 mova m%5, [r4-32*(45-%1)] 7460 mova m%4, [r5-32*(20+%1)] 7461%endif 7462 paddsw m%6, m%5, m%4 ; idct32 out 0+n 7463 psubsw m%5, m%4 ; idct32 out31-n 7464 paddsw m%4, m%5, m%3 ; out31-n 7465 psubsw m%5, m%3 ; out32+n 7466 paddsw m%3, m%6, m%2 ; out 0+n 7467 psubsw m%6, m%2 ; out63-n 7468 REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 7469%if %1 & 1 7470 %define %%d0 r2 7471 %define %%d1 dstq 7472%else 7473 %define %%d0 dstq 7474 %define %%d1 r2 7475%endif 7476 paddw m%3, [%%d0+%7 ] 7477 paddw m%4, [%%d1+%8 ] 7478 paddw m%5, [%%d0+%9 ] 7479 paddw m%6, [%%d1+%10] 7480 pxor m%2, m%2 7481 REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 7482 vpbroadcastd m%2, [pixel_10bpc_max] 7483 REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 7484 mova [%%d0+%7 ], m%3 7485 mova [%%d1+%8 ], m%4 7486 mova [%%d0+%9 ], m%5 7487 mova [%%d1+%10], m%6 7488%endmacro 7489 7490cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob 7491 test eobd, eobd 7492 jz .dconly 7493 PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob 7494%undef cmp 7495 vpbroadcastd m11, [pd_2048] 7496 vpbroadcastd m12, [clip_18b_min] 7497 vpbroadcastd m13, [clip_18b_max] 7498 vpbroadcastd m14, [pd_2896] 7499 lea r6, [rsp+32*6] 7500 call .main 7501 sub eobd, 44 7502 jl .fast 7503 call .main 7504 sub eobd, 107 7505 jl .fast 7506 call .main 7507 sub eobd, 128 7508 jl .fast 7509 call .main 7510 jmp .pass2 7511.dconly: 7512 imul r6d, [cq], 181 7513 vpbroadcastd m3, [dconly_10bpc] 7514 mov [cq], eobd ; 0 7515 or r3d, 64 7516 add r6d, 640 7517 sar r6d, 10 7518 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 7519.fast: 7520 lea r4, [rsp+32*38] 7521 pxor m0, m0 7522.fast_loop: 7523 REPX 
{mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 7524 add r6, 32*8 7525 cmp r6, r4 7526 jl .fast_loop 7527.pass2: 7528 lea r6, [pw_5+128] 7529 mova m0, [rsp+32* 2] ; in0 7530 mova m1, [rsp+32* 6] ; in4 7531 mova m2, [rsp+32*10] ; in8 7532 mova m3, [rsp+32*14] ; in12 7533 mova m4, [rsp+32*18] ; in16 7534 mova m5, [rsp+32*22] ; in20 7535 mova m6, [rsp+32*26] ; in24 7536 mova m7, [rsp+32*30] ; in28 7537 pxor m8, m8 7538 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 7539 mova [rsp], m8 7540 call m(idct_16x16_internal_8bpc).main 7541 mova m1, [rsp+32*1] 7542 lea r4, [rsp+32*38] 7543 mova [r4-32*4], m0 7544 mova [r4-32*3], m1 7545 mova [r4-32*2], m2 7546 mova [r4-32*1], m3 7547 mova [r4+32*0], m4 7548 mova [r4+32*1], m5 7549 mova [r4+32*2], m6 7550 mova [r4+32*3], m7 7551 add r4, 32*8 7552 mova [r4-32*4], m8 7553 mova [r4-32*3], m9 7554 mova [r4-32*2], m10 7555 mova [r4-32*1], m11 7556 mova [r4+32*0], m12 7557 mova [r4+32*1], m13 7558 mova [r4+32*2], m14 7559 mova [r4+32*3], m15 7560 mova m0, [rsp+32* 4] ; in2 7561 mova m1, [rsp+32* 8] ; in6 7562 mova m2, [rsp+32*12] ; in10 7563 mova m3, [rsp+32*16] ; in14 7564 mova m4, [rsp+32*20] ; in18 7565 mova m5, [rsp+32*24] ; in22 7566 mova m6, [rsp+32*28] ; in26 7567 mova m7, [rsp+32*32] ; in30 7568 lea r5, [r4+32*16] 7569 add r4, 32*8 7570 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 7571 mova m0, [rsp+32* 3] ; in1 7572 mova m1, [rsp+32*33] ; in31 7573 mova m2, [rsp+32*19] ; in17 7574 mova m3, [rsp+32*17] ; in15 7575 mova m4, [rsp+32*11] ; in9 7576 mova m5, [rsp+32*25] ; in23 7577 mova m6, [rsp+32*27] ; in25 7578 mova m7, [rsp+32* 9] ; in7 7579 lea r6, [idct64_mul - 8] 7580 add r4, 32*16 7581 add r5, 32*32 7582 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 7583 mova m0, [rsp+32* 7] ; in5 7584 mova m1, [rsp+32*29] ; in27 7585 mova m2, [rsp+32*23] ; in21 7586 mova m3, [rsp+32*13] ; in11 7587 mova m4, [rsp+32*15] ; in13 7588 mova m5, [rsp+32*21] ; in19 7589 mova m6, [rsp+32*31] ; in29 7590 mova m7, [rsp+32* 5] ; in3 7591 add r6, 8 7592 add r4, 32*8 7593 sub r5, 32*8 7594 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 7595 lea r8, [strideq*4] 7596 lea r9, [strideq*5] 7597 lea r3, [r9+strideq*1] ; stride*6 7598 lea r7, [r9+strideq*2] ; stride*7 7599 call .main_part2_pass2 7600 RET 7601ALIGN function_align 7602.main: 7603 mova m0, [cq+128* 1] 7604 mova m1, [cq+128* 3] 7605 mova m2, [cq+128* 5] 7606 mova m3, [cq+128* 7] 7607 mova m4, [cq+128* 9] 7608 mova m5, [cq+128*11] 7609 mova m6, [cq+128*13] 7610 mova m7, [cq+128*15] 7611 call m(idct_8x16_internal_10bpc).main_oddhalf 7612 mova m0, [cq+128* 0] 7613 mova m1, [cq+128* 2] 7614 mova m2, [cq+128* 4] 7615 mova m3, [cq+128* 6] 7616 mova m4, [cq+128* 8] 7617 mova m5, [cq+128*10] 7618 mova m6, [cq+128*12] 7619 mova m7, [cq+128*14] 7620 call m(idct_8x8_internal_10bpc).main 7621 call m(idct_8x16_internal_10bpc).main_evenhalf 7622 pxor m15, m15 7623 mov r7d, 128*13 7624.main_zero_loop: 7625 mova [cq+r7-128*1], m15 7626 mova [cq+r7+128*0], m15 7627 mova [cq+r7+128*1], m15 7628 mova [cq+r7+128*2], m15 7629 sub r7d, 128*4 7630 jg .main_zero_loop 7631 add cq, 32 7632 psrld m15, m11, 10 ; pd_2 7633 mova m8, [r6-32*4] 7634 mova m9, [r6+32*3] 7635 REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 7636 psubd m10, m0, m8 ; out15 7637 paddd m0, m8 ; out0 7638 mova m8, [r6-32*3] 7639 psubd m15, m7, m9 ; out8 7640 paddd m7, m9 ; out7 7641 mova m9, [r6+32*2] 7642 REPX {psrad x, 2}, m0, m15, m10, m7 7643 packssdw m0, m15 7644 packssdw m7, m10 7645 psubd m10, m1, m8 ; out14 7646 paddd m1, m8 ; out1 7647 mova 
m8, [r6-32*2] 7648 psubd m15, m6, m9 ; out9 7649 paddd m6, m9 ; out6 7650 mova m9, [r6+32*1] 7651 REPX {psrad x, 2}, m1, m15, m10, m6 7652 packssdw m1, m15 7653 packssdw m6, m10 7654 psubd m10, m2, m8 ; out13 7655 paddd m2, m8 ; out2 7656 mova m8, [r6-32*1] 7657 psubd m15, m5, m9 ; out10 7658 paddd m5, m9 ; out5 7659 mova m9, [r6+32*0] 7660 REPX {psrad x, 2}, m2, m15, m10, m5 7661 packssdw m2, m15 7662 packssdw m5, m10 7663 psubd m10, m3, m8 ; out12 7664 paddd m3, m8 ; out3 7665 psubd m15, m4, m9 ; out11 7666 paddd m4, m9 ; out4 7667 REPX {psrad x, 2}, m3, m15, m10, m4 7668 packssdw m3, m15 7669 packssdw m4, m10 7670 call m(idct_16x8_internal_10bpc).transpose3 7671 mova [r6-32*4], m0 7672 mova [r6-32*3], m1 7673 mova [r6-32*2], m2 7674 mova [r6-32*1], m3 7675 mova [r6+32*0], m4 7676 mova [r6+32*1], m5 7677 mova [r6+32*2], m6 7678 mova [r6+32*3], m7 7679 add r6, 32*8 7680 ret 7681.main_part2_pass2: 7682 vpbroadcastd m11, [pw_1567_3784] 7683 vpbroadcastd m12, [pw_m3784_1567] 7684 vpbroadcastd m13, [pw_2896_2896] 7685 lea r6, [pw_5+128] 7686 lea r2, [dstq+r7] 7687.main_part2_pass2_loop: 7688 vpbroadcastd m14, [pw_m2896_2896] 7689 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal 7690 vpbroadcastd m14, [pw_2048] 7691 IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 7692 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 7693 IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 7694 IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 7695 add dstq, strideq 7696 sub r2, strideq 7697 cmp r4, r5 7698 jne .main_part2_pass2_loop 7699 ret 7700ALIGN function_align 7701.main_part1_rect2: 7702 REPX {paddd x, m11}, m0, m1, m2, m3 7703 REPX {psrad x, 12 }, m0, m1, m2, m3 7704.main_part1: ; idct64 steps 1-5 7705 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 7706 ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 7707 ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 7708 ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 7709 vpbroadcastd m7, [r5+4*0] 7710 vpbroadcastd m8, [r5+4*1] 7711 vpbroadcastd m6, [r5+4*2] 7712 vpbroadcastd m9, [r5+4*3] 7713 vpbroadcastd m5, [r5+4*4] 7714 vpbroadcastd m10, [r5+4*5] 7715 vpbroadcastd m4, [r5+4*6] 7716 vpbroadcastd m15, [r5+4*7] 7717 pmulld m7, m0 ; t63a 7718 pmulld m0, m8 ; t32a 7719 pmulld m6, m1 ; t62a 7720 pmulld m1, m9 ; t33a 7721 pmulld m5, m2 ; t61a 7722 pmulld m2, m10 ; t34a 7723 pmulld m4, m3 ; t60a 7724 pmulld m3, m15 ; t35a 7725 vpbroadcastd m10, [r5+4*8] 7726 vpbroadcastd m15, [r5+4*9] 7727 REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 7728 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 7729 psubd m8, m0, m1 ; t33 7730 paddd m0, m1 ; t32 7731 psubd m1, m7, m6 ; t62 7732 paddd m7, m6 ; t63 7733 psubd m6, m3, m2 ; t34 7734 paddd m3, m2 ; t35 7735 psubd m2, m4, m5 ; t61 7736 paddd m4, m5 ; t60 7737 REPX {pmaxsd x, m12}, m8, m1, m6, m2 7738 REPX {pminsd x, m13}, m8, m1, m6, m2 7739 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a 7740 ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a 7741 REPX {pmaxsd x, m12}, m0, m3, m7, m4 7742 REPX {pminsd x, m13}, m0, m3, m7, m4 7743 vpbroadcastd m10, [r5+4*10] 7744 vpbroadcastd m15, [r5+4*11] 7745 psubd m5, m0, m3 ; t35a 7746 paddd m0, m3 ; t32a 7747 psubd m3, m7, m4 ; t60a 7748 paddd m7, m4 ; t63a 7749 psubd m4, m1, m6 ; t34 7750 paddd m1, m6 ; t33 7751 psubd m6, m8, m2 ; t61 7752 paddd m8, m2 ; t62 7753 REPX {pmaxsd x, m12}, m5, m3, m4, m6 7754 REPX {pminsd x, m13}, m5, m3, m4, m6 7755 ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 7756 
ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a 7757 REPX {pmaxsd x, m12}, m0, m7, m1, m8 7758 REPX {pminsd x, m13}, m0, m7, m1, m8 7759 add r5, 4*12 7760 mova [r6-32*4], m0 7761 mova [r6+32*3], m7 7762 mova [r6-32*3], m1 7763 mova [r6+32*2], m8 7764 mova [r6-32*2], m6 7765 mova [r6+32*1], m4 7766 mova [r6-32*1], m3 7767 mova [r6+32*0], m5 7768 add r6, 32*8 7769 ret 7770.main_part2: ; idct64 steps 6-9 7771 lea r5, [r6+32*3] 7772 sub r6, 32*4 7773 vpbroadcastd m10, [pd_1567] 7774 vpbroadcastd m15, [pd_3784] 7775.main_part2_loop: 7776 mova m0, [r6-32*32] ; t32a 7777 mova m1, [r5-32*24] ; t39a 7778 mova m2, [r5-32*32] ; t63a 7779 mova m3, [r6-32*24] ; t56a 7780 mova m4, [r6-32*16] ; t40a 7781 mova m5, [r5-32* 8] ; t47a 7782 mova m6, [r5-32*16] ; t55a 7783 mova m7, [r6-32* 8] ; t48a 7784 psubd m8, m0, m1 ; t39 7785 paddd m0, m1 ; t32 7786 psubd m1, m2, m3 ; t56 7787 paddd m2, m3 ; t63 7788 psubd m3, m5, m4 ; t40 7789 paddd m5, m4 ; t47 7790 psubd m4, m7, m6 ; t55 7791 paddd m7, m6 ; t48 7792 REPX {pmaxsd x, m12}, m8, m1, m3, m4 7793 REPX {pminsd x, m13}, m8, m1, m3, m4 7794 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a 7795 ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a 7796 REPX {pmaxsd x, m12}, m0, m2, m5, m7 7797 REPX {pminsd x, m13}, m0, m5, m2, m7 7798 psubd m6, m2, m7 ; t48a 7799 paddd m2, m7 ; t63a 7800 psubd m7, m0, m5 ; t47a 7801 paddd m0, m5 ; t32a 7802 psubd m5, m8, m4 ; t55 7803 paddd m8, m4 ; t56 7804 psubd m4, m1, m3 ; t40 7805 paddd m1, m3 ; t39 7806 REPX {pmaxsd x, m12}, m6, m7, m5, m4 7807 REPX {pminsd x, m13}, m6, m7, m5, m4 7808 REPX {pmulld x, m14}, m6, m7, m5, m4 7809 REPX {pmaxsd x, m12}, m2, m0, m8, m1 7810 REPX {pminsd x, m13}, m2, m0, m8, m1 7811 paddd m6, m11 7812 paddd m5, m11 7813 psubd m3, m6, m7 ; t47 7814 paddd m6, m7 ; t48 7815 psubd m7, m5, m4 ; t40a 7816 paddd m5, m4 ; t55a 7817 REPX {psrad x, 12}, m3, m6, m7, m5 7818 mova [r5-32* 8], m2 7819 mova [r6-32*32], m0 7820 mova [r6-32* 8], m8 7821 mova [r5-32*32], m1 7822 mova [r5-32*24], m3 7823 mova [r6-32*16], m6 7824 mova [r6-32*24], m7 7825 mova [r5-32*16], m5 7826 add r6, 32 7827 sub r5, 32 7828 cmp r6, r5 7829 jl .main_part2_loop 7830 ret 7831 7832cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob 7833 test eobd, eobd 7834 jz .dconly 7835 PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob 7836%undef cmp 7837 vpbroadcastd m12, [clip_18b_min] 7838 vpbroadcastd m13, [clip_18b_max] 7839 lea r6, [rsp+32*6] 7840 call .main 7841 cmp eobd, 36 7842 jl .fast 7843 call .main 7844 cmp eobd, 136 7845 jl .fast 7846 call .main 7847 cmp eobd, 300 7848 jl .fast 7849 call .main 7850 jmp .pass2 7851.dconly: 7852 imul r6d, [cq], 181 7853 vpbroadcastd m3, [dconly_10bpc] 7854 mov [cq], eobd ; 0 7855 or r3d, 64 7856 add r6d, 128 7857 sar r6d, 8 7858 imul r6d, 181 7859 add r6d, 384 7860 sar r6d, 9 7861 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 7862.fast: 7863 lea r4, [rsp+32*70] 7864 pxor m0, m0 7865.fast_loop: 7866 REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 7867 add r6, 32*8 7868 cmp r6, r4 7869 jl .fast_loop 7870.pass2: 7871 lea r6, [pw_5 + 128] 7872 mov r10, rsp 7873 lea r8, [strideq*4] 7874 lea r9, [strideq*5] 7875 lea r3, [r9+strideq*1] ; stride*6 7876 lea r7, [r9+strideq*2] ; stride*7 7877.pass2_loop: 7878 mova m0, [r10+32* 2] ; in0 7879 mova m1, [r10+32* 6] ; in4 7880 mova m2, [r10+32*18] ; in8 7881 mova m3, [r10+32*22] ; in12 7882 mova m4, [r10+32*34] ; in16 7883 mova m5, [r10+32*38] ; in20 7884 mova m6, [r10+32*50] ; in24 7885 mova m7, [r10+32*54] ; in28 7886 pxor m8, m8 
7887 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 7888 mova [rsp], m8 7889 call m(idct_16x16_internal_8bpc).main 7890 mova m1, [rsp+32*1] 7891 lea r4, [rsp+32*70] 7892 mova [r4-32*4], m0 7893 mova [r4-32*3], m1 7894 mova [r4-32*2], m2 7895 mova [r4-32*1], m3 7896 mova [r4+32*0], m4 7897 mova [r4+32*1], m5 7898 mova [r4+32*2], m6 7899 mova [r4+32*3], m7 7900 add r4, 32*8 7901 mova [r4-32*4], m8 7902 mova [r4-32*3], m9 7903 mova [r4-32*2], m10 7904 mova [r4-32*1], m11 7905 mova [r4+32*0], m12 7906 mova [r4+32*1], m13 7907 mova [r4+32*2], m14 7908 mova [r4+32*3], m15 7909 mova m0, [r10+32* 4] ; in2 7910 mova m1, [r10+32* 8] ; in6 7911 mova m2, [r10+32*20] ; in10 7912 mova m3, [r10+32*24] ; in14 7913 mova m4, [r10+32*36] ; in18 7914 mova m5, [r10+32*40] ; in22 7915 mova m6, [r10+32*52] ; in26 7916 mova m7, [r10+32*56] ; in30 7917 lea r5, [r4+32*16] 7918 add r4, 32*8 7919 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 7920 mova m0, [r10+32* 3] ; in1 7921 mova m1, [r10+32*57] ; in31 7922 mova m2, [r10+32*35] ; in17 7923 mova m3, [r10+32*25] ; in15 7924 mova m4, [r10+32*19] ; in9 7925 mova m5, [r10+32*41] ; in23 7926 mova m6, [r10+32*51] ; in25 7927 mova m7, [r10+32* 9] ; in7 7928 lea r6, [idct64_mul - 8] 7929 add r4, 32*16 7930 add r5, 32*32 7931 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 7932 mova m0, [r10+32* 7] ; in5 7933 mova m1, [r10+32*53] ; in27 7934 mova m2, [r10+32*39] ; in21 7935 mova m3, [r10+32*21] ; in11 7936 mova m4, [r10+32*23] ; in13 7937 mova m5, [r10+32*37] ; in19 7938 mova m6, [r10+32*55] ; in29 7939 mova m7, [r10+32* 5] ; in3 7940 add r6, 8 7941 add r4, 32*8 7942 sub r5, 32*8 7943 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 7944 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 7945 add r10, 32*8 7946 sub r4, 32*98 ; rsp+32*16 7947 sub dstq, r8 7948 add dstq, 32 7949 cmp r10, r4 7950 jl .pass2_loop 7951 RET 7952ALIGN function_align 7953.main: 7954 vpbroadcastd m14, [pd_2896] 7955 vpbroadcastd m11, [pd_2048] 7956 pmulld m0, m14, [cq+128* 1] 7957 pmulld m1, m14, [cq+128* 7] 7958 pmulld m2, m14, [cq+128* 9] 7959 pmulld m3, m14, [cq+128*15] 7960 pmulld m4, m14, [cq+128*17] 7961 pmulld m5, m14, [cq+128*23] 7962 pmulld m6, m14, [cq+128*25] 7963 pmulld m7, m14, [cq+128*31] 7964 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 7965 pmulld m0, m14, [cq+128* 3] 7966 pmulld m1, m14, [cq+128* 5] 7967 pmulld m2, m14, [cq+128*11] 7968 pmulld m3, m14, [cq+128*13] 7969 pmulld m4, m14, [cq+128*19] 7970 pmulld m5, m14, [cq+128*21] 7971 pmulld m6, m14, [cq+128*27] 7972 pmulld m7, m14, [cq+128*29] 7973 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 7974 pmulld m0, m14, [cq+128* 2] 7975 pmulld m1, m14, [cq+128* 6] 7976 pmulld m2, m14, [cq+128*10] 7977 pmulld m3, m14, [cq+128*14] 7978 pmulld m4, m14, [cq+128*18] 7979 pmulld m5, m14, [cq+128*22] 7980 pmulld m6, m14, [cq+128*26] 7981 pmulld m7, m14, [cq+128*30] 7982 call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 7983 pmulld m0, m14, [cq+128* 0] 7984 pmulld m1, m14, [cq+128* 4] 7985 pmulld m2, m14, [cq+128* 8] 7986 pmulld m3, m14, [cq+128*12] 7987 pmulld m4, m14, [cq+128*16] 7988 pmulld m5, m14, [cq+128*20] 7989 pmulld m6, m14, [cq+128*24] 7990 pmulld m7, m14, [cq+128*28] 7991 pxor m15, m15 7992 mov r7d, 128*29 7993.main_zero_loop: 7994 mova [cq+r7-128*1], m15 7995 mova [cq+r7+128*0], m15 7996 mova [cq+r7+128*1], m15 7997 mova [cq+r7+128*2], m15 7998 sub r7d, 128*4 7999 jg .main_zero_loop 8000 add cq, 32 8001 call m(idct_8x8_internal_10bpc).main_rect2 8002 call 
m(idct_8x16_internal_10bpc).main_evenhalf 8003 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end 8004 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 8005 mova [r4-32*4], m0 8006 mova [r4-32*3], m1 8007 mova [r4-32*2], m2 8008 mova [r4-32*1], m3 8009 mova [r4+32*0], m4 8010 mova [r4+32*1], m5 8011 mova [r4+32*2], m6 8012 mova [r4+32*3], m7 8013 mova m0, [r5+32*3] 8014 mova m1, [r5+32*2] 8015 mova m2, [r5+32*1] 8016 mova m3, [r5+32*0] 8017 mova m4, [r5-32*1] 8018 mova m5, [r5-32*2] 8019 mova m6, [r5-32*3] 8020 mova m7, [r5-32*4] 8021 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose 8022 mova [r5-32*4], m0 8023 mova [r5-32*3], m1 8024 mova [r5-32*2], m2 8025 mova [r5-32*1], m3 8026 mova [r5+32*0], m4 8027 mova [r5+32*1], m5 8028 mova [r5+32*2], m6 8029 mova [r5+32*3], m7 8030 ret 8031 8032cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob 8033 test eobd, eobd 8034 jnz .normal 8035 imul r6d, [cq], 181 8036 mov [cq], eobd ; 0 8037 or r3d, 16 8038.dconly: 8039 add r6d, 640 8040 sar r6d, 10 8041.dconly2: 8042 vpbroadcastd m5, [dconly_10bpc] 8043 imul r6d, 181 8044 add r6d, 2176 8045 sar r6d, 12 8046 movd xm0, r6d 8047 paddsw xm0, xm5 8048 vpbroadcastw m0, xm0 8049.dconly_loop: 8050 paddsw m1, m0, [dstq+32*0] 8051 paddsw m2, m0, [dstq+32*1] 8052 paddsw m3, m0, [dstq+32*2] 8053 paddsw m4, m0, [dstq+32*3] 8054 REPX {psubusw x, m5}, m1, m2, m3, m4 8055 mova [dstq+32*0], m1 8056 mova [dstq+32*1], m2 8057 mova [dstq+32*2], m3 8058 mova [dstq+32*3], m4 8059 add dstq, strideq 8060 dec r3d 8061 jg .dconly_loop 8062 RET 8063.normal: 8064 PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob 8065%undef cmp 8066 vpbroadcastd m11, [pd_2048] 8067 vpbroadcastd m12, [clip_18b_min] 8068 vpbroadcastd m13, [clip_18b_max] 8069 vpbroadcastd m14, [pd_2896] 8070 lea r6, [rsp+32*4] 8071 call .main 8072 call .shift_transpose 8073 cmp eobd, 36 8074 jl .fast 8075 call .main 8076 call .shift_transpose 8077 jmp .pass2 8078.fast: 8079 pxor m0, m0 8080 mov r3d, 4 8081.fast_loop: 8082 REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 8083 add r6, 32*8 8084 dec r3d 8085 jg .fast_loop 8086.pass2: 8087 lea r7, [r6-32*64] 8088 lea r4, [r6-32*32] 8089 lea r6, [pw_5+128] 8090 mov r5, dstq 8091.pass2_loop: 8092 mova m0, [r7-32*4] 8093 mova m1, [r7-32*3] 8094 mova m2, [r7-32*2] 8095 mova m3, [r7-32*1] 8096 mova m4, [r7+32*0] 8097 mova m5, [r7+32*1] 8098 mova m6, [r7+32*2] 8099 mova m7, [r7+32*3] 8100 add r7, 32*32 8101 mova m8, [r7-32*4] 8102 mova m9, [r7-32*3] 8103 mova m10, [r7-32*2] 8104 mova m11, [r7-32*1] 8105 mova m12, [r7+32*0] 8106 mova m13, [r7+32*1] 8107 mova m14, [r7+32*2] 8108 mova m15, [r7+32*3] 8109 sub r7, 32*24 8110 mova [rsp], m15 8111 call m(idct_16x16_internal_8bpc).main 8112 mova m1, [rsp+32*1] 8113 call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16 8114 add r5, 32 8115 mov dstq, r5 8116 cmp r7, r4 8117 jl .pass2_loop 8118 RET 8119ALIGN function_align 8120.main: 8121 lea r5, [idct64_mul_16bpc] 8122 mova m0, [cq+64* 1] 8123 mova m1, [cq+64*31] 8124 mova m2, [cq+64*17] 8125 mova m3, [cq+64*15] 8126 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 8127 mova m0, [cq+64* 7] 8128 mova m1, [cq+64*25] 8129 mova m2, [cq+64*23] 8130 mova m3, [cq+64* 9] 8131 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 8132 mova m0, [cq+64* 5] 8133 mova m1, [cq+64*27] 8134 mova m2, [cq+64*21] 8135 mova m3, [cq+64*11] 8136 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 8137 mova m0, [cq+64* 3] 8138 mova m1, [cq+64*29] 8139 mova m2, [cq+64*19] 8140 mova m3, [cq+64*13] 8141 call 
m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 8142 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 8143 mova m0, [cq+64* 2] 8144 mova m1, [cq+64*14] 8145 mova m2, [cq+64*18] 8146 mova m3, [cq+64*30] 8147 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast 8148 mova m0, [cq+64* 6] 8149 mova m1, [cq+64*10] 8150 mova m2, [cq+64*22] 8151 mova m3, [cq+64*26] 8152 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast 8153 mova m0, [cq+64* 4] 8154 mova m1, [cq+64*12] 8155 mova m2, [cq+64*20] 8156 mova m3, [cq+64*28] 8157 call m(idct_8x16_internal_10bpc).main_oddhalf_fast 8158 mova m0, [cq+64* 0] 8159 mova m1, [cq+64* 8] 8160 mova m2, [cq+64*16] 8161 mova m3, [cq+64*24] 8162 pxor m15, m15 8163 mov r7d, 64*30 8164.main_zero_loop: 8165 mova [cq+r7-64*2], m15 8166 mova [cq+r7-64*1], m15 8167 mova [cq+r7+64*0], m15 8168 mova [cq+r7+64*1], m15 8169 sub r7d, 64*4 8170 jg .main_zero_loop 8171.main_end: 8172 psrld m15, m11, 10 ; pd_2 8173.main_end2: 8174 add cq, 32 8175 pxor m4, m4 8176 REPX {mova x, m4}, m5, m6, m7 8177 call m(idct_8x8_internal_10bpc).main 8178 add r6, 32*8 8179 call m(idct_8x16_internal_10bpc).main_evenhalf 8180 mova [r6+32*2], m1 8181 mova [r6+32*1], m2 8182 mova [r6+32*0], m3 8183 mova [r6-32*1], m4 8184 mova [r6-32*2], m5 8185 mova [r6-32*3], m6 8186 mova [r6-32*4], m7 8187 jmp .main_end_loop_start 8188.main_end_loop: 8189 mova m0, [r6+32* 3] ; idct8 0 + n 8190.main_end_loop_start: 8191 mova m1, [r5+32* 4] ; idct16 15 - n 8192 mova m2, [r5-32*12] ; idct32 16 + n 8193 mova m3, [r6-32*13] ; idct32 31 - n 8194 mova m4, [r6-32*29] ; idct64 63 - n 8195 mova m5, [r5-32*28] ; idct64 48 + n 8196 mova m6, [r6-32*45] ; idct64 47 - n 8197 mova m7, [r5-32*44] ; idct64 32 + n 8198 paddd m8, m0, m1 ; idct16 out0 + n 8199 psubd m0, m1 ; idct16 out15 - n 8200 REPX {pmaxsd x, m12}, m8, m0 8201 REPX {pminsd x, m13}, m8, m0 8202 paddd m1, m8, m3 ; idct32 out0 + n 8203 psubd m8, m3 ; idct32 out31 - n 8204 paddd m3, m0, m2 ; idct32 out15 - n 8205 psubd m0, m2 ; idct32 out16 + n 8206 REPX {pmaxsd x, m12}, m1, m8, m3, m0 8207 REPX {pminsd x, m13}, m1, m3, m8, m0 8208 REPX {paddd x, m15}, m1, m3, m0, m8 8209 paddd m2, m1, m4 ; idct64 out0 + n (unshifted) 8210 psubd m1, m4 ; idct64 out63 - n (unshifted) 8211 paddd m4, m3, m5 ; idct64 out15 - n (unshifted) 8212 psubd m3, m5 ; idct64 out48 + n (unshifted) 8213 paddd m5, m0, m6 ; idct64 out16 + n (unshifted) 8214 psubd m0, m6 ; idct64 out47 - n (unshifted) 8215 paddd m6, m8, m7 ; idct64 out31 - n (unshifted) 8216 psubd m8, m7 ; idct64 out32 + n (unshifted) 8217 mova [r5-32*44], m2 8218 mova [r6+32* 3], m1 8219 mova [r6-32*45], m4 8220 mova [r5+32* 4], m3 8221 mova [r5-32*28], m5 8222 mova [r6-32*13], m0 8223 mova [r6-32*29], m6 8224 mova [r5-32*12], m8 8225 add r5, 32 8226 sub r6, 32 8227 cmp r5, r6 8228 jl .main_end_loop 8229 ret 8230.shift_transpose: 8231%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift 8232 sub r6, 32*48 8233 mov r5, r6 8234%%loop: 8235 mova m0, [r6-32* 4] 8236 mova m4, [r6+32* 4] 8237 mova m1, [r6-32* 3] 8238 mova m5, [r6+32* 5] 8239 mova m2, [r6-32* 2] 8240 mova m6, [r6+32* 6] 8241 mova m3, [r6-32* 1] 8242 mova m7, [r6+32* 7] 8243 REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 8244 packssdw m0, m4 8245 packssdw m1, m5 8246 packssdw m2, m6 8247 packssdw m3, m7 8248 mova m4, [r6+32* 0] 8249 mova m6, [r6+32* 8] 8250 mova m5, [r6+32* 1] 8251 mova m7, [r6+32* 9] 8252 REPX {psrad x, %1}, m4, m6, m5, m7 8253 packssdw m4, m6 8254 packssdw m5, m7 8255 mova m6, [r6+32* 2] 8256 mova m8, [r6+32*10] 8257 mova m7, [r6+32* 3] 8258 
.shift_transpose:
%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
    sub         r6, 32*48
    mov         r5, r6
%%loop:
    mova        m0, [r6-32* 4]
    mova        m4, [r6+32* 4]
    mova        m1, [r6-32* 3]
    mova        m5, [r6+32* 5]
    mova        m2, [r6-32* 2]
    mova        m6, [r6+32* 6]
    mova        m3, [r6-32* 1]
    mova        m7, [r6+32* 7]
    REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
    packssdw    m0, m4
    packssdw    m1, m5
    packssdw    m2, m6
    packssdw    m3, m7
    mova        m4, [r6+32* 0]
    mova        m6, [r6+32* 8]
    mova        m5, [r6+32* 1]
    mova        m7, [r6+32* 9]
    REPX {psrad x, %1}, m4, m6, m5, m7
    packssdw    m4, m6
    packssdw    m5, m7
    mova        m6, [r6+32* 2]
    mova        m8, [r6+32*10]
    mova        m7, [r6+32* 3]
    mova        m9, [r6+32*11]
    REPX {psrad x, %1}, m6, m8, m7, m9
    packssdw    m6, m8
    packssdw    m7, m9
    call m(idct_16x8_internal_10bpc).transpose3
    mova        [r5-32*4], m0
    mova        [r5-32*3], m1
    mova        [r5-32*2], m2
    mova        [r5-32*1], m3
    mova        [r5+32*0], m4
    mova        [r5+32*1], m5
    mova        [r5+32*2], m6
    mova        [r5+32*3], m7
    add         r6, 32*16
    add         r5, 32*8
    cmp         r5, r4
    jl %%loop
    mov         r6, r4
%endmacro
    IDCT64_SHIFT_TRANSPOSE 2
    ret

cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
    test        eobd, eobd
    jz .dconly
    PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea         r6, [rsp+32*7]
    call .main
    cmp         eobd, 36
    jl .fast
    call .main
    cmp         eobd, 136
    jl .fast
    call .main
    cmp         eobd, 300
    jl .fast
    call .main
    jmp .pass2
.dconly:
    imul        r6d, [cq], 181
    mov         [cq], eobd ; 0
    or          r3d, 32
    add         r6d, 128
    sar         r6d, 8
    imul        r6d, 181
    add         r6d, 384
    sar         r6d, 9
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
.fast:
    pxor        m0, m0
    lea         r4, [rsp+32*135]
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add         r6, 32*8
    cmp         r6, r4
    jl .fast_loop
.pass2:
    lea         r7, [r6-32*32]
    lea         r5, [r6+32*8]
    lea         r6, [pw_5+128]
    imul        r2, strideq, 19
    lea         r3, [strideq*3]
    add         r2, dstq
.pass2_loop:
    mova        m0, [r7-32*99]
    mova        m1, [r7-32*97]
    mova        m2, [r7-32*95]
    mova        m3, [r7-32*93]
    mova        m4, [r7-32*67]
    mova        m5, [r7-32*65]
    mova        m6, [r7-32*63]
    mova        m7, [r7-32*61]
    mova        m8, [r7-32*35]
    mova        m9, [r7-32*33]
    mova        m10, [r7-32*31]
    mova        m11, [r7-32*29]
    mova        m12, [r7-32* 3]
    mova        m13, [r7-32* 1]
    mova        m14, [r7+32* 1]
    mova        m15, [r7+32* 3]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    mova        m0, [r7-32*100]
    mova        m1, [r7-32*98]
    mova        m2, [r7-32*96]
    mova        m3, [r7-32*94]
    mova        m4, [r7-32*68]
    mova        m5, [r7-32*66]
    mova        m6, [r7-32*64]
    mova        m7, [r7-32*62]
    mova        m8, [r7-32*36]
    mova        m9, [r7-32*34]
    mova        m10, [r7-32*32]
    mova        m11, [r7-32*30]
    mova        m12, [r7-32* 4]
    mova        m13, [r7-32* 2]
    mova        m14, [r7+32* 0]
    mova        m15, [r7+32* 2]
    add         r7, 32*8
    mova        [rsp], m15
    call m(idct_16x16_internal_8bpc).main
    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    sub         dstq, r3
    lea         r2, [r2+r3+32]
    add         dstq, 32
    cmp         r7, r4
    jl .pass2_loop
    RET
ALIGN function_align
.main:
    lea         r5, [idct64_mul_16bpc]
    pmulld      m0, m14, [cq+128* 1]
    pmulld      m1, m14, [cq+128*31]
    pmulld      m2, m14, [cq+128*17]
    pmulld      m3, m14, [cq+128*15]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    pmulld      m0, m14, [cq+128* 7]
    pmulld      m1, m14, [cq+128*25]
    pmulld      m2, m14, [cq+128*23]
    pmulld      m3, m14, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    pmulld      m0, m14, [cq+128* 5]
    pmulld      m1, m14, [cq+128*27]
    pmulld      m2, m14, [cq+128*21]
    pmulld      m3, m14, [cq+128*11]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    pmulld      m0, m14, [cq+128* 3]
    pmulld      m1, m14, [cq+128*29]
    pmulld      m2, m14, [cq+128*19]
    pmulld      m3, m14, [cq+128*13]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
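    ; The even input rows below get the same rectangular-transform (2:1)
    ; pre-scaling as the odd rows above: pmulld by m14 (pd_2896, i.e.
    ; round(4096/sqrt(2))) with the 12-bit rounding shift done either in the
    ; _rect2 helper variants or explicitly (paddd/psrad) for the final group.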
    pmulld      m0, m14, [cq+128* 2]
    pmulld      m1, m14, [cq+128*14]
    pmulld      m2, m14, [cq+128*18]
    pmulld      m3, m14, [cq+128*30]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
    pmulld      m0, m14, [cq+128* 6]
    pmulld      m1, m14, [cq+128*10]
    pmulld      m2, m14, [cq+128*22]
    pmulld      m3, m14, [cq+128*26]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
    pmulld      m0, m14, [cq+128* 4]
    pmulld      m1, m14, [cq+128*12]
    pmulld      m2, m14, [cq+128*20]
    pmulld      m3, m14, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
    pmulld      m0, m14, [cq+128* 0]
    pmulld      m1, m14, [cq+128* 8]
    pmulld      m2, m14, [cq+128*16]
    pmulld      m3, m14, [cq+128*24]
    pxor        m15, m15
    mov         r7d, 128*29
.main_zero_loop:
    mova        [cq+r7-128*1], m15
    mova        [cq+r7+128*0], m15
    mova        [cq+r7+128*1], m15
    mova        [cq+r7+128*2], m15
    sub         r7d, 128*4
    jg .main_zero_loop
    psrld       m15, m11, 11 ; pd_1
    REPX {paddd x, m11}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
    IDCT64_SHIFT_TRANSPOSE 1
    ret

cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
    test        eobd, eobd
    jz .dconly
    PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea         r6, [rsp+32*7]
    call .main
    cmp         eobd, 36
    jl .fast
    call .main
    cmp         eobd, 136
    jl .fast
    call .main
    cmp         eobd, 300
    jl .fast
    call .main
    jmp .pass2
.dconly:
    imul        r6d, [cq], 181
    mov         [cq], eobd ; 0
    or          r3d, 64
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
.fast:
    pxor        m0, m0
    lea         r4, [rsp+32*135]
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add         r6, 32*8
    cmp         r6, r4
    jl .fast_loop
.pass2:
    lea         r10, [r6-32*32]
    lea         r6, [pw_5+128]
    lea         r8, [strideq*4]
    lea         r9, [strideq*5]
    lea         r3, [r9+strideq*1] ; stride*6
    lea         r7, [r9+strideq*2] ; stride*7
.pass2_loop:
    mova        m0, [r10-32*100] ; in0
    mova        m1, [r10-32*96] ; in4
    mova        m2, [r10-32*68] ; in8
    mova        m3, [r10-32*64] ; in12
    mova        m4, [r10-32*36] ; in16
    mova        m5, [r10-32*32] ; in20
    mova        m6, [r10-32* 4] ; in24
    mova        m7, [r10+32* 0] ; in28
    pxor        m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova        [rsp], m8
    call m(idct_16x16_internal_8bpc).main
    mova        m1, [rsp+32*1]
    mova        [r4-32*4], m0
    mova        [r4-32*3], m1
    mova        [r4-32*2], m2
    mova        [r4-32*1], m3
    mova        [r4+32*0], m4
    mova        [r4+32*1], m5
    mova        [r4+32*2], m6
    mova        [r4+32*3], m7
    add         r4, 32*8
    mova        [r4-32*4], m8
    mova        [r4-32*3], m9
    mova        [r4-32*2], m10
    mova        [r4-32*1], m11
    mova        [r4+32*0], m12
    mova        [r4+32*1], m13
    mova        [r4+32*2], m14
    mova        [r4+32*3], m15
    mova        m0, [r10-32*98] ; in2
    mova        m1, [r10-32*94] ; in6
    mova        m2, [r10-32*66] ; in10
    mova        m3, [r10-32*62] ; in14
    mova        m4, [r10-32*34] ; in18
    mova        m5, [r10-32*30] ; in22
    mova        m6, [r10-32* 2] ; in26
    mova        m7, [r10+32* 2] ; in30
    lea         r5, [r4+32*16]
    add         r4, 32*8
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
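    ; The odd rows (in1..in31) of this strip feed the 64-point odd half:
    ; two batches through the 8bpc 16x64 main_part1, then main_part2_pass2.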
    mova        m0, [r10-32*99] ; in1
    mova        m1, [r10+32* 3] ; in31
    mova        m2, [r10-32*35] ; in17
    mova        m3, [r10-32*61] ; in15
    mova        m4, [r10-32*67] ; in9
    mova        m5, [r10-32*29] ; in23
    mova        m6, [r10-32* 3] ; in25
    mova        m7, [r10-32*93] ; in7
    lea         r6, [idct64_mul - 8]
    add         r4, 32*16
    add         r5, 32*32
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    mova        m0, [r10-32*95] ; in5
    mova        m1, [r10-32* 1] ; in27
    mova        m2, [r10-32*31] ; in21
    mova        m3, [r10-32*65] ; in11
    mova        m4, [r10-32*63] ; in13
    mova        m5, [r10-32*33] ; in19
    mova        m6, [r10+32* 1] ; in29
    mova        m7, [r10-32*97] ; in3
    add         r6, 8
    add         r4, 32*8
    sub         r5, 32*8
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
    add         r10, 32*8
    sub         dstq, r8
    sub         r4, 32*44
    add         dstq, 32
    cmp         r10, r4
    jl .pass2_loop
    RET
ALIGN function_align
.main:
    lea         r5, [idct64_mul_16bpc]
    mova        m0, [cq+128* 1]
    mova        m1, [cq+128*31]
    mova        m2, [cq+128*17]
    mova        m3, [cq+128*15]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova        m0, [cq+128* 7]
    mova        m1, [cq+128*25]
    mova        m2, [cq+128*23]
    mova        m3, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova        m0, [cq+128* 5]
    mova        m1, [cq+128*27]
    mova        m2, [cq+128*21]
    mova        m3, [cq+128*11]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova        m0, [cq+128* 3]
    mova        m1, [cq+128*29]
    mova        m2, [cq+128*19]
    mova        m3, [cq+128*13]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
    mova        m0, [cq+128* 2]
    mova        m1, [cq+128*14]
    mova        m2, [cq+128*18]
    mova        m3, [cq+128*30]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
    mova        m0, [cq+128* 6]
    mova        m1, [cq+128*10]
    mova        m2, [cq+128*22]
    mova        m3, [cq+128*26]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
    mova        m0, [cq+128* 4]
    mova        m1, [cq+128*12]
    mova        m2, [cq+128*20]
    mova        m3, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
    mova        m0, [cq+128* 0]
    mova        m1, [cq+128* 8]
    mova        m2, [cq+128*16]
    mova        m3, [cq+128*24]
    pxor        m15, m15
    mov         r7d, 128*29
.main_zero_loop:
    mova        [cq+r7-128*1], m15
    mova        [cq+r7+128*0], m15
    mova        [cq+r7+128*1], m15
    mova        [cq+r7+128*2], m15
    sub         r7d, 128*4
    jg .main_zero_loop
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose

%endif ; ARCH_X86_64