; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table
  %rotate 1
 %endrep
%endmacro

%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro

SECTION_RODATA 32

pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
               dd 0x80, 0x00, 0x00
blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
               dd 0x00, 0x00
blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000
blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000, 0x0000
blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_128:        times 2 dw 128
pw_2048:       times 2 dw 2048
tap_table:     ; masks for 8 bit shifts
               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
               ; weights
               db  4,  2,  3,  3,  2,  1
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
               db  1 * 16 + 0,  2 * 16 + 0
               db  1 * 16 + 0,  2 * 16 - 1
               ; the last 6 are repeats of the first 6 so we don't need to & 7
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
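
; Editor's note on the tap_table layout (offsets inferred from the loads
; below): bytes 0-7 are the shift masks (0xFF >> shift), bytes 8-11 the
; primary taps for even/odd strengths ({4, 2} and {3, 3}), bytes 12-13 the
; secondary taps {2, 1}, and byte 14 onwards the per-direction offset pairs.
; Each offset byte encodes y*16+x; the border path scales it by 2, giving
; y*32+x*2, a direct displacement into the 32-byte-pitch 16-bit buffer used
; there. E.g. db -1 * 16 + 1 is the neighbor at (dx, dy) = (+1, -1).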

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8

SECTION .text

%macro PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m
    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea           dirq, [tableq+dirq*2*4]
%if %1 == 4
 %if %2 == 4
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
                table, dir, dirjmp, stride3, k
 %else
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
                table, dir, dirjmp, dst4, stride3, k
    lea          dst4q, [dstq+strideq*4]
 %endif
%else
    DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
                table, dir, dirjmp, top2, stride3, k
    mov             hq, -8
    lea          top1q, [top1q+strideq*0]
    lea          top2q, [top1q+strideq*1]
%endif
%if %1 == 4
    lea       stride3q, [strideq*3]
%endif
%endmacro

%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
    pxor           m15, m15 ; sum
%if %2 == 8
    pxor           m12, m12
 %if %1 == 4
    movd           xm4, [dstq +strideq*0]
    movd           xm6, [dstq +strideq*1]
    movd           xm5, [dstq +strideq*2]
    movd           xm7, [dstq +stride3q ]
    vinserti128     m4, [dst4q+strideq*0], 1
    vinserti128     m6, [dst4q+strideq*1], 1
    vinserti128     m5, [dst4q+strideq*2], 1
    vinserti128     m7, [dst4q+stride3q ], 1
    punpckldq       m4, m6
    punpckldq       m5, m7
 %else
    movq           xm4, [dstq+strideq*0]
    movq           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq      m4, m5
%else
    movd           xm4, [dstq+strideq*0]
    movd           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
    punpckldq       m4, m5
%endif
%if %3 == 1
    mova            m7, m4 ; max (pmaxub below accumulates into m7)
    mova            m8, m4 ; min (pminub below accumulates into m8)
%endif
%endmacro

%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
    add        dirjmpq, tableq
    call       dirjmpq

%if %8 == 1
    pmaxub          m7, m5
    pminub          m8, m5
    pmaxub          m7, m6
    pminub          m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    punpcklbw       m5, m6
    punpcklbw       m6, m4, m4
    psubusb         m9, m5, m6
    psubusb         m5, m6, m5
    por             m9, m5     ; abs_diff_p01(p01 - px)
    pcmpeqb         m5, m9
    por             m5, %5
    psignb          m6, %5, m5
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m6
    paddw          m15, m5
%else
    psubusb         m9, m5, m4
    psubusb         m5, m4, m5
    psubusb        m11, m6, m4
    psubusb         m6, m4, m6
    por             m9, m5      ; abs_diff_p0(p0 - px)
    por            m11, m6      ; abs_diff_p1(p1 - px)
    pcmpeqb         m5, m9
    pcmpeqb         m6, m11
    punpckhbw      m10, m9, m11
    punpcklbw       m9, m11
    por             m5, %5
    por            m11, m6, %5
    punpckhbw       m6, m5, m11
    punpcklbw       m5, m11
    psignb         m11, %5, m6
    psrlw           m6, m10, %2 ; emulate 8-bit shift
    pand            m6, %3
    psubusb         m6, %4, m6
    pminub          m6, m10
    pmaddubsw       m6, m11
    paddw          m12, m6
    psignb         m11, %5, m5
    psrlw           m5, m9, %2  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m11
    paddw          m15, m5
%endif
%endmacro
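
; Editor's sketch of the scalar math ACCUMULATE_TAP_BYTE implements (the
; CDEF constrain step; function names here are illustrative only):
;
;   int constrain(int diff, int strength, int shift) {
;       int adiff = abs(diff);
;       int c = imin(adiff, imax(0, strength - (adiff >> shift)));
;       return diff < 0 ? -c : c;
;   }
;   sum += tap * constrain(p - px, strength, shift);
;
; where shift = damping - log2(strength), computed with lzcnt in the
; prologue below. AVX2 has no per-byte shift, so adiff >> shift is done as
; a 16-bit psrlw followed by pand with the (0xFF >> shift) mask from
; tap_table; psubusb provides the max(0, strength - x) saturation for free,
; and psignb moves the sign of diff onto the tap so a single pmaddubsw both
; multiplies and accumulates the pair.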

%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw       m4, %3
 %endif
    pcmpgtw         %3, m15
    paddw          m15, %3
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m15
    paddb           m4, m15
 %else
    paddw           m4, m15
    packuswb        m4, m4 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    pcmpgtw         m6, %3, m12
    pcmpgtw         m5, %3, m15
    paddw          m12, m6
    paddw          m15, m5
 %if %5 == 1
    punpckhbw       m5, m4, %3
    punpcklbw       m4, %3
 %endif
    pmulhrsw       m12, %4
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m12
    paddb           m4, m15
 %else
    paddw           m5, m12
    paddw           m4, m15
    packuswb        m4, m5 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
 %if %1 == 4
    movd   [dstq +strideq*0], xm4
    movd   [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro

%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m
    lea           dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov             hd, %1*%2*2/mmsize
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea           stkq, [px]
    pxor           m11, m11
%endmacro

%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
%if %1 == 4
    movq           xm4, [stkq+32*0]
    movhps         xm4, [stkq+32*1]
    movq           xm5, [stkq+32*2]
    movhps         xm5, [stkq+32*3]
    vinserti128     m4, xm5, 1
%else
    mova           xm4, [stkq+32*0] ; px
    vinserti128     m4, [stkq+32*1], 1
%endif
    pxor           m15, m15         ; sum
%if %3 == 1
    mova            m7, m4          ; max
    mova            m8, m4          ; min
%endif
%endmacro
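
; Editor's note on the border path: the BORDER_* macros work on an
; intermediate buffer of 16-bit pixels (px, allocated in .border_block
; below) with a fixed pitch of 32 bytes per row and 2 pixels of padding on
; every side, roughly:
;
;   px - 2*32 .. px - 1*32      two rows of top padding
;   px + 0*32 .. px + (h-1)*32  the block itself, one u16 per pixel
;   px + h*32 .. px + (h+1)*32  two rows of bottom padding
;   [row - 4] / [row + w*2]     left/right padding inside each row
;
; Neighbors that are unavailable are filled with the 0x8000 sentinel.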

%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1]  ; off1
%if %6 == 4
    movq           xm5, [stkq+offq*2+32*0] ; p0
    movq           xm6, [stkq+offq*2+32*2]
    movhps         xm5, [stkq+offq*2+32*1]
    movhps         xm6, [stkq+offq*2+32*3]
    vinserti128     m5, xm6, 1
%else
    movu           xm5, [stkq+offq*2+32*0] ; p0
    vinserti128     m5, [stkq+offq*2+32*1], 1
%endif
    neg           offq                     ; -off1
%if %6 == 4
    movq           xm6, [stkq+offq*2+32*0] ; p1
    movq           xm9, [stkq+offq*2+32*2]
    movhps         xm6, [stkq+offq*2+32*1]
    movhps         xm9, [stkq+offq*2+32*3]
    vinserti128     m6, xm9, 1
%else
    movu           xm6, [stkq+offq*2+32*0] ; p1
    vinserti128     m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out-of-bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5 ; max after p0
    pminuw          m8, m5 ; min after p0
    pmaxsw          m7, m6 ; max after p1
    pminuw          m8, m6 ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw           m5, m4 ; diff_p0(p0 - px)
    psubw           m6, m4 ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb        m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb          m5, m12
    pabsb           m9, m5
    psignb         m10, %5, m5
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5

    ; use unsigned min since abs diff can equal 0x80
    pminub          m5, m9
    pmaddubsw       m5, m10
    paddw          m15, m5
%endmacro

%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw         m9, m11, m15
    paddw          m15, m9
    pmulhrsw       m15, %2
    paddw           m4, m15
%if %3 == 1
    pminsw          m4, m7
    pmaxsw          m4, m8
%endif
    packuswb        m4, m4
    vextracti128   xm5, m4, 1
%if %1 == 4
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+stride3q ], xm5, 1
%else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*1], xm5
%endif
%endmacro
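
; Editor's notes on the two tricks above:
;
; 1) The 0x8000 sentinel reads as -32768 signed but 32768 unsigned, so
;    pmaxsw (signed max) and pminuw (unsigned min) both reject it and
;    padding never widens the clip range. E.g. with real pixels {100, 200}:
;    max(-32768, 100, 200) = 200 and min(32768, 100, 200) = 100.
;
; 2) The rounding in BORDER_ADJUST_PIXEL: pmulhrsw computes
;    (x*2048 + 0x4000) >> 15 = (x + 8) >> 4, and the preceding
;    pcmpgtw/paddw pair first subtracts 1 from negative sums, which gives
;    CDEF's (sum + 8 - (sum < 0)) >> 4 rounding exactly.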

%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
                                          pri, sec, dir, damping, edge
    mov          edged, edgem
    cmp          edged, 0xf
    jne .border_block

    PUSH           r11
    PUSH           r12
%if %2 == 4
 %assign regs_used 13
    ALLOC_STACK 0x60, 16
    pmovzxbw       xm0, [leftq+1]
    vpermq          m0, m0, q0110
    psrldq          m1, m0, 4
    vpalignr        m2, m0, m0, 12
    movu    [rsp+0x10], m0
    movu    [rsp+0x28], m1
    movu    [rsp+0x40], m2
%elif %1 == 4
 %assign regs_used 14
    PUSH           r13
    ALLOC_STACK 8*2+%1*%2*1, 16
    pmovzxwd        m0, [leftq]
    mova    [rsp+0x10], m0
%else
 %assign regs_used 15
    PUSH           r13
    PUSH           r14
    ALLOC_STACK 8*4+%1*%2*2+32, 16
    lea            r11, [strideq*3]
    movu           xm4, [dstq+strideq*2]
    pmovzxwq        m0, [leftq+0]
    pmovzxwq        m1, [leftq+8]
    vinserti128     m4, [dstq+r11], 1
    pmovzxbd        m2, [leftq+1]
    pmovzxbd        m3, [leftq+9]
    mov       [rsp+16], botq
    mova    [rsp+0x20], m0
    mova    [rsp+0x40], m1
    mova    [rsp+0x60], m2
    mova    [rsp+0x80], m3
    mova    [rsp+0xa0], m4
    lea           botq, [dstq+strideq*4]
%endif

    DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
    mov       dampingd, r8m
    xor          zerod, zerod
    movifnidn     prid, prim
    sub       dampingd, 31
    movifnidn  secdmpd, secdmpm
    test          prid, prid
    jz .sec_only
    movd           xm0, prid
    lzcnt      pridmpd, prid
    add        pridmpd, dampingd
    cmovs      pridmpd, zerod
    mov        [rsp+0], pridmpq ; pri_shift
    test       secdmpd, secdmpd
    jz .pri_only
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd
    mov        [rsp+8], secdmpq ; sec_shift

    DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb   m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
    vpbroadcastb    m0, xm0 ; pri_strength
    vpbroadcastb    m1, xm1 ; sec_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    lea           secq, [tableq+12]       ; sec_taps

    PREP_REGS %1, %2
%if %1*%2 > mmsize
.v_loop:
%endif
    LOAD_BLOCK %1, %2, 1
.k_loop:
    vpbroadcastb    m2, [priq+kq] ; pri_taps
    vpbroadcastb    m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
    dec             kq
    jge .k_loop

    vpbroadcastd   m10, [pw_2048]
    pxor            m9, m9
    ADJUST_PIXEL %1, %2, m9, m10, 1
%if %1*%2 > mmsize
    lea           dstq, [dstq+strideq*4]
    lea          top1q, [rsp+0xa0]
    lea          top2q, [rsp+0xb0]
    mov           botq, [rsp+16]
    add             hq, 4
    jl .v_loop
%endif
    RET

.pri_only:
    DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq] ; pri_shift_mask
    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
    vpbroadcastb    m0, xm0 ; pri_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    PREP_REGS %1, %2
    vpbroadcastd    m3, [pw_2048]
    pxor            m1, m1
%if %1*%2 > mmsize
.pri_v_loop:
%endif
    LOAD_BLOCK %1, %2
.pri_k_loop:
    vpbroadcastb    m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
    dec             kq
    jge .pri_k_loop
    ADJUST_PIXEL %1, %2, m1, m3
%if %1*%2 > mmsize
    lea           dstq, [dstq+strideq*4]
    lea          top1q, [rsp+0xa0]
    lea          top2q, [rsp+0xb0]
    mov           botq, [rsp+16]
    add             hq, 4
    jl .pri_v_loop
%endif
    RET

.sec_only:
    DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd
    mov        [rsp+8], secdmpq ; sec_shift
    DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
    lea         tableq, [tap_table]
    vpbroadcastb   m14, [tableq+secdmpq] ; sec_shift_mask
    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
    vpbroadcastb    m1, xm1 ; sec_strength
    lea           secq, [tableq+12] ; sec_taps
    PREP_REGS %1, %2
    vpbroadcastd    m2, [pw_2048]
    pxor            m0, m0
%if %1*%2 > mmsize
.sec_v_loop:
%endif
    LOAD_BLOCK %1, %2
.sec_k_loop:
    vpbroadcastb    m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
    dec             kq
    jge .sec_k_loop
    ADJUST_PIXEL %1, %2, m0, m2
%if %1*%2 > mmsize
    lea           dstq, [dstq+strideq*4]
    lea          top1q, [rsp+0xa0]
    lea          top2q, [rsp+0xb0]
    mov           botq, [rsp+16]
    add             hq, 4
    jl .sec_v_loop
%endif
    RET
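
; Editor's note on the .d<dir>k<k> handlers below: ACCUMULATE_TAP_BYTE
; reaches them with "call dirjmpq" through the per-block-size jump table;
; each handler gathers the two neighbor rows for one direction/tap pair
; into m5 and m6 (blending in stack-saved left pixels and the top/bottom
; rows where needed) and returns into the accumulation code. The table
; holds 24 entries - d6k0..d7k1, then d0k0..d7k1, then d0k0..d1k1 - so
; dir-2, dir+0 and dir+2 can all be indexed directly without an & 7 wrap.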

.d0k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq    m6, [dstq+strideq*1-1]
    vpbroadcastq   m10, [dstq+strideq*2-1]
    movd           xm5, [topq+strideq*1+1]
    movd           xm9, [dstq+strideq*0+1]
    psrldq         m11, m6, 2
    psrldq         m12, m10, 2
    vinserti128     m6, [dstq+stride3q -1], 1
    vinserti128    m10, [botq           -1], 1
    vpblendd        m5, m11, 0x10
    vpblendd        m9, m12, 0x10
    movu           m11, [blend_4x4+16]
    punpckldq       m6, m10
    punpckldq       m5, m9
    vpblendvb       m6, [rsp+gprsize+0x28], m11
 %else
    movd           xm5, [topq +strideq*1+1]
    movq           xm6, [dstq +strideq*1-1]
    movq          xm10, [dstq +stride3q -1]
    movq          xm11, [dst4q+strideq*1-1]
    pinsrd         xm5, [dstq +strideq*0+1], 1
    movhps         xm6, [dstq +strideq*2-1]
    movhps        xm10, [dst4q+strideq*0-1]
    movhps        xm11, [dst4q+strideq*2-1]
    psrldq         xm9, xm6, 2
    shufps         xm5, xm9, q2010  ; -1 +0 +1 +2
    shufps         xm6, xm10, q2020 ; +1 +2 +3 +4
    psrldq         xm9, xm11, 2
    psrldq        xm10, 2
    shufps        xm10, xm9, q2020  ; +3 +4 +5 +6
    movd           xm9, [dst4q+stride3q -1]
    pinsrd         xm9, [botq           -1], 1
    shufps        xm11, xm9, q1020  ; +5 +6 +7 +8
    pmovzxbw        m9, [leftq+3]
    vinserti128     m6, xm11, 1
    movu           m11, [blend_4x8_0+4]
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m9, m11
 %endif
%else
    lea            r13, [blend_8x8_0+16]
    movq           xm5, [top2q         +1]
    vbroadcasti128 m10, [dstq+strideq*1-1]
    vbroadcasti128 m11, [dstq+strideq*2-1]
    movhps         xm5, [dstq+strideq*0+1]
    vinserti128     m6, m10, [dstq+stride3q-1], 1
    vinserti128     m9, m11, [botq         -1], 1
    psrldq         m10, 2
    psrldq         m11, 2
    punpcklqdq      m6, m9
    movu            m9, [r13+hq*2*1+16*1]
    punpcklqdq     m10, m11
    vpblendd        m5, m10, 0xF0
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
%endif
    ret
.d1k0:
.d2k0:
.d3k0:
%if %1 == 4
 %if %2 == 4
    movq           xm6, [dstq+strideq*0-1]
    movq           xm9, [dstq+strideq*1-1]
    vinserti128     m6, [dstq+strideq*2-1], 1
    vinserti128     m9, [dstq+stride3q -1], 1
    movu           m11, [rsp+gprsize+0x10]
    pcmpeqd        m12, m12
    psrldq          m5, m6, 2
    psrldq         m10, m9, 2
    psrld          m12, 24
    punpckldq       m6, m9
    punpckldq       m5, m10
    vpblendvb       m6, m11, m12
 %else
    movq           xm6, [dstq +strideq*0-1]
    movq           xm9, [dstq +strideq*2-1]
    movhps         xm6, [dstq +strideq*1-1]
    movhps         xm9, [dstq +stride3q -1]
    movq          xm10, [dst4q+strideq*0-1]
    movhps        xm10, [dst4q+strideq*1-1]
    psrldq         xm5, xm6, 2
    psrldq        xm11, xm9, 2
    shufps         xm5, xm11, q2020
    movq          xm11, [dst4q+strideq*2-1]
    movhps        xm11, [dst4q+stride3q -1]
    shufps         xm6, xm9, q2020
    shufps         xm9, xm10, xm11, q2020
    vinserti128     m6, xm9, 1
    pmovzxbw        m9, [leftq+1]
    psrldq        xm10, 2
    psrldq        xm11, 2
    shufps        xm10, xm11, q2020
    vpbroadcastd   m11, [blend_4x8_0+4]
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m9, m11
 %endif
%else
    movu           xm5, [dstq+strideq*0-1]
    movu           xm9, [dstq+strideq*1-1]
    vinserti128     m5, [dstq+strideq*2-1], 1
    vinserti128     m9, [dstq+stride3q -1], 1
    movu           m10, [blend_8x8_0+16]
    punpcklqdq      m6, m5, m9
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64], m10
    psrldq          m5, 2
    psrldq          m9, 2
    punpcklqdq      m5, m9
%endif
    ret
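
; Editor's note: handlers that read one pixel to the left of the block
; cannot load those pixels from dst (the left neighbors may already have
; been overwritten by earlier in-place filtering), so the prologue stashed
; them on the stack and the blend_4x4/blend_4x8_*/blend_8x8_* masks select
; per byte whether the stacked copy or the freshly loaded row supplies a
; lane - vpblendvb takes its second source where the mask byte's sign bit
; (0x80) is set.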

.d4k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq   m10, [dstq+strideq*1-1]
    vpbroadcastq   m11, [dstq+strideq*2-1]
    movd           xm6, [topq+strideq*1-1]
    movd           xm9, [dstq+strideq*0-1]
    psrldq          m5, m10, 2
    psrldq         m12, m11, 2
    vpblendd        m6, m10, 0x10
    vpblendd        m9, m11, 0x10
    movu           m10, [blend_4x4]
    vinserti128     m5, [dstq+stride3q +1], 1
    vinserti128    m12, [botq           +1], 1
    punpckldq       m6, m9
    punpckldq       m5, m12
    vpblendvb       m6, [rsp+gprsize+0x40], m10
 %else
    movd           xm6, [topq +strideq*1-1]
    movq           xm9, [dstq +strideq*1-1]
    movq          xm10, [dstq +stride3q -1]
    movq          xm11, [dst4q+strideq*1-1]
    pinsrd         xm6, [dstq +strideq*0-1], 1
    movhps         xm9, [dstq +strideq*2-1]
    movhps        xm10, [dst4q+strideq*0-1]
    movhps        xm11, [dst4q+strideq*2-1]
    psrldq         xm5, xm9, 2
    shufps         xm6, xm9, q2010
    psrldq         xm9, xm10, 2
    shufps         xm5, xm9, q2020
    shufps        xm10, xm11, q2020
    movd           xm9, [dst4q+stride3q +1]
    vinserti128     m6, xm10, 1
    pinsrd         xm9, [botq           +1], 1
    psrldq        xm11, 2
    pmovzxbw       m10, [leftq-1]
    shufps        xm11, xm9, q1020
    movu            m9, [blend_4x8_0]
    vinserti128     m5, xm11, 1
    vpblendvb       m6, m10, m9
 %endif
%else
    lea            r13, [blend_8x8_0+8]
    movq           xm6, [top2q         -1]
    vbroadcasti128  m5, [dstq+strideq*1-1]
    vbroadcasti128  m9, [dstq+strideq*2-1]
    movhps         xm6, [dstq+strideq*0-1]
    movu           m11, [r13+hq*2*1+16*1]
    punpcklqdq     m10, m5, m9
    vinserti128     m5, [dstq+stride3q -1], 1
    vinserti128     m9, [botq           -1], 1
    vpblendd        m6, m10, 0xF0
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
    psrldq          m5, 2
    psrldq          m9, 2
    punpcklqdq      m5, m9
%endif
    ret
.d5k0:
.d6k0:
.d7k0:
%if %1 == 4
 %if %2 == 4
    movd           xm6, [topq+strideq*1  ]
    vpbroadcastd    m5, [dstq+strideq*1  ]
    vpbroadcastd    m9, [dstq+strideq*2  ]
    vpblendd       xm6, [dstq+strideq*0-4], 0x2
    vpblendd        m5, m9, 0x22
    vpblendd        m6, m5, 0x30
    vinserti128     m5, [dstq+stride3q  ], 1
    vpblendd        m5, [botq         -20], 0x20
 %else
    movd           xm6, [topq +strideq*1]
    movd           xm5, [dstq +strideq*1]
    movd           xm9, [dstq +stride3q ]
    movd          xm10, [dst4q+strideq*1]
    movd          xm11, [dst4q+stride3q ]
    pinsrd         xm6, [dstq +strideq*0], 1
    pinsrd         xm5, [dstq +strideq*2], 1
    pinsrd         xm9, [dst4q+strideq*0], 1
    pinsrd        xm10, [dst4q+strideq*2], 1
    pinsrd        xm11, [botq           ], 1
    punpcklqdq     xm6, xm5
    punpcklqdq     xm5, xm9
    punpcklqdq     xm9, xm10
    punpcklqdq    xm10, xm11
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
 %endif
%else
    movq           xm6, [top2q         ]
    movq           xm5, [dstq+strideq*1]
    movq           xm9, [dstq+stride3q ]
    movhps         xm6, [dstq+strideq*0]
    movhps         xm5, [dstq+strideq*2]
    movhps         xm9, [botq          ]
    vinserti128     m6, xm5, 1
    vinserti128     m5, xm9, 1
%endif
    ret
.d0k1:
%if %1 == 4
 %if %2 == 4
    movd           xm6, [dstq+strideq*2-2]
    movd           xm9, [dstq+stride3q -2]
    movd           xm5, [topq+strideq*0+2]
    movd          xm10, [topq+strideq*1+2]
    pinsrw         xm6, [leftq+4], 0
    pinsrw         xm9, [leftq+6], 0
    vinserti128     m5, [dstq+strideq*0+2], 1
    vinserti128    m10, [dstq+strideq*1+2], 1
    vinserti128     m6, [botq+strideq*0-2], 1
    vinserti128     m9, [botq+strideq*1-2], 1
    punpckldq       m5, m10
    punpckldq       m6, m9
 %else
    movq           xm6, [dstq +strideq*2-2]
    movd          xm10, [dst4q+strideq*2-2]
    movd           xm5, [topq +strideq*0+2]
    movq           xm9, [dst4q+strideq*0-2]
    movhps         xm6, [dstq +stride3q -2]
    pinsrw        xm10, [dst4q+stride3q   ], 3
    pinsrd         xm5, [topq +strideq*1+2], 1
    movhps         xm9, [dst4q+strideq*1-2]
    pinsrd        xm10, [botq +strideq*0-2], 2
    pinsrd         xm5, [dstq +strideq*0+2], 2
    pinsrd        xm10, [botq +strideq*1-2], 3
    pinsrd         xm5, [dstq +strideq*1+2], 3
    shufps        xm11, xm6, xm9, q3131
    shufps         xm6, xm9, q2020
    movu            m9, [blend_4x8_3+8]
    vinserti128     m6, xm10, 1
    vinserti128     m5, xm11, 1
    vpblendvb       m6, [rsp+gprsize+0x10+8], m9
 %endif
%else
    lea            r13, [blend_8x8_1+16]
    movq           xm6, [dstq+strideq*2-2]
    movq           xm9, [dstq+stride3q -2]
    movq           xm5, [top1q         +2]
    movq          xm10, [top2q         +2]
    movu           m11, [r13+hq*2*2+16*2]
    vinserti128     m6, [botq+strideq*0-2], 1
    vinserti128     m9, [botq+strideq*1-2], 1
    vinserti128     m5, [dstq+strideq*0+2], 1
    vinserti128    m10, [dstq+strideq*1+2], 1
    punpcklqdq      m6, m9
    punpcklqdq      m5, m10
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
%endif
    ret

.d1k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq    m6, [dstq+strideq*1-2]
    vpbroadcastq    m9, [dstq+strideq*2-2]
    movd           xm5, [topq+strideq*1+2]
    movd          xm10, [dstq+strideq*0+2]
    psrldq         m11, m6, 4
    psrldq         m12, m9, 4
    vpblendd        m5, m11, 0x10
    movq          xm11, [leftq+2]
    vinserti128     m6, [dstq+stride3q-2], 1
    punpckldq     xm11, xm11
    vpblendd       m10, m12, 0x10
    pcmpeqd        m12, m12
    pmovzxwd       m11, xm11
    psrld          m12, 16
    punpckldq       m6, m9
    vpbroadcastd    m9, [botq-2]
    vpblendvb       m6, m11, m12
    punpckldq       m5, m10
    vpblendd        m6, m9, 0x20
 %else
    movd           xm5, [topq +strideq*1+2]
    movq           xm6, [dstq +strideq*1-2]
    movq           xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q -2]
    pinsrd         xm5, [dstq +strideq*0+2], 1
    movhps         xm6, [dstq +strideq*2-2]
    movhps         xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           -2], 1
    shufps         xm5, xm6, q3110
    shufps         xm6, xm9, q2020
    shufps         xm9, xm10, q3131
    shufps        xm10, xm11, q1020
    movu           m11, [blend_4x8_2+4]
    vinserti128     m6, xm10, 1
    vinserti128     m5, xm9, 1
    vpblendvb       m6, [rsp+gprsize+0x10+4], m11
 %endif
%else
    lea            r13, [blend_8x8_1+16]
    movq           xm5, [top2q         +2]
    vbroadcasti128  m6, [dstq+strideq*1-2]
    vbroadcasti128  m9, [dstq+strideq*2-2]
    movhps         xm5, [dstq+strideq*0+2]
    shufps         m10, m6, m9, q2121
    vinserti128     m6, [dstq+stride3q -2], 1
    vinserti128     m9, [botq           -2], 1
    movu           m11, [r13+hq*2*1+16*1]
    vpblendd        m5, m10, 0xF0
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
%endif
    ret
.d2k1:
%if %1 == 4
 %if %2 == 4
    movq          xm11, [leftq]
    movq           xm6, [dstq+strideq*0-2]
    movq           xm9, [dstq+strideq*1-2]
    vinserti128     m6, [dstq+strideq*2-2], 1
    vinserti128     m9, [dstq+stride3q -2], 1
    punpckldq     xm11, xm11
    psrldq          m5, m6, 4
    psrldq         m10, m9, 4
    pmovzxwd       m11, xm11
    punpckldq       m6, m9
    punpckldq       m5, m10
    pblendw         m6, m11, 0x05
 %else
    movq           xm5, [dstq +strideq*0-2]
    movq           xm9, [dstq +strideq*2-2]
    movq          xm10, [dst4q+strideq*0-2]
    movq          xm11, [dst4q+strideq*2-2]
    movhps         xm5, [dstq +strideq*1-2]
    movhps         xm9, [dstq +stride3q -2]
    movhps        xm10, [dst4q+strideq*1-2]
    movhps        xm11, [dst4q+stride3q -2]
    shufps         xm6, xm5, xm9, q2020
    shufps         xm5, xm9, q3131
    shufps         xm9, xm10, xm11, q2020
    shufps        xm10, xm11, q3131
    pmovzxwd       m11, [leftq]
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    pblendw         m6, m11, 0x55
 %endif
%else
    mova           m11, [rsp+gprsize+0x20+hq*8+64]
    movu           xm5, [dstq+strideq*0-2]
    movu           xm9, [dstq+strideq*1-2]
    vinserti128     m5, [dstq+strideq*2-2], 1
    vinserti128     m9, [dstq+stride3q -2], 1
    shufps          m6, m5, m9, q1010
    shufps          m5, m9, q2121
    pblendw         m6, m11, 0x11
%endif
    ret

.d3k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq   m11, [dstq+strideq*1-2]
    vpbroadcastq   m12, [dstq+strideq*2-2]
    movd           xm6, [topq+strideq*1-2]
    movd           xm9, [dstq+strideq*0-2]
    pblendw        m11, [leftq-16+2], 0x01
    pblendw        m12, [leftq-16+4], 0x01
    pinsrw         xm9, [leftq- 0+0], 0
    psrldq          m5, m11, 4
    psrldq         m10, m12, 4
    vinserti128     m5, [dstq+stride3q +2], 1
    vinserti128    m10, [botq           +2], 1
    vpblendd        m6, m11, 0x10
    vpblendd        m9, m12, 0x10
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    movd           xm6, [topq +strideq*1-2]
    movq           xm5, [dstq +strideq*1-2]
    movq           xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q +2]
    pinsrw         xm6, [dstq +strideq*0  ], 3
    movhps         xm5, [dstq +strideq*2-2]
    movhps         xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           +2], 1
    shufps         xm6, xm5, q2010
    shufps         xm5, xm9, q3131
    shufps         xm9, xm10, q2020
    shufps        xm10, xm11, q1031
    movu           m11, [blend_4x8_2]
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-4], m11
 %endif
%else
    lea            r13, [blend_8x8_1+8]
    movq           xm6, [top2q         -2]
    vbroadcasti128  m5, [dstq+strideq*1-2]
    vbroadcasti128 m10, [dstq+strideq*2-2]
    movhps         xm6, [dstq+strideq*0-2]
    punpcklqdq      m9, m5, m10
    vinserti128     m5, [dstq+stride3q -2], 1
    vinserti128    m10, [botq           -2], 1
    movu           m11, [r13+hq*2*1+16*1]
    vpblendd        m6, m9, 0xF0
    shufps          m5, m10, q2121
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
%endif
    ret
.d4k1:
%if %1 == 4
 %if %2 == 4
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    movd           xm5, [dstq+strideq*2+2]
    movd          xm10, [dstq+stride3q +2]
    pblendw         m6, [leftq-16+0], 0x01
    pblendw         m9, [leftq-16+2], 0x01
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128    m10, [botq+strideq*1+2], 1
    vpblendd        m6, [topq+strideq*0-2], 0x01
    vpblendd        m9, [topq+strideq*1-2], 0x01
    punpckldq       m5, m10
    punpckldq       m6, m9
 %else
    movd           xm6, [topq +strideq*0-2]
    movq           xm5, [dstq +strideq*2-2]
    movq           xm9, [dst4q+strideq*0-2]
    movd          xm10, [dst4q+strideq*2+2]
    pinsrd         xm6, [topq +strideq*1-2], 1
    movhps         xm5, [dstq +stride3q -2]
    movhps         xm9, [dst4q+strideq*1-2]
    pinsrd        xm10, [dst4q+stride3q +2], 1
    pinsrd         xm6, [dstq +strideq*0-2], 2
    pinsrd        xm10, [botq +strideq*0+2], 2
    pinsrd         xm6, [dstq +strideq*1-2], 3
    pinsrd        xm10, [botq +strideq*1+2], 3
    shufps        xm11, xm5, xm9, q2020
    shufps         xm5, xm9, q3131
    movu            m9, [blend_4x8_3]
    vinserti128     m6, xm11, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-8], m9
 %endif
%else
    lea            r13, [blend_8x8_1]
    movu           m11, [r13+hq*2*2+16*2]
    movq           xm6, [top1q         -2]
    movq           xm9, [top2q         -2]
    movq           xm5, [dstq+strideq*2+2]
    movq          xm10, [dstq+stride3q +2]
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128    m10, [botq+strideq*1+2], 1
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
    punpcklqdq      m5, m10
%endif
    ret
.d5k1:
%if %1 == 4
 %if %2 == 4
    movd           xm6, [topq+strideq*0-1]
    movd           xm9, [topq+strideq*1-1]
    movd           xm5, [dstq+strideq*2+1]
    movd          xm10, [dstq+stride3q +1]
    pcmpeqd        m12, m12
    pmovzxbw       m11, [leftq-8+1]
    psrld          m12, 24
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128    m10, [botq+strideq*1+1], 1
    punpckldq       m6, m9
    pxor            m9, m9
    vpblendd       m12, m9, 0x0F
    punpckldq       m5, m10
    vpblendvb       m6, m11, m12
 %else
    movd           xm6, [topq +strideq*0-1]
    movq           xm5, [dstq +strideq*2-1]
    movq           xm9, [dst4q+strideq*0-1]
    movd          xm10, [dst4q+strideq*2+1]
    pinsrd         xm6, [topq +strideq*1-1], 1
    movhps         xm5, [dstq +stride3q -1]
    movhps         xm9, [dst4q+strideq*1-1]
    pinsrd        xm10, [dst4q+stride3q +1], 1
    pinsrd         xm6, [dstq +strideq*0-1], 2
    pinsrd        xm10, [botq +strideq*0+1], 2
    pinsrd         xm6, [dstq +strideq*1-1], 3
    pinsrd        xm10, [botq +strideq*1+1], 3
    shufps        xm11, xm5, xm9, q2020
    vinserti128     m6, xm11, 1
    pmovzxbw       m11, [leftq-3]
    psrldq         xm5, 2
    psrldq         xm9, 2
    shufps         xm5, xm9, q2020
    movu            m9, [blend_4x8_1]
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m11, m9
 %endif
%else
    lea            r13, [blend_8x8_0]
    movu           m11, [r13+hq*2*2+16*2]
    movq           xm6, [top1q         -1]
    movq           xm9, [top2q         -1]
    movq           xm5, [dstq+strideq*2+1]
    movq          xm10, [dstq+stride3q +1]
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128    m10, [botq+strideq*1+1], 1
    punpcklqdq      m6, m9
    punpcklqdq      m5, m10
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
%endif
    ret

.d6k1:
%if %1 == 4
 %if %2 == 4
    movd           xm6, [topq+strideq*0]
    movd           xm9, [topq+strideq*1]
    movd           xm5, [dstq+strideq*2]
    movd          xm10, [dstq+stride3q ]
    vinserti128     m6, [dstq+strideq*0], 1
    vinserti128     m9, [dstq+strideq*1], 1
    vinserti128     m5, [botq+strideq*0], 1
    vinserti128    m10, [botq+strideq*1], 1
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    movd           xm5, [dstq +strideq*2]
    movd           xm6, [topq +strideq*0]
    movd           xm9, [dst4q+strideq*2]
    pinsrd         xm5, [dstq +stride3q ], 1
    pinsrd         xm6, [topq +strideq*1], 1
    pinsrd         xm9, [dst4q+stride3q ], 1
    pinsrd         xm5, [dst4q+strideq*0], 2
    pinsrd         xm6, [dstq +strideq*0], 2
    pinsrd         xm9, [botq +strideq*0], 2
    pinsrd         xm5, [dst4q+strideq*1], 3
    pinsrd         xm6, [dstq +strideq*1], 3
    pinsrd         xm9, [botq +strideq*1], 3
    vinserti128     m6, xm5, 1
    vinserti128     m5, xm9, 1
 %endif
%else
    movq           xm5, [dstq+strideq*2]
    movq           xm9, [botq+strideq*0]
    movq           xm6, [top1q         ]
    movq          xm10, [dstq+strideq*0]
    movhps         xm5, [dstq+stride3q ]
    movhps         xm9, [botq+strideq*1]
    movhps         xm6, [top2q         ]
    movhps        xm10, [dstq+strideq*1]
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
%endif
    ret
.d7k1:
%if %1 == 4
 %if %2 == 4
    movd           xm5, [dstq+strideq*2-1]
    movd           xm9, [dstq+stride3q -1]
    movd           xm6, [topq+strideq*0+1]
    movd          xm10, [topq+strideq*1+1]
    pinsrb         xm5, [leftq+ 5], 0
    pinsrb         xm9, [leftq+ 7], 0
    vinserti128     m6, [dstq+strideq*0+1], 1
    vinserti128    m10, [dstq+strideq*1+1], 1
    vinserti128     m5, [botq+strideq*0-1], 1
    vinserti128     m9, [botq+strideq*1-1], 1
    punpckldq       m6, m10
    punpckldq       m5, m9
 %else
    movd           xm6, [topq +strideq*0+1]
    movq           xm9, [dstq +strideq*2-1]
    movq          xm10, [dst4q+strideq*0-1]
    movd          xm11, [dst4q+strideq*2-1]
    pinsrd         xm6, [topq +strideq*1+1], 1
    movhps         xm9, [dstq +stride3q -1]
    movhps        xm10, [dst4q+strideq*1-1]
    pinsrd        xm11, [dst4q+stride3q -1], 1
    pinsrd         xm6, [dstq +strideq*0+1], 2
    pinsrd        xm11, [botq +strideq*0-1], 2
    pinsrd         xm6, [dstq +strideq*1+1], 3
    pinsrd        xm11, [botq +strideq*1-1], 3
    shufps         xm5, xm9, xm10, q2020
    vinserti128     m5, xm11, 1
    pmovzxbw       m11, [leftq+5]
    psrldq         xm9, 2
    psrldq        xm10, 2
    shufps         xm9, xm10, q2020
    movu           m10, [blend_4x8_1+8]
    vinserti128     m6, xm9, 1
    vpblendvb       m5, m11, m10
 %endif
%else
    lea            r13, [blend_8x8_0+16]
    movq           xm5, [dstq+strideq*2-1]
    movq           xm9, [botq+strideq*0-1]
    movq           xm6, [top1q         +1]
    movq          xm10, [dstq+strideq*0+1]
    movhps         xm5, [dstq+stride3q -1]
    movhps         xm9, [botq+strideq*1-1]
    movhps         xm6, [top2q         +1]
    movhps        xm10, [dstq+strideq*1+1]
    movu           m11, [r13+hq*2*2+16*2]
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
    vpblendvb       m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
%endif
    ret
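
; Everything below is the slower path for blocks that touch a frame or
; tile edge (edge mask != 0xf). Editor's note on the edge bits tested
; here: bit 0 = have_left, bit 1 = have_right, bit 2 = have_top and
; bit 3 = have_bottom. Pixels are first widened into the padded 16-bit px
; buffer described above, with 0x8000 sentinels standing in for missing
; neighbors.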

.border_block:
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
    RESET_STACK_STATE
 %assign stack_offset stack_offset - (regs_used - 11) * gprsize
 %assign regs_used 11
    ALLOC_STACK 2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32

    pcmpeqw        m14, m14
    psllw          m14, 15 ; 0x8000

    ; prepare pixel buffers - body/right
%if %1 == 4
    INIT_XMM avx2
%endif
%if %2 == 8
    lea          dst4q, [dstq+strideq*4]
%endif
    lea       stride3q, [strideq*3]
    test         edgeb, 2 ; have_right
    jz .no_right
    pmovzxbw        m1, [dstq+strideq*0]
    pmovzxbw        m2, [dstq+strideq*1]
    pmovzxbw        m3, [dstq+strideq*2]
    pmovzxbw        m4, [dstq+stride3q]
    mova     [px+0*32], m1
    mova     [px+1*32], m2
    mova     [px+2*32], m3
    mova     [px+3*32], m4
%if %2 == 8
    pmovzxbw        m1, [dst4q+strideq*0]
    pmovzxbw        m2, [dst4q+strideq*1]
    pmovzxbw        m3, [dst4q+strideq*2]
    pmovzxbw        m4, [dst4q+stride3q]
    mova     [px+4*32], m1
    mova     [px+5*32], m2
    mova     [px+6*32], m3
    mova     [px+7*32], m4
%endif
    jmp .body_done
.no_right:
%if %1 == 4
    movd           xm1, [dstq+strideq*0]
    movd           xm2, [dstq+strideq*1]
    movd           xm3, [dstq+strideq*2]
    movd           xm4, [dstq+stride3q]
    pmovzxbw       xm1, xm1
    pmovzxbw       xm2, xm2
    pmovzxbw       xm3, xm3
    pmovzxbw       xm4, xm4
    movq     [px+0*32], xm1
    movq     [px+1*32], xm2
    movq     [px+2*32], xm3
    movq     [px+3*32], xm4
%else
    pmovzxbw       xm1, [dstq+strideq*0]
    pmovzxbw       xm2, [dstq+strideq*1]
    pmovzxbw       xm3, [dstq+strideq*2]
    pmovzxbw       xm4, [dstq+stride3q]
    mova     [px+0*32], xm1
    mova     [px+1*32], xm2
    mova     [px+2*32], xm3
    mova     [px+3*32], xm4
%endif
    movd [px+0*32+%1*2], xm14
    movd [px+1*32+%1*2], xm14
    movd [px+2*32+%1*2], xm14
    movd [px+3*32+%1*2], xm14
%if %2 == 8
 %if %1 == 4
    movd           xm1, [dst4q+strideq*0]
    movd           xm2, [dst4q+strideq*1]
    movd           xm3, [dst4q+strideq*2]
    movd           xm4, [dst4q+stride3q]
    pmovzxbw       xm1, xm1
    pmovzxbw       xm2, xm2
    pmovzxbw       xm3, xm3
    pmovzxbw       xm4, xm4
    movq     [px+4*32], xm1
    movq     [px+5*32], xm2
    movq     [px+6*32], xm3
    movq     [px+7*32], xm4
 %else
    pmovzxbw       xm1, [dst4q+strideq*0]
    pmovzxbw       xm2, [dst4q+strideq*1]
    pmovzxbw       xm3, [dst4q+strideq*2]
    pmovzxbw       xm4, [dst4q+stride3q]
    mova     [px+4*32], xm1
    mova     [px+5*32], xm2
    mova     [px+6*32], xm3
    mova     [px+7*32], xm4
 %endif
    movd [px+4*32+%1*2], xm14
    movd [px+5*32+%1*2], xm14
    movd [px+6*32+%1*2], xm14
    movd [px+7*32+%1*2], xm14
%endif
.body_done:

    ; top
    test         edgeb, 4 ; have_top
    jz .no_top
    test         edgeb, 1 ; have_left
    jz .top_no_left
    test         edgeb, 2 ; have_right
    jz .top_no_right
    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
    movu  [px-2*32-%1], m1
    movu  [px-1*32-%1], m2
    jmp .top_done
.top_no_right:
    pmovzxbw        m1, [topq+strideq*0-%1]
    pmovzxbw        m2, [topq+strideq*1-%1]
    movu [px-2*32-%1*2], m1
    movu [px-1*32-%1*2], m2
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.top_no_left:
    test         edgeb, 2 ; have_right
    jz .top_no_left_right
    pmovzxbw        m1, [topq+strideq*0]
    pmovzxbw        m2, [topq+strideq*1]
    mova   [px-2*32+0], m1
    mova   [px-1*32+0], m2
    movd   [px-2*32-4], xm14
    movd   [px-1*32-4], xm14
    jmp .top_done
.top_no_left_right:
%if %1 == 4
    movd           xm1, [topq+strideq*0]
    pinsrd         xm1, [topq+strideq*1], 1
    pmovzxbw       xm1, xm1
    movq   [px-2*32+0], xm1
    movhps [px-1*32+0], xm1
%else
    pmovzxbw       xm1, [topq+strideq*0]
    pmovzxbw       xm2, [topq+strideq*1]
    mova   [px-2*32+0], xm1
    mova   [px-1*32+0], xm2
%endif
    movd   [px-2*32-4], xm14
    movd   [px-1*32-4], xm14
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.no_top:
    movu  [px-2*32-%1], m14
    movu  [px-1*32-%1], m14
.top_done:

    ; left
    test         edgeb, 1 ; have_left
    jz .no_left
    pmovzxbw       xm1, [leftq+ 0]
%if %2 == 8
    pmovzxbw       xm2, [leftq+ 8]
%endif
    movd   [px+0*32-4], xm1
    pextrd [px+1*32-4], xm1, 1
    pextrd [px+2*32-4], xm1, 2
    pextrd [px+3*32-4], xm1, 3
%if %2 == 8
    movd   [px+4*32-4], xm2
    pextrd [px+5*32-4], xm2, 1
    pextrd [px+6*32-4], xm2, 2
    pextrd [px+7*32-4], xm2, 3
%endif
    jmp .left_done
.no_left:
    movd   [px+0*32-4], xm14
    movd   [px+1*32-4], xm14
    movd   [px+2*32-4], xm14
    movd   [px+3*32-4], xm14
%if %2 == 8
    movd   [px+4*32-4], xm14
    movd   [px+5*32-4], xm14
    movd   [px+6*32-4], xm14
    movd   [px+7*32-4], xm14
%endif
.left_done:

    ; bottom
    DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
    test         edgeb, 8 ; have_bottom
    jz .no_bottom
    test         edgeb, 1 ; have_left
    jz .bottom_no_left
    test         edgeb, 2 ; have_right
    jz .bottom_no_right
    pmovzxbw        m1, [botq+strideq*0-(%1/2)]
    pmovzxbw        m2, [botq+strideq*1-(%1/2)]
    movu [px+(%2+0)*32-%1], m1
    movu [px+(%2+1)*32-%1], m2
    jmp .bottom_done
.bottom_no_right:
    pmovzxbw        m1, [botq+strideq*0-%1]
    pmovzxbw        m2, [botq+strideq*1-%1]
    movu [px+(%2+0)*32-%1*2], m1
    movu [px+(%2+1)*32-%1*2], m2
%if %1 == 8
    movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
%endif
    movd [px+(%2+0)*32+%1*2], xm14
    movd [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.bottom_no_left:
    test         edgeb, 2 ; have_right
    jz .bottom_no_left_right
    pmovzxbw        m1, [botq+strideq*0]
    pmovzxbw        m2, [botq+strideq*1]
    mova [px+(%2+0)*32+0], m1
    mova [px+(%2+1)*32+0], m2
    movd [px+(%2+0)*32-4], xm14
    movd [px+(%2+1)*32-4], xm14
    jmp .bottom_done
.bottom_no_left_right:
%if %1 == 4
    movd           xm1, [botq+strideq*0]
    pinsrd         xm1, [botq+strideq*1], 1
    pmovzxbw       xm1, xm1
    movq   [px+(%2+0)*32+0], xm1
    movhps [px+(%2+1)*32+0], xm1
%else
    pmovzxbw       xm1, [botq+strideq*0]
    pmovzxbw       xm2, [botq+strideq*1]
    mova [px+(%2+0)*32+0], xm1
    mova [px+(%2+1)*32+0], xm2
%endif
    movd [px+(%2+0)*32-4], xm14
    movd [px+(%2+1)*32-4], xm14
    movd [px+(%2+0)*32+%1*2], xm14
    movd [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.no_bottom:
    movu [px+(%2+0)*32-%1], m14
    movu [px+(%2+1)*32-%1], m14
.bottom_done:
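
; Editor's note on the shift setup below (the same computation as in the
; main path above): CDEF needs pri_shift = max(0, damping - log2(pri)).
; With lzcnt, floor(log2(x)) = 31 - lzcnt(x) for nonzero x, so after
; "sub dampingd, 31" the sum lzcnt(pri) + (damping - 31) equals
; damping - floor(log2(pri)), and cmovs clamps a negative result to zero.
; E.g. damping 6, pri_strength 4: lzcnt(4) = 29 and 29 + 6 - 31 = 4.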

    ; actual filter
    INIT_YMM avx2
    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
    ; register to shuffle values into after packing
    vbroadcasti128 m12, [shufb_lohi]

    mov       dampingd, r8m
    xor          zerod, zerod
    movifnidn     prid, prim
    sub       dampingd, 31
    movifnidn  secdmpd, secdmpm
    test          prid, prid
    jz .border_sec_only
    movd           xm0, prid
    lzcnt      pridmpd, prid
    add        pridmpd, dampingd
    cmovs      pridmpd, zerod
    mov        [rsp+0], pridmpq ; pri_shift
    test       secdmpd, secdmpd
    jz .border_pri_only
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd
    mov        [rsp+8], secdmpq ; sec_shift

    DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb   m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
    vpbroadcastb    m0, xm0 ; pri_strength
    vpbroadcastb    m1, xm1 ; sec_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    lea           secq, [tableq+12]       ; sec_taps

    BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
    vpbroadcastb    m2, [priq+kq] ; pri_taps
    vpbroadcastb    m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
    dec             kq
    jge .border_k_loop

    vpbroadcastd   m10, [pw_2048]
    BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_v_loop
%endif
    RET

.border_pri_only:
    DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq] ; pri_shift_mask
    DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
    vpbroadcastb    m0, xm0 ; pri_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb    m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec             kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_pri_v_loop
%endif
    RET

.border_sec_only:
    DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd
    mov        [rsp+8], secdmpq ; sec_shift
    DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m14, [tableq+secdmpq] ; sec_shift_mask
    DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
    vpbroadcastb    m1, xm1 ; sec_strength
    lea           secq, [tableq+12] ; sec_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m0, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
    vpbroadcastb    m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec             kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_sec_v_loop
%endif
    RET
%endmacro

CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
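
; Editor's sketch of what cdef_dir computes (scalar form of the CDEF
; direction search): for each of 8 directions d, the centered 8x8 pixels
; are summed along that direction's lines and
;
;   cost[d] = sum_i partial[d][i]^2 * (840 / line_length(d, i))
;
; div_table holds the 840/n factors: 840, 420, 280, 210, 168, 140, 120,
; 105 for lengths 1-8, then 420, 210, 140, 105 for the length-2/4/6/8
; line pairs of the alt directions. The returned direction is
; argmax(cost), and *var is (cost[best] - cost[best ^ 4]) >> 10, i.e. the
; best direction measured against the orthogonal one.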

INIT_YMM avx2
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea       stride3q, [strideq*3]
    movq           xm0, [srcq+strideq*0]
    movq           xm1, [srcq+strideq*1]
    movq           xm2, [srcq+strideq*2]
    movq           xm3, [srcq+stride3q ]
    lea           srcq, [srcq+strideq*4]
    vpbroadcastq    m4, [srcq+stride3q ]
    vpbroadcastq    m5, [srcq+strideq*2]
    vpblendd        m0, m4, 0xf0
    vpblendd        m1, m5, 0xf0
    vpbroadcastq    m4, [srcq+strideq*1]
    vpbroadcastq    m5, [srcq+strideq*0]
    vpblendd        m2, m4, 0xf0
    vpblendd        m3, m5, 0xf0
    pxor            m4, m4
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
cglobal_label .main
    vpbroadcastd    m4, [pw_128]
    PROLOGUE 3, 4, 15
    psubw           m0, m4
    psubw           m1, m4
    psubw           m2, m4
    psubw           m3, m4

    ; shuffle registers to generate partial_sum_diag[0-1] together
    vperm2i128      m7, m0, m0, 0x01
    vperm2i128      m6, m1, m1, 0x01
    vperm2i128      m5, m2, m2, 0x01
    vperm2i128      m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw           m8, m0, m1
    paddw           m9, m2, m3
    phaddw         m10, m0, m1
    phaddw         m11, m2, m3
    paddw           m8, m9
    phaddw         m10, m11
    vextracti128   xm9, m8, 1
    vextracti128  xm11, m10, 1
    paddw          xm8, xm9  ; partial_sum_hv[1]
    phaddw        xm10, xm11 ; partial_sum_hv[0]
    vinserti128     m8, xm10, 1
    vpbroadcastd    m9, [div_table+44]
    pmaddwd         m8, m8
    pmulld          m8, m9   ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m9, m1, 2
    psrldq         m10, m1, 14
    pslldq         m11, m2, 4
    psrldq         m12, m2, 12
    pslldq         m13, m3, 6
    psrldq         m14, m3, 10
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m4, 8
    psrldq         m12, m4, 8
    pslldq         m13, m5, 10
    psrldq         m14, m5, 6
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m6, 12
    psrldq         m12, m6, 4
    pslldq         m13, m7, 14
    psrldq         m14, m7, 2
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]
    vbroadcasti128 m12, [div_table+0]
    paddw           m9, m0  ; partial_sum_diag[0/1][0-7]
    pshufb         m10, m14
    punpckhwd      m11, m9, m10
    punpcklwd       m9, m10
    pmaddwd        m11, m11
    pmaddwd         m9, m9
    pmulld         m11, m13
    pmulld          m9, m12
    paddd           m9, m11 ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw          m10, m0, m1
    paddw          m11, m2, m3
    paddw          m12, m4, m5
    paddw          m13, m6, m7
    phaddw          m0, m4
    phaddw          m1, m5
    phaddw          m2, m6
    phaddw          m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m4, m11, 2
    psrldq         m11, 14
    pslldq          m5, m12, 4
    psrldq         m12, 12
    pslldq          m6, m13, 6
    psrldq         m13, 10
    paddw           m4, m10
    paddw          m11, m12
    vpbroadcastd   m12, [div_table+44]
    paddw           m5, m6
    paddw          m11, m13 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]
    paddw           m4, m5  ; partial_sum_alt[3/2] left
    pshuflw         m5, m11, q3012
    punpckhwd       m6, m11, m4
    punpcklwd       m4, m5
    pmaddwd         m6, m6
    pmaddwd         m4, m4
    pmulld          m6, m12
    pmulld          m4, m13
    paddd           m4, m6  ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m5, m1, 2
    psrldq          m1, 14
    pslldq          m6, m2, 4
    psrldq          m2, 12
    pslldq          m7, m3, 6
    psrldq          m3, 10
    paddw           m5, m0
    paddw           m1, m2
    paddw           m6, m7
    paddw           m1, m3 ; partial_sum_alt[0/1] right
    paddw           m5, m6 ; partial_sum_alt[0/1] left
    pshuflw         m0, m1, q3012
    punpckhwd       m1, m5
    punpcklwd       m5, m0
    pmaddwd         m1, m1
    pmaddwd         m5, m5
    pmulld          m1, m12
    pmulld          m5, m13
    paddd           m5, m1 ; cost1[a-d] | cost3[a-d]
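
; Editor's note on the selection below: vpermd with the pd_47130256
; indices puts the phaddd results back into plain cost[0-7] lane order.
; The argmax is then branchless: subtracting the pmaxsd-reduced best cost
; leaves zero in the best lane and negative values elsewhere; packing to
; signed words turns the negatives into huge unsigned values, so
; phminposuw returns the index of a zero lane - the best direction.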

    mova           xm0, [pd_47130256+ 16]
    mova            m1, [pd_47130256]
    phaddd          m9, m8
    phaddd          m5, m4
    phaddd          m9, m5
    vpermd          m0, m9 ; cost[0-3]
    vpermd          m1, m9 ; cost[4-7] | cost[0-3]

    ; now find the best cost
    pmaxsd         xm2, xm0, xm1
    pshufd         xm3, xm2, q1032
    pmaxsd         xm2, xm3
    pshufd         xm3, xm2, q2301
    pmaxsd         xm2, xm3 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd          xm4, xm1, xm2
    psubd          xm3, xm0, xm2
    packssdw       xm3, xm4
    phminposuw     xm3, xm3

    ; convert idx to 32-bits
    psrld          xm3, 16
    movd           eax, xm3

    ; get idx^4 complement
    vpermd          m3, m1
    psubd          xm2, xm3
    psrld          xm2, 10
    movd         [varq], xm2
    RET

%endif ; ARCH_X86_64