; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

wiener_shufA:  db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12
wiener_shufB:  db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13
wiener_shufC:  db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
wiener_shufD:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47
               db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
sgr_shuf:      db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8
               db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128
sgr_mix_perm:  db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
r_ext_mask:    times 68 db -1
               times 4 db 0
wiener_x_shuf: db 0, 2, -1, 0
wiener_x_add:  db 0, 1,127, 0

pw_61448:   times 2 dw 61448
pw_164_455: dw 164, 455
pd_m16380:  dd -16380
pd_m4096:   dd -4096
pd_m25:     dd -25
pd_m9:      dd -9
pd_34816:   dd 34816
pd_8421376: dd 8421376

cextern sgr_x_by_x

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

INIT_ZMM avx512icl
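; Wiener filter, 7-tap. The horizontal pass prefilters each input row into
; 16-bit intermediates kept on the stack; t0-t6 (see DECLARE_REG_TMP above)
; form a ring buffer of row pointers (384*2 bytes per row, 32*2 in the .w32
; path) that is rotated after every output row, and the vertical pass
; consumes seven of those rows per output row.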
cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
        w, h, edge, flt
    mov fltq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    vbroadcasti32x4 m6, [wiener_shufA]
    vbroadcasti32x4 m7, [wiener_shufB]
    mov r10d, 0xfffe
    vbroadcasti32x4 m8, [wiener_shufC]
    vbroadcasti32x4 m9, [wiener_shufD]
    kmovw k1, r10d
    vpbroadcastd m0, [wiener_x_shuf]
    vpbroadcastd m1, [wiener_x_add]
    mov r10, 0xaaaaaaaaaaaaaaaa
    vpbroadcastd m11, [fltq+ 0]
    vpbroadcastd m12, [fltq+ 4]
    kmovq k2, r10
    vpbroadcastd m10, [pd_m16380]
    packsswb m11, m11 ; x0 x1 x0 x1
    vpbroadcastd m14, [fltq+16]
    pshufb m12, m0
    vpbroadcastd m15, [fltq+20]
    paddb m12, m1 ; x2 x3+1 x2 127
    vpbroadcastd m13, [pd_8421376]
    psllw m14, 5 ; y0 y1
    psllw m15, 5 ; y2 y3
    cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32
    jle .w32   ; pixels, so we need a special case for small widths
    lea t1, [rsp+wq*2+16]
    add lpfq, wq
    add dstq, wq
    neg wq
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    mov t4, t1
    add t1, 384*2
    add r10, strideq
    mov [rsp], r10 ; below
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.h:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm16, [leftq]
    vmovdqu32 m16{k1}, [lpfq+r10-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm16, [lpfq+r10]       ; the masked load ensures that no exception
    vmovdqu32 m16{k1}, [lpfq+r10-4]     ; gets raised from accessing invalid memory
    jmp .h_main
.h_top:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m16, [lpfq+r10-4]
.h_main:
    movu m17, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -66
    jl .h_have_right
    push r0
    lea r0, [r_ext_mask+65]
    vpbroadcastb m0, [lpfq-1]
    vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
    vpternlogd m17, m0, [r0+r10+8], 0xe4
    pop r0
.h_have_right:
    pshufb m4, m16, m6
    mova m0, m10
    vpdpbusd m0, m4, m11
    pshufb m4, m16, m7
    mova m2, m10
    vpdpbusd m2, m4, m11
    pshufb m4, m17, m6
    mova m1, m10
    vpdpbusd m1, m4, m11
    pshufb m4, m17, m7
    mova m3, m10
    vpdpbusd m3, m4, m11
    pshufb m4, m16, m8
    vpdpbusd m0, m4, m12
    pshufb m16, m9
    vpdpbusd m2, m16, m12
    pshufb m4, m17, m8
    vpdpbusd m1, m4, m12
    pshufb m17, m9
    vpdpbusd m3, m17, m12
    packssdw m0, m2
    packssdw m1, m3
    psraw m0, 3
    psraw m1, 3
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+64], m1
    add r10, 64
    jl .h_loop
    ret
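; .hv filters one new input row horizontally and computes one output row
; vertically. The 7 vertical taps are symmetric, so rows are summed in pairs
; around the center row t3:
;   (cur + t6)*y0 + (t5 + t1)*y1 + (t4 + t2)*y2 + t3*y3
; where cur is the freshly filtered row that also gets stored to t0.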
ALIGN function_align
.hv:
    add lpfq, strideq
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm16, [leftq]
    vmovdqu32 m16{k1}, [lpfq+r10-4]
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb xm16, [lpfq+r10]
    vmovdqu32 m16{k1}, [lpfq+r10-4]
    jmp .hv_main
.hv_bottom:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m16, [lpfq+r10-4]
.hv_main:
    movu m17, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -66
    jl .hv_have_right
    push r0
    lea r0, [r_ext_mask+65]
    vpbroadcastb m0, [lpfq-1]
    vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
    vpternlogd m17, m0, [r0+r10+8], 0xe4
    pop r0
.hv_have_right:
    pshufb m4, m16, m6
    mova m0, m10
    vpdpbusd m0, m4, m11
    pshufb m4, m16, m7
    mova m2, m10
    vpdpbusd m2, m4, m11
    pshufb m4, m17, m6
    mova m1, m10
    vpdpbusd m1, m4, m11
    pshufb m4, m17, m7
    mova m3, m10
    vpdpbusd m3, m4, m11
    pshufb m4, m16, m8
    vpdpbusd m0, m4, m12
    pshufb m16, m9
    vpdpbusd m2, m16, m12
    pshufb m4, m17, m8
    vpdpbusd m1, m4, m12
    pshufb m17, m9
    vpdpbusd m3, m17, m12
    packssdw m0, m2
    packssdw m1, m3
    psraw m0, 3
    psraw m1, 3
    mova m16, [t4+r10*2]
    paddw m16, [t2+r10*2]
    mova m3, [t3+r10*2]
    mova m17, [t4+r10*2+64]
    paddw m17, [t2+r10*2+64]
    mova m5, [t3+r10*2+64]
    punpcklwd m4, m16, m3
    mova m2, m13
    vpdpwssd m2, m4, m15
    punpcklwd m18, m17, m5
    mova m4, m13
    vpdpwssd m4, m18, m15
    punpckhwd m16, m3
    mova m3, m13
    vpdpwssd m3, m16, m15
    punpckhwd m17, m5
    mova m5, m13
    vpdpwssd m5, m17, m15
    mova m17, [t5+r10*2]
    paddw m17, [t1+r10*2]
    paddw m16, m0, [t6+r10*2]
    mova m19, [t5+r10*2+64]
    paddw m19, [t1+r10*2+64]
    paddw m18, m1, [t6+r10*2+64]
    mova [t0+r10*2+ 0], m0
    mova [t0+r10*2+64], m1
    punpcklwd m0, m16, m17
    vpdpwssd m2, m0, m14
    punpcklwd m1, m18, m19
    vpdpwssd m4, m1, m14
    punpckhwd m16, m17
    vpdpwssd m3, m16, m14
    punpckhwd m18, m19
    vpdpwssd m5, m18, m14
    packuswb m2, m4
    psrlw m2, 8
    vpackuswb m2{k2}, m3, m5
    movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
    add r10, 64         ; function is used for chroma as well, and in some
    jl .hv_loop         ; esoteric edge cases chroma dst pointers may only
    mov t6, t5          ; have a 32-byte alignment despite having a width
    mov t5, t4          ; larger than 32, so use an unaligned store here.
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
    add dstq, strideq
    ret
.v:
    mov r10, wq
.v_loop:
    mova m4, [t4+r10*2+ 0]
    paddw m4, [t2+r10*2+ 0]
    mova m1, [t3+r10*2+ 0]
    mova m5, [t4+r10*2+64]
    paddw m5, [t2+r10*2+64]
    mova m3, [t3+r10*2+64]
    punpcklwd m6, m4, m1
    mova m0, m13
    vpdpwssd m0, m6, m15
    punpcklwd m6, m5, m3
    mova m2, m13
    vpdpwssd m2, m6, m15
    punpckhwd m4, m1
    mova m1, m13
    vpdpwssd m1, m4, m15
    punpckhwd m5, m3
    mova m3, m13
    vpdpwssd m3, m5, m15
    mova m5, [t1+r10*2+ 0]
    paddw m4, m5, [t6+r10*2+ 0]
    paddw m5, [t5+r10*2+ 0]
    mova m7, [t1+r10*2+64]
    paddw m6, m7, [t6+r10*2+64]
    paddw m7, [t5+r10*2+64]
    punpcklwd m8, m4, m5
    vpdpwssd m0, m8, m14
    punpcklwd m8, m6, m7
    vpdpwssd m2, m8, m14
    punpckhwd m4, m5
    vpdpwssd m1, m4, m14
    punpckhwd m6, m7
    vpdpwssd m3, m6, m14
    packuswb m0, m2
    psrlw m0, 8
    vpackuswb m0{k2}, m1, m3
    movu [dstq+r10], m0
    add r10, 64
    jl .v_loop
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, strideq
    ret
.w32:
    lea r10, [r_ext_mask+73]
    mova ym18, [wiener_perm32]
    lea t1, [rsp+16]
    sub r10, wq
    test edgeb, 4 ; LR_HAVE_TOP
    jz .w32_no_top
    call .w32_h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 32*2
    call .w32_h_top
    lea r9, [lpfq+strideq*4]
    mov lpfq, dstq
    mov t4, t1
    add t1, 32*2
    add r9, strideq
    mov [rsp], r9 ; below
    call .w32_h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .w32_v1
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    mov t2, t1
    dec hd
    jz .w32_v2
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    dec hd
    jz .w32_v3
.w32_main:
    lea t0, [t1+32*2]
.w32_main_loop:
    call .w32_hv
    dec hd
    jnz .w32_main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .w32_v3
    mov lpfq, [rsp]
    call .w32_hv_bottom
    add lpfq, strideq
    call .w32_hv_bottom
.w32_v1:
    call .w32_v
    RET
.w32_no_top:
    lea r9, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r9, [r9+strideq*2]
    mov [rsp], r9
    call .w32_h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .w32_v1
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    mov t2, t1
    dec hd
    jz .w32_v2
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    dec hd
    jz .w32_v3
    lea t0, [t1+32*2]
    call .w32_hv
    dec hd
    jz .w32_v3
    add t0, 32*8
    call .w32_hv
    dec hd
    jnz .w32_main
.w32_v3:
    call .w32_v
.w32_v2:
    call .w32_v
    jmp .w32_v1
.w32_h:
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_h_extend_left
    movd xm16, [leftq]
    vmovdqu32 ym16{k1}, [lpfq-4]
    add leftq, 4
    jmp .w32_h_main
.w32_h_extend_left:
    vpbroadcastb xm16, [lpfq]       ; the masked load ensures that no exception
    vmovdqu32 ym16{k1}, [lpfq-4]    ; gets raised from accessing invalid memory
    jmp .w32_h_main
.w32_h_top:
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_h_extend_left
    movu ym16, [lpfq-4]
.w32_h_main:
    vinserti32x8 m16, [lpfq+4], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .w32_h_have_right
    vpbroadcastb m0, [lpfq+wq-1]
    movu ym17, [r10-8]
    vinserti32x8 m17, [r10+0], 1
    vpternlogd m16, m0, m17, 0xe4 ; c ? a : b
.w32_h_have_right:
    pshufb m2, m16, m6
    mova m0, m10
    vpdpbusd m0, m2, m11
    pshufb m2, m16, m7
    mova m1, m10
    vpdpbusd m1, m2, m11
    pshufb m2, m16, m8
    vpdpbusd m0, m2, m12
    pshufb m16, m9
    vpdpbusd m1, m16, m12
    packssdw m0, m1
    psraw m0, 3
    mova [t1], m0
    ret
.w32_hv:
    add lpfq, strideq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_hv_extend_left
    movd xm16, [leftq]
    vmovdqu32 ym16{k1}, [lpfq-4]
    add leftq, 4
    jmp .w32_hv_main
.w32_hv_extend_left:
    vpbroadcastb xm16, [lpfq]
    vmovdqu32 ym16{k1}, [lpfq-4]
    jmp .w32_hv_main
.w32_hv_bottom:
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_hv_extend_left
    movu ym16, [lpfq-4]
.w32_hv_main:
    vinserti32x8 m16, [lpfq+4], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .w32_hv_have_right
    vpbroadcastb m0, [lpfq+wq-1]
    movu ym17, [r10-8]
    vinserti32x8 m17, [r10+0], 1
    vpternlogd m16, m0, m17, 0xe4
.w32_hv_have_right:
    mova m3, [t4]
    paddw m3, [t2]
    mova m2, [t3]
    pshufb m4, m16, m6
    mova m0, m10
    vpdpbusd m0, m4, m11
    pshufb m4, m16, m7
    mova m5, m10
    vpdpbusd m5, m4, m11
    punpcklwd m4, m3, m2
    mova m1, m13
    vpdpwssd m1, m4, m15
    punpckhwd m3, m2
    mova m2, m13
    vpdpwssd m2, m3, m15
    pshufb m4, m16, m8
    vpdpbusd m0, m4, m12
    pshufb m16, m9
    vpdpbusd m5, m16, m12
    packssdw m0, m5
    psraw m0, 3
    mova m4, [t5]
    paddw m4, [t1]
    paddw m3, m0, [t6]
    mova [t0], m0
    punpcklwd m0, m3, m4
    vpdpwssd m1, m0, m14
    punpckhwd m3, m4
    vpdpwssd m2, m3, m14
    packuswb m1, m2
    vpermb m16, m18, m1
    mova [dstq], ym16
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
    add dstq, strideq
    ret
.w32_v:
    mova m2, [t4]
    paddw m2, [t2]
    mova m1, [t3]
    mova m4, [t1]
    paddw m3, m4, [t6]
    paddw m4, [t5]
    punpcklwd m5, m2, m1
    mova m0, m13
    vpdpwssd m0, m5, m15
    punpckhwd m2, m1
    mova m1, m13
    vpdpwssd m1, m2, m15
    punpcklwd m2, m3, m4
    vpdpwssd m0, m2, m14
    punpckhwd m3, m4
    vpdpwssd m1, m3, m14
    packuswb m0, m1
    vpermb m16, m18, m0
    mova [dstq], ym16
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, strideq
    ret

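; Self-guided filter, 5x5 box. Per pixel, from the box sum b and the box sum
; of squares a, the C reference computes (roughly):
;   p = max(a*25 - b*b, 0),  z = (p*s + (1 << 19)) >> 20
;   x = sgr_x_by_x[min(z, 255)],  b' = (x*b*164 + (1 << 11)) >> 12
; Here x and b' are kept packed in a single dword per pixel (a | (b << 12),
; see below) with the rounding bias folded in, and are unpacked again in the
; neighbor passes.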
cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
        w, h, edge, params
    mov paramsq, r6mp
    mov wd, wm
    mov hd, hm
    mov edged, r7m
    vbroadcasti32x4 m5, [sgr_shuf+1]
    add lpfq, wq
    vbroadcasti32x4 m6, [sgr_shuf+9]
    add dstq, wq
    vbroadcasti32x4 m7, [sgr_shuf+3]
    lea t3, [rsp+wq*4+16+416*12]
    vbroadcasti32x4 m8, [sgr_shuf+7]
    pxor m4, m4
    vpbroadcastd m9, [pd_m25]
    vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
    vpbroadcastw m15, [paramsq+8]       ; w0
    lea t1, [rsp+wq*2+20]
    vpbroadcastd m10, [pw_164_455]
    neg wq
    vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
    mov r10d, 0xfe
    vpbroadcastd m13, [pd_m4096]
    kmovb k1, r10d
    vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
    mov r10, 0x3333333333333333
    mova m18, [sgr_x_by_x+64*0]
    kmovq k2, r10
    mova m19, [sgr_x_by_x+64*1]
    lea r12, [r_ext_mask+75]
    mova m20, [sgr_x_by_x+64*2]
    psllw m15, 4
    mova m21, [sgr_x_by_x+64*3]
    lea r10, [lpfq+strideq*4]
    mova ym22, [sgr_shuf]
    add r10, strideq
    mov [rsp], r10 ; below
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    call .top_fixup
    add t1, 416*6
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    mov t0, t2
    dec hd
    jz .height1
    or edged, 16
    call .h
.main:
    add lpfq, strideq
    call .hv
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    test hd, hd
    jz .odd_height
    call .h
    add lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .h_top
    add lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea t2, [t1+416*6]
    call .top_fixup
    dec hd
    jz .no_top_height1
    or edged, 16
    mov t0, t1
    mov t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.h: ; horizontal boxsum
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.h_have_right:
    pshufb m3, m17, m5
    pmullw m2, m3, m3
    pshufb m1, m17, m6
    paddw m0, m3, m1
    shufps m3, m1, q2121
    paddw m0, m3
    punpcklwd m16, m3, m1
    punpckhwd m3, m1
    punpcklwd m1, m2, m4
    vpdpwssd m1, m16, m16
    punpckhwd m2, m4
    vpdpwssd m2, m3, m3
    pshufb m16, m17, m7
    paddw m0, m16
    pshufb m17, m8
    paddw m0, m17 ; sum
    punpcklwd m3, m16, m17
    vpdpwssd m1, m3, m3 ; sumsq
    punpckhwd m16, m17
    vpdpwssd m2, m16, m16
    test edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw m0, [t1+r10*2+416*0]
    paddd m1, [t1+r10*2+416*2]
    paddd m2, [t1+r10*2+416*4]
.h_loop_end:
    mova [t1+r10*2+416*0], m0
    mova [t1+r10*2+416*2], m1
    mova [t1+r10*2+416*4], m2
    add r10, 32
    jl .h_loop
    ret
.top_fixup:
    lea r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova m0, [t1+r10*2+416*0]
    mova m1, [t1+r10*2+416*2]
    mova m2, [t1+r10*2+416*4]
    paddw m0, m0
    paddd m1, m1
    paddd m2, m2
    mova [t2+r10*2+416*0], m0
    mova [t2+r10*2+416*2], m1
    mova [t2+r10*2+416*4], m2
    add r10, 32
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu ym17, [lpfq+r10-2]
.hv_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -34
    jl .hv_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv_have_right:
    pshufb m1, m17, m5
    pmullw m3, m1, m1
    pshufb m2, m17, m6
    paddw m0, m1, m2
    shufps m1, m2, q2121
    paddw m0, m1
    punpcklwd m16, m1, m2
    punpckhwd m1, m2
    punpcklwd m2, m3, m4
    vpdpwssd m2, m16, m16
    punpckhwd m3, m4
    vpdpwssd m3, m1, m1
    pshufb m16, m17, m7
    paddw m0, m16
    pshufb m17, m8
    paddw m0, m17 ; h sum
    punpcklwd m1, m16, m17
    vpdpwssd m2, m1, m1 ; h sumsq
    punpckhwd m16, m17
    vpdpwssd m3, m16, m16
    paddw m1, m0, [t1+r10*2+416*0]
    paddd m16, m2, [t1+r10*2+416*2]
    paddd m17, m3, [t1+r10*2+416*4]
    test hd, hd
    jz .hv_last_row
.hv_main2:
    paddd m16, [t2+r10*2+416*2] ; hv sumsq
    paddd m17, [t2+r10*2+416*4]
    paddw m1, [t2+r10*2+416*0]  ; hv sum
    mova [t0+r10*2+416*2], m2
    mova [t0+r10*2+416*4], m3
    mova [t0+r10*2+416*0], m0
    pmulld m16, m9 ; -a * 25
    pmulld m17, m9
    punpcklwd m0, m1, m4 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 164
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16              ; The neighbor calculations require
    mova [t3+r10*4+ 24], xm17            ; 13 bits for a and 21 bits for b.
    vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
    mova [t3+r10*4+ 72], m17             ; that gets us most of the way.
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10*2+416*0], m1
    paddw m1, m0
    mova [t1+r10*2+416*2], m16
    paddd m16, m2
    mova [t1+r10*2+416*4], m17
    paddd m17, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea r10, [wq-2]
.v_loop:
    mova m2, [t1+r10*2+416*2]
    paddd m16, m2, [t2+r10*2+416*2]
    mova m3, [t1+r10*2+416*4]
    paddd m17, m3, [t2+r10*2+416*4]
    paddd m2, m2
    paddd m3, m3
    paddd m16, m2 ; hv sumsq
    paddd m17, m3
    pmulld m16, m9 ; -a * 25
    pmulld m17, m9
    mova m0, [t1+r10*2+416*0]
    paddw m1, m0, [t2+r10*2+416*0]
    paddw m0, m0
    paddw m1, m0 ; hv sum
    punpcklwd m0, m1, m4 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 164
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16
    mova [t3+r10*4+ 24], xm17
    vextracti32x4 [t3+r10*4+ 56], m17, 2
    mova [t3+r10*4+ 72], m17
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .v_loop
    ret
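; The packed a/b values are filtered with horizontal weights 5,6,5 ("565").
; Even output rows use the sum of the two adjacent a/b rows (.n0), odd output
; rows reuse the single nearest a/b row (.n1), which is why the two passes
; round and shift by different amounts.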
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t3+r10*4+ 4]
    movu m1, [t3+r10*4+68]
    paddd m2, m0, [t3+r10*4+ 0]
    paddd m3, m1, [t3+r10*4+64]
    paddd m2, [t3+r10*4+ 8]
    paddd m3, [t3+r10*4+72]
    paddd m0, m2
    pslld m2, 2
    paddd m1, m3
    pslld m3, 2
    paddd m2, m0 ; ab 565
    paddd m3, m1
    pandn m0, m13, m2 ; a
    psrld m2, 12      ; b
    pandn m1, m13, m3
    psrld m3, 12
    mova [t3+r10*4+416*4+ 0], m0
    mova [t3+r10*4+416*8+ 0], m2
    mova [t3+r10*4+416*4+64], m1
    mova [t3+r10*4+416*8+64], m3
    add r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m16, [t3+r10*4+ 4]
    movu m17, [t3+r10*4+68]
    paddd m0, m16, [t3+r10*4+ 0]
    paddd m1, m17, [t3+r10*4+64]
    paddd m0, [t3+r10*4+ 8]
    paddd m1, [t3+r10*4+72]
    paddd m16, m0
    pslld m0, 2
    paddd m17, m1
    pslld m1, 2
    paddd m0, m16
    paddd m1, m17
    pandn m16, m13, m0
    psrld m0, 12
    pandn m17, m13, m1
    psrld m1, 12
    paddd m2, m16, [t3+r10*4+416*4+ 0] ; a
    paddd m3, m17, [t3+r10*4+416*4+64]
    mova [t3+r10*4+416*4+ 0], m16
    mova [t3+r10*4+416*4+64], m17
    paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
    paddd m17, m1, [t3+r10*4+416*8+64]
    mova [t3+r10*4+416*8+ 0], m0
    mova [t3+r10*4+416*8+64], m1
    pmovzxbd m0, [dstq+r10+ 0]
    pmovzxbd m1, [dstq+r10+16]
    pmaddwd m2, m0 ; a * src
    pmaddwd m3, m1
    packssdw m0, m1
    psubd m16, m2 ; b - a * src + (1 << 8)
    psubd m17, m3
    psrad m16, 9
    psrad m17, 9
    packssdw m16, m17
    pmulhrsw m16, m15
    paddw m16, m0
    packuswb m16, m16
    vpermd m16, m22, m16
    mova [dstq+r10], ym16
    add r10, 32
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    pmovzxbd m0, [dstq+r10+ 0]
    pmovzxbd m1, [dstq+r10+16]
    pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src
    pmaddwd m3, m1, [t3+r10*4+416*4+64]
    mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7)
    mova m17, [t3+r10*4+416*8+64]
    packssdw m0, m1
    psubd m16, m2 ; b - a * src + (1 << 7)
    psubd m17, m3
    psrad m16, 8
    psrad m17, 8
    packssdw m16, m17
    pmulhrsw m16, m15
    paddw m16, m0
    packuswb m16, m16
    vpermd m16, m22, m16
    mova [dstq+r10], ym16
    add r10, 32
    jl .n1_loop
    add dstq, strideq
    ret

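; Self-guided filter, 3x3 box (n = 9, one_by_x = 455). The neighbor pass
; weights the packed a/b values of the three surrounding rows as
; 3,4,3 / 4,4,4 / 3,4,3, built from the per-row "343" and "222" partial sums.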
cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
        w, h, edge, params
    mov paramsq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    vbroadcasti32x4 m5, [sgr_shuf+3]
    add lpfq, wq
    vbroadcasti32x4 m6, [sgr_shuf+5]
    add dstq, wq
    vbroadcasti32x4 m7, [sgr_shuf+7]
    pxor m4, m4
    vpbroadcastd m8, [pd_m9]
    vpsubd m11, m4, [paramsq+4] {1to16} ; -s1
    vpbroadcastw m15, [paramsq+10]      ; w1
    lea t1, [rsp+wq*2+20]
    vpbroadcastd m10, [pw_164_455]
    lea t3, [rsp+wq*4+16+416*12]
    vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
    neg wq
    vpbroadcastd m13, [pd_m4096]
    mov r10d, 0xfe
    vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
    kmovb k1, r10d
    mova m18, [sgr_x_by_x+64*0]
    mov r10, 0x3333333333333333
    mova m19, [sgr_x_by_x+64*1]
    kmovq k2, r10
    mova m20, [sgr_x_by_x+64*2]
    psllw m15, 4
    mova m21, [sgr_x_by_x+64*3]
    lea r14, [r_ext_mask+75]
    mova ym9, [sgr_shuf]
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    add t1, 416*6
    call .h_top
    lea t4, [lpfq+strideq*4]
    mov lpfq, dstq
    add t4, strideq
    mov [rsp], t4 ; below
    mov t0, t2
    call .hv
.main:
    mov t5, t3
    add t3, 416*4
    dec hd
    jz .height1
    add lpfq, strideq
    call .hv
    call .prep_n
    dec hd
    jz .extend_bottom
.main_loop:
    add lpfq, strideq
    call .hv
    call .n
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv_bottom
    call .n
    add lpfq, strideq
    call .hv_bottom
.end:
    call .n
    RET
.height1:
    call .v
    call .prep_n
    mov t2, t1
    call .v
    jmp .end
.extend_bottom:
    call .v
    call .n
    mov t2, t1
    call .v
    jmp .end
.no_top:
    lea t4, [lpfq+strideq*4]
    mov lpfq, dstq
    lea t4, [t4+strideq*2]
    mov [rsp], t4
    call .h
    lea t0, [t1+416*6]
    mov t2, t1
    call .v
    jmp .main
.h: ; horizontal boxsum
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -33
    jl .h_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r14+r10-8]
    vinserti32x8 m16, [r14+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.h_have_right:
    pshufb m0, m17, m5
    pmullw m2, m0, m0
    pshufb m16, m17, m6
    paddw m0, m16
    pshufb m17, m7
    paddw m0, m17 ; sum
    punpcklwd m3, m16, m17
    punpcklwd m1, m2, m4
    vpdpwssd m1, m3, m3 ; sumsq
    punpckhwd m16, m17
    punpckhwd m2, m4
    vpdpwssd m2, m16, m16
    mova [t1+r10*2+416*0], m0
    mova [t1+r10*2+416*2], m1
    mova [t1+r10*2+416*4], m2
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu ym17, [lpfq+r10-2]
.hv_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -33
    jl .hv_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r14+r10-8]
    vinserti32x8 m16, [r14+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv_have_right:
    pshufb m0, m17, m5
    pmullw m3, m0, m0
    pshufb m1, m17, m6
    paddw m0, m1
    pshufb m17, m7
    paddw m0, m17 ; h sum
    punpcklwd m16, m17, m1
    punpcklwd m2, m3, m4
    vpdpwssd m2, m16, m16 ; h sumsq
    punpckhwd m17, m1
    punpckhwd m3, m4
    vpdpwssd m3, m17, m17
    paddw m1, m0, [t2+r10*2+416*0]
    paddw m1, [t1+r10*2+416*0]      ; hv sum
    paddd m16, m2, [t2+r10*2+416*2]
    paddd m17, m3, [t2+r10*2+416*4]
    paddd m16, [t1+r10*2+416*2]     ; hv sumsq
    paddd m17, [t1+r10*2+416*4]
    mova [t0+r10*2+416*0], m0
    mova [t0+r10*2+416*2], m2
    mova [t0+r10*2+416*4], m3
    pmulld m16, m8 ; -a * 9
    pmulld m17, m8
    punpcklwd m0, m4, m1 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16
    mova [t3+r10*4+ 24], xm17
    vextracti32x4 [t3+r10*4+ 56], m17, 2
    mova [t3+r10*4+ 72], m17
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.v: ; vertical boxsum + ab
    lea r10, [wq-2]
.v_loop:
    mova m16, [t1+r10*2+416*2]
    mova m17, [t1+r10*2+416*4]
    paddd m16, m16
    paddd m17, m17
    paddd m16, [t2+r10*2+416*2] ; hv sumsq
    paddd m17, [t2+r10*2+416*4]
    pmulld m16, m8 ; -a * 9
    pmulld m17, m8
    mova m1, [t1+r10*2+416*0]
    paddw m1, m1
    paddw m1, [t2+r10*2+416*0] ; hv sum
    punpcklwd m0, m4, m1 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16
    mova [t3+r10*4+ 24], xm17
    vextracti32x4 [t3+r10*4+ 56], m17, 2
    mova [t3+r10*4+ 72], m17
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
    mov t4, t3
    add t3, 416*4
.prep_n_loop:
    mova m2, [t5+r10*4+0]
    mova m3, [t4+r10*4+0]
    paddd m2, [t5+r10*4+8]
    paddd m3, [t4+r10*4+8]
    paddd m0, m2, [t5+r10*4+4]
    paddd m1, m3, [t4+r10*4+4]
    pslld m0, 2
    paddd m1, m1 ; ab[ 0] 222
    psubd m0, m2 ; ab[-1] 343
    mova [t3+r10*4+416*4], m1
    paddd m1, m1
    mova [t5+r10*4], m0
    psubd m1, m3 ; ab[ 0] 343
    mova [t4+r10*4], m1
    add r10, 16
    jl .prep_n_loop
    ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
ALIGN function_align
.n: ; neighbor + output
    mov r10, wq
.n_loop:
    mova m16, [t3+r10*4+ 0]
    paddd m16, [t3+r10*4+ 8]
    paddd m17, m16, [t3+r10*4+ 4]
    paddd m17, m17 ; ab[+1] 222
    mova m2, [t3+r10*4+416*4+ 0]
    paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
    mova m3, [t3+r10*4+416*4+64]
    paddd m1, m3, [t5+r10*4+64]
    mova [t3+r10*4+416*4+ 0], m17
    paddd m17, m17
    psubd m17, m16 ; ab[+1] 343
    mova [t5+r10*4+ 0], m17
    paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343
    mova m16, [t3+r10*4+64]
    paddd m16, [t3+r10*4+72]
    paddd m17, m16, [t3+r10*4+68]
    paddd m17, m17
    mova [t3+r10*4+416*4+64], m17
    paddd m17, m17
    psubd m17, m16
    mova [t5+r10*4+64], m17
    pandn m16, m13, m0
    psrld m0, 12
    paddd m3, m17
    pandn m17, m13, m2
    psrld m2, 12
    paddd m16, m17 ; a
    pandn m17, m13, m1
    psrld m1, 12
    paddd m0, m2 ; b + (1 << 8)
    pandn m2, m13, m3
    psrld m3, 12
    paddd m17, m2
    pmovzxbd m2, [dstq+r10+ 0]
    paddd m1, m3
    pmovzxbd m3, [dstq+r10+16]
    pmaddwd m16, m2 ; a * src
    pmaddwd m17, m3
    packssdw m2, m3
    psubd m0, m16 ; b - a * src + (1 << 8)
    psubd m1, m17
    psrad m0, 9
    psrad m1, 9
    packssdw m0, m1
    pmulhrsw m0, m15
    paddw m0, m2
    packuswb m0, m0
    vpermd m16, m9, m0
    mova [dstq+r10], ym16
    add r10, 32
    jl .n_loop
    mov r10, t5
    mov t5, t4
    mov t4, r10
    add dstq, strideq
    ret

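; Combined 5x5 + 3x3 ("mix") self-guided filter: both box sizes are computed
; from the same horizontal box sums, and the two filtered outputs are blended
; with the weights w0 and w1 from the filter parameters.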
cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
        w, h, edge, params
    mov paramsq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    vbroadcasti128 m5, [sgr_shuf+1]
    add lpfq, wq
    vbroadcasti128 m6, [sgr_shuf+9]
    add dstq, wq
    vbroadcasti128 m7, [sgr_shuf+3]
    lea t3, [rsp+wq*4+416*24+8]
    vbroadcasti128 m8, [sgr_shuf+7]
    pxor m4, m4
    vpbroadcastd m9, [pd_m9]
    vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
    vpbroadcastd m14, [pw_61448]
    vpsubd m12, m4, [paramsq+4] {1to16} ; -s1
    vpbroadcastd m26, [paramsq+8]       ; w0 w1
    lea t1, [rsp+wq*2+12]
    vpbroadcastd m10, [pd_m25]
    neg wq
    vpbroadcastd m13, [pw_164_455]
    mov r10d, 0xfe
    vpbroadcastd m15, [pd_34816]
    kmovb k1, r10d
    mova m20, [sgr_x_by_x+64*0]
    mov r10, 0x3333333333333333
    mova m21, [sgr_x_by_x+64*1]
    kmovq k2, r10
    mova m22, [sgr_x_by_x+64*2]
    lea r12, [r_ext_mask+75]
    mova m23, [sgr_x_by_x+64*3]
    vpbroadcastd m24, [pd_m4096]
    vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____
    psllw m26, 5
    mova xm27, [sgr_mix_perm]
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
    add t1, 416*12
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    call .hv0
.main:
    dec hd
    jz .height1
    add lpfq, strideq
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    call .hv0
    test hd, hd
    jz .odd_height
    add lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv0_bottom
    add lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea t2, [t1+416*12]
    lea r10, [wq-2]
.top_fixup_loop:
    mova m0, [t1+r10*2+416* 0]
    mova m1, [t1+r10*2+416* 2]
    mova m2, [t1+r10*2+416* 4]
    paddw m0, m0
    mova m3, [t1+r10*2+416* 6]
    paddd m1, m1
    mova m16, [t1+r10*2+416* 8]
    paddd m2, m2
    mova m17, [t1+r10*2+416*10]
    mova [t2+r10*2+416* 0], m0
    mova [t2+r10*2+416* 2], m1
    mova [t2+r10*2+416* 4], m2
    mova [t2+r10*2+416* 6], m3
    mova [t2+r10*2+416* 8], m16
    mova [t2+r10*2+416*10], m17
    add r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsums
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.h_have_right:
    pshufb m3, m17, m5
    pshufb m18, m17, m6
    shufps m0, m3, m18, q2121
    pmullw m2, m0, m0
    pshufb m19, m17, m7
    paddw m0, m19
    pshufb m17, m8
    paddw m0, m17 ; sum3
    punpcklwd m16, m19, m17
    punpcklwd m1, m2, m4
    vpdpwssd m1, m16, m16 ; sumsq3
    punpckhwd m19, m17
    punpckhwd m2, m4
    vpdpwssd m2, m19, m19
    mova [t1+r10*2+416* 6], m0
    mova [t1+r10*2+416* 8], m1
    mova [t1+r10*2+416*10], m2
    punpcklwd m19, m3, m18
    paddw m0, m3
    vpdpwssd m1, m19, m19 ; sumsq5
    punpckhwd m3, m18
    paddw m0, m18 ; sum5
    vpdpwssd m2, m3, m3
    mova [t1+r10*2+416* 0], m0
    mova [t1+r10*2+416* 2], m1
    mova [t1+r10*2+416* 4], m2
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv0_main
.hv0_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu ym17, [lpfq+r10-2]
.hv0_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp r10d, -34
    jl .hv0_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv0_have_right:
    pshufb m18, m17, m5
    pshufb m19, m17, m6
    shufps m1, m18, m19, q2121
    pmullw m3, m1, m1
    pshufb m0, m17, m7
    paddw m1, m0
    pshufb m17, m8
    paddw m1, m17 ; sum3
    punpcklwd m16, m0, m17
    punpcklwd m2, m3, m4
    vpdpwssd m2, m16, m16 ; sumsq3
    punpckhwd m0, m17
    punpckhwd m3, m4
    vpdpwssd m3, m0, m0
    paddw m0, m1, [t1+r10*2+416* 6]
    paddd m16, m2, [t1+r10*2+416* 8]
    paddd m17, m3, [t1+r10*2+416*10]
    mova [t1+r10*2+416* 6], m1
    mova [t1+r10*2+416* 8], m2
    mova [t1+r10*2+416*10], m3
    paddw m1, m18
    paddw m1, m19 ; sum5
    mova [t3+r10*4+416*8+ 8], m1
    paddw m1, [t1+r10*2+416* 0]
    mova [t1+r10*2+416* 0], m1
    punpcklwd m1, m18, m19
    vpdpwssd m2, m1, m1 ; sumsq5
    punpckhwd m18, m19
    vpdpwssd m3, m18, m18
    mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
    mova [t3+r10*4+416*0+72], m3 ; in case height is odd
    paddd m2, [t1+r10*2+416* 2]
    paddd m3, [t1+r10*2+416* 4]
    mova [t1+r10*2+416* 2], m2
    mova [t1+r10*2+416* 4], m3
    paddw m1, m0, [t2+r10*2+416* 6]
    paddd m2, m16, [t2+r10*2+416* 8]
    paddd m3, m17, [t2+r10*2+416*10]
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416* 8], m16
    mova [t2+r10*2+416*10], m17
    pmulld m16, m2, m9 ; -a3 * 9
    pmulld m17, m3, m9
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m22
    paddusw m17, m14
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x3
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*4+ 8], m16
    mova [t3+r10*4+416*4+ 24], xm17
    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
    mova [t3+r10*4+416*4+ 72], m17
    vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
    add r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv1_main
.hv1_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu ym17, [lpfq+r10-2]
.hv1_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp r10d, -34
    jl .hv1_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv1_have_right:
    pshufb m3, m17, m5
    pshufb m19, m17, m6
    shufps m2, m3, m19, q2121
    pmullw m1, m2, m2
    pshufb m18, m17, m7
    paddw m2, m18
    pshufb m17, m8
    paddw m2, m17 ; sum3
    punpcklwd m16, m17, m18
    punpcklwd m0, m1, m4
    vpdpwssd m0, m16, m16 ; sumsq3
    punpckhwd m17, m18
    punpckhwd m1, m4
    vpdpwssd m1, m17, m17
    paddd m16, m0, [t2+r10*2+416* 8]
    paddd m17, m1, [t2+r10*2+416*10]
    mova [t2+r10*2+416* 8], m0
    mova [t2+r10*2+416*10], m1
    punpcklwd m18, m3, m19
    vpdpwssd m0, m18, m18 ; sumsq5
    punpckhwd m18, m3, m19
    vpdpwssd m1, m18, m18
    paddw m3, m19
    pmulld m16, m9 ; -a3 * 9
    pmulld m17, m9
    paddd m18, m0, [t2+r10*2+416*2]
    paddd m19, m1, [t2+r10*2+416*4]
    paddd m18, [t1+r10*2+416*2]
    paddd m19, [t1+r10*2+416*4]
    mova [t2+r10*2+416*2], m0
    mova [t2+r10*2+416*4], m1
    pmulld m18, m10 ; -a5 * 25
    pmulld m19, m10
    paddw m1, m2, [t2+r10*2+416* 6]
    mova [t2+r10*2+416* 6], m2
    paddw m2, m3 ; sum5
    paddw m3, m2, [t2+r10*2+416*0]
    paddw m3, [t1+r10*2+416*0]
    mova [t2+r10*2+416*0], m2
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    punpcklwd m2, m3, m4 ; b5
    vpdpwssd m18, m2, m2 ; -p5
    punpckhwd m3, m4
    vpdpwssd m19, m3, m3
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmulld m18, m11 ; p5 * s0
    pmulld m19, m11
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    pmaddwd m2, m13 ; b5 * 164
    pmaddwd m3, m13
    vpalignr m17{k2}, m16, m16, 2
    vpalignr m19{k2}, m18, m18, 2
    paddusw m17, m14
    mova m16, m22
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    paddusw m19, m14
    mova m18, m22
    psraw m19, 4 ; min(z5, 255) - 256
    vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k4, m19
    vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x3
    vmovdqu8 m19{k4}, m18  ; x5
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    pandn m18, m24, m19
    psrld m19, 16
    pmulld m2, m18
    pmulld m3, m19
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*8+ 8], m16
    mova [t3+r10*4+416*8+ 24], xm17
    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
    paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m3, m15
    mova [t3+r10*4+416*8+ 72], m17
    vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
    vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
    vpternlogd m19, m3, m24, 0xd8
    mova [t3+r10*4+416*0+ 8], m18
    mova [t3+r10*4+416*0+ 24], xm19
    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
    mova [t3+r10*4+416*0+ 72], m19
    vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
    add r10, 32
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea r10, [wq-2]
.v0_loop:
    mova m2, [t1+r10*2+416* 8]
    mova m3, [t1+r10*2+416*10]
    paddd m2, m2
    paddd m3, m3
    paddd m16, m2, [t2+r10*2+416* 8]
    paddd m17, m3, [t2+r10*2+416*10]
    mova m0, [t1+r10*2+416* 6]
    paddw m0, m0
    paddw m1, m0, [t2+r10*2+416* 6]
    pmulld m16, m9 ; -a3 * 9
    pmulld m17, m9
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416* 8], m2
    mova [t2+r10*2+416*10], m3
    mova m2, [t1+r10*2+416*0]
    mova m3, [t1+r10*2+416*2]
    mova m18, [t1+r10*2+416*4]
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    mova [t3+r10*4+416*8+ 8], m2
    mova [t3+r10*4+416*0+ 8], m3
    mova [t3+r10*4+416*0+72], m18
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m22
    paddusw m17, m14
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x3
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddw m2, m2 ; cc5
    paddd m3, m3
    paddd m18, m18
    mova [t1+r10*2+416*0], m2
    mova [t1+r10*2+416*2], m3
    mova [t1+r10*2+416*4], m18
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*4+ 8], m16
    mova [t3+r10*4+416*4+ 24], xm17
    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
    mova [t3+r10*4+416*4+ 72], m17
    vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
    add r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea r10, [wq-2]
.v1_loop:
    mova m0, [t1+r10*2+416* 8]
    paddd m16, m0, [t2+r10*2+416* 8]
    mova m1, [t1+r10*2+416*10]
    paddd m17, m1, [t2+r10*2+416*10]
    mova m2, [t3+r10*4+416*0+ 8]
    paddd m18, m2, [t2+r10*2+416* 2]
    mova m3, [t3+r10*4+416*0+72]
    paddd m19, m3, [t2+r10*2+416* 4]
    paddd m18, [t1+r10*2+416* 2]
    paddd m19, [t1+r10*2+416* 4]
    mova [t2+r10*2+416* 8], m0
    mova [t2+r10*2+416*10], m1
    mova [t2+r10*2+416* 2], m2
    mova [t2+r10*2+416* 4], m3
    pmulld m16, m9 ; -a3 * 9
    pmulld m17, m9
    pmulld m18, m10 ; -a5 * 25
    pmulld m19, m10
    mova m0, [t1+r10*2+416* 6]
    paddw m1, m0, [t2+r10*2+416* 6]
    mova m2, [t3+r10*4+416*8+ 8]
    paddw m3, m2, [t2+r10*2+416*0]
    paddw m3, [t1+r10*2+416*0]
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416*0], m2
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    punpcklwd m2, m3, m4 ; b5
    vpdpwssd m18, m2, m2 ; -p5
    punpckhwd m3, m4
    vpdpwssd m19, m3, m3
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmulld m18, m11 ; p5 * s0
    pmulld m19, m11
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    pmaddwd m2, m13 ; b5 * 164
    pmaddwd m3, m13
    vpalignr m17{k2}, m16, m16, 2
    vpalignr m19{k2}, m18, m18, 2
    paddusw m17, m14
    mova m16, m22
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    paddusw m19, m14
    mova m18, m22
    psraw m19, 4 ; min(z5, 255) - 256
    vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k4, m19
    vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16  ; x3
    vmovdqu8 m19{k4}, m18  ; x5
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    pandn m18, m24, m19
    psrld m19, m19, 16
    pmulld m2, m18
    pmulld m3, m19
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*8+ 8], m16
    mova [t3+r10*4+416*8+ 24], xm17
    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
    paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m3, m15
    mova [t3+r10*4+416*8+ 72], m17
    vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
    vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
    vpternlogd m19, m3, m24, 0xd8
    mova [t3+r10*4+416*0+ 8], m18
    mova [t3+r10*4+416*0+ 24], xm19
    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
    mova [t3+r10*4+416*0+ 72], m19
    vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
    add r10, 32
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t3+r10*4+416*0+4]
    paddd m1, m0, [t3+r10*4+416*0+0]
    mova m16, [t3+r10*4+416*4+0]
    paddd m1, [t3+r10*4+416*0+8]
    mova m17, [t3+r10*4+416*8+0]
    paddd m16, [t3+r10*4+416*4+8]
    paddd m17, [t3+r10*4+416*8+8]
    paddd m2, m16, [t3+r10*4+416*4+4]
    paddd m3, m17, [t3+r10*4+416*8+4]
    paddd m0, m1
    pslld m1, 2
    pslld m2, 2
    paddd m1, m0 ; ab5 565
    paddd m3, m3 ; ab3[ 0] 222
    psubd m2, m16 ; ab3[-1] 343
    mova [t3+r10*4+416*20], m3
    pandn m0, m24, m1 ; a5 565
    mova [t3+r10*4+416*24], m2
    psrld m1, 12      ; b5 565
    mova [t3+r10*4+416*12], m0
    paddd m3, m3
    mova [t3+r10*4+416*16], m1
    psubd m3, m17 ; ab3[ 0] 343
    mova [t3+r10*4+416*28], m3
    add r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m2, [t3+r10*4+4]
    paddd m3, m2, [t3+r10*4+0]
    paddd m3, [t3+r10*4+8]
    mova m1, [t3+r10*4+416*4+0]
    paddd m2, m3
    pslld m3, 2
    paddd m1, [t3+r10*4+416*4+8]
    paddd m3, m2
    pandn m2, m24, m3
    psrld m3, 12
    paddd m0, m2, [t3+r10*4+416*12]  ; a5
    paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
    mova [t3+r10*4+416*12], m2
    mova [t3+r10*4+416*16], m3
    paddd m2, m1, [t3+r10*4+416*4+4]
    paddd m2, m2 ; ab3[ 1] 222
    mova m3, [t3+r10*4+416*20]
    paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+416*20], m2
    paddd m2, m2
    psubd m2, m1 ; ab3[ 1] 343
    mova [t3+r10*4+416*24], m2
    paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343
    pandn m1, m24, m17
    psrld m17, 12
    pandn m3, m24, m2
    psrld m2, 12
    paddd m1, m3 ; a3
    pmovzxbd m3, [dstq+r10]
    paddd m17, m2 ; b3 + (1 << 8)
    pmaddwd m0, m3 ; a5 * src
    pmaddwd m1, m3 ; a3 * src
    vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
    psubd m16, m0 ; b5 - a5 * src + (1 << 8)
    psubd m17, m1 ; b3 - a3 * src + (1 << 8)
    psrld m16, 9
    pslld m17, 7
    vmovdqu8 m17{k2}, m16
    vpdpwssd m3, m17, m26
    packuswb m3, m2
    vpermb m16, m27, m3
    mova [dstq+r10], xm16
    add r10, 16
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    mova m1, [t3+r10*4+416*8+0]
    paddd m1, [t3+r10*4+416*8+8]
    paddd m2, m1, [t3+r10*4+416*8+4]
    paddd m2, m2 ; ab3[ 1] 222
    mova m0, [t3+r10*4+416*20]
    paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
    pmovzxbd m3, [dstq+r10]
    mova [t3+r10*4+416*20], m2
    paddd m2, m2
    psubd m2, m1 ; ab3[ 1] 343
    mova [t3+r10*4+416*28], m2
    paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343
    pandn m1, m24, m17
    psrld m17, 12
    pandn m2, m24, m0
    psrld m0, 12
    paddd m1, m2 ; a3
    paddd m17, m0 ; b3 + (1 << 8)
    mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7)
    pmaddwd m1, m3 ; a3 * src
    pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src
    vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
    psubd m17, m1 ; b3 - a3 * src + (1 << 8)
    psubd m16, m0 ; b5 - a5 * src + (1 << 7)
    pslld m17, 7
    palignr m17{k2}, m16, m16, 1
    vpdpwssd m3, m17, m26
    packuswb m3, m3
    vpermb m16, m27, m3
    mova [dstq+r10], xm16
    add r10, 16
    jl .n1_loop
    add dstq, strideq
    ret

%endif ; ARCH_X86_64