; Copyright © 2022-2024, VideoLAN and dav1d authors
; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

ipred_shuf:    db 14, 15, 14, 15,  0,  1,  2,  3,  6,  7,  6,  7,  0,  1,  2,  3
               db 10, 11, 10, 11,  8,  9, 10, 11,  2,  3,  2,  3,  8,  9, 10, 11
               db 12, 13, 12, 13,  4,  5,  6,  7,  4,  5,  4,  5,  4,  5,  6,  7
               db  8,  9,  8,  9, 12, 13, 14, 15,  0,  1,  0,  1, 12, 13, 14, 15
smooth_perm:   db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51
               db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
               db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
               db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
pw_31to0:      dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
               dw 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
pw_1to32:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
               dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
z_upsample:    dw  0, -1,  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6
               dw  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
z_xpos_mul:    dw  1,  1,  1,  1,  2,  2,  1,  1,  3,  3,  2,  2,  4,  4,  2,  2
               dw  5,  5,  3,  3,  6,  6,  3,  3,  7,  7,  4,  4,  8,  8,  4,  4
z_ypos_mul:    dw  0,  0,  0,  0,  1,  1,  0,  0,  2,  2,  1,  1,  3,  3,  1,  1
               dw  4,  4,  2,  2,  5,  5,  2,  2,  6,  6,  3,  3,  7,  7,  3,  3
z_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_xpos_off1a:  dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168
z_xpos_off1b:  dw 30720, 30848, 30976, 31104, 31232, 31360, 31488, 31616
filter_permA:  times 4 db  6,  7,  8,  9, 14, 15,  4,  5
               times 4 db 10, 11, 12, 13,  2,  3, -1, -1
filter_permB:  times 4 db 22, 23, 24, 25, 30, 31,  6,  7
               times 4 db 26, 27, 28, 29, 14, 15, -1, -1
filter_permC:  dd 8 ; dq 8, 10, 1, 11, 0, 9
pw_1:          times 2 dw 1
               dd 10
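; The dd values interleaved below complete the filter_permC qword table
; commented above (8, 10, 1, 11, 0, 9): the qword permute only consumes the
; low bits of each 64-bit index, so the high halves are free to double as
; the unrelated pw_1/filter_rnd/filter_shift constants.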
filter_rnd:    dd 32
               dd 1
               dd 8
               dd 11
filter_shift:  times 2 dw 6
               dd 0
               times 2 dw 4
               dd 9
pd_65536:      dd 65536
pal_unpack:    db  0,  8,  4, 12, 32, 40, 36, 44
               db 16, 24, 20, 28, 48, 56, 52, 60
z_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 79, 79, 79
z_filter_k:    dw  8,  8,  6,  6,  4,  4
               dw  4,  4,  5,  5,  4,  4
               dw  0,  0,  0,  0,  2,  2
pb_90:         times 4 db 90
pw_15:         times 2 dw 15
pw_16:         times 2 dw 16
pw_17:         times 2 dw 17
pw_24:         times 2 dw 24
pw_31:         times 2 dw 31
pw_32:         times 2 dw 32
pw_63:         times 2 dw 63
pw_64:         times 2 dw 64
pw_512:        times 2 dw 512
pw_2048:       times 2 dw 2048
pw_31806:      times 2 dw 31806
pw_32640:      times 2 dw 32640
pw_32672:      times 2 dw 32672
pw_32704:      times 2 dw 32704
pw_32735:      times 2 dw 32735
pw_32736:      times 2 dw 32736

%define pw_2     (z_xpos_mul+4* 2)
%define pw_3     (z_xpos_mul+4* 4)
%define pw_7     (z_xpos_mul+4*12)
%define pw_0to31 (pw_1to32-2)

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

JMP_TABLE ipred_paeth_16bpc,    avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc,       avx512icl, w4, w8, w16, w32, w64

cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

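; Paeth prediction: for each pixel, select left, top or topleft, whichever
; is closest to base = left + top - topleft. Here m2 = left, m3 = topleft,
; m%2 = top - topleft (signed) and m%3 = |top - topleft| (the left diff).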
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw           m0, m%2, m2
    psubw           m1, m0, m3  ; tldiff
    psubw           m0, m%1     ; tdiff
    pabsw           m1, m1
    pabsw           m0, m0
    pcmpgtw         k1, m0, m1
    pminsw          m0, m1
    pcmpgtw         k2, m%3, m0
    vpblendmw       m0{k1}, m%1, m3
    vpblendmw       m0{k2}, m2, m0
%endmacro

INIT_ZMM avx512icl
cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
%define base r6-ipred_paeth_16bpc_avx512icl_table
    lea             r6, [ipred_paeth_16bpc_avx512icl_table]
    tzcnt           wd, wm
    movifnidn       hd, hm
    movsxd          wq, [r6+wq*4]
    vpbroadcastw    m3, [tlq] ; topleft
    add             wq, r6
    jmp             wq
.w4:
    vpbroadcastq    m4, [tlq+2] ; top
    movsldup        m7, [base+ipred_shuf]
    lea             r6, [strideq*3]
    psubw           m5, m4, m3
    pabsw           m6, m5
.w4_loop:
    sub             tlq, 16
    vbroadcasti32x4 m2, [tlq]
    pshufb          m2, m7 ; left
    PAETH           4, 5, 6
    vextracti32x4   xm1, m0, 2
    vextracti32x4   xm8, ym0, 1
    vextracti32x4   xm9, m0, 3
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*1], xm1
    movq            [dstq+strideq*2], xm8
    movq            [dstq+r6       ], xm9
    sub             hd, 8
    jl .w4_end
    lea             dstq, [dstq+strideq*4]
    movhps          [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm1
    movhps          [dstq+strideq*2], xm8
    movhps          [dstq+r6       ], xm9
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.w8:
    vbroadcasti32x4 m4, [tlq+2]
    movsldup        m7, [base+ipred_shuf]
    lea             r6, [strideq*3]
    psubw           m5, m4, m3
    pabsw           m6, m5
.w8_loop:
    sub             tlq, 8
    vpbroadcastq    m2, [tlq]
    pshufb          m2, m7
    PAETH           4, 5, 6
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], m0, 2
    vextracti32x4   [dstq+strideq*2], ym0, 1
    vextracti32x4   [dstq+r6       ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x8 m4, [tlq+2]
    movsldup        m7, [base+ipred_shuf]
    psubw           m5, m4, m3
    pabsw           m6, m5
.w16_loop:
    sub             tlq, 4
    vpbroadcastd    m2, [tlq]
    pshufb          m2, m7
    PAETH           4, 5, 6
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w16_loop
    RET
.w32:
    movu            m4, [tlq+2]
    psubw           m5, m4, m3
    pabsw           m6, m5
.w32_loop:
    sub             tlq, 2
    vpbroadcastw    m2, [tlq]
    PAETH           4, 5, 6
    mova            [dstq], m0
    add             dstq, strideq
    dec             hd
    jg .w32_loop
    RET
.w64:
    movu            m4, [tlq+ 2]
    movu            m7, [tlq+66]
    psubw           m5, m4, m3
    psubw           m8, m7, m3
    pabsw           m6, m5
    pabsw           m9, m8
.w64_loop:
    sub             tlq, 2
    vpbroadcastw    m2, [tlq]
    PAETH           4, 5, 6
    mova            [dstq+64*0], m0
    PAETH           7, 8, 9
    mova            [dstq+64*1], m0
    add             dstq, strideq
    dec             hd
    jg .w64_loop
    RET

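; Vertical smooth prediction: each output row is a blend between the top row
; and the bottom-left pixel, dst = bottom + w[y]*(top - bottom), with the
; pre-scaled 1-D weights applied via pmulhrsw (rounding multiply-high).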
cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
%define base r6-$$
    lea             r6, [$$]
    tzcnt           wd, wm
    mov             hd, hm
    movsxd          wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
    lea             weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg             hq
    vpbroadcastw    m6, [tlq+hq*2] ; bottom
    lea             wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
    lea             stride3q, [strideq*3]
    jmp             wq
.w4:
    vpbroadcastq    m5, [tlq+2] ; top
    movsldup        m4, [ipred_shuf]
    psubw           m5, m6 ; top - bottom
.w4_loop:
    vbroadcasti32x4 m3, [weightsq+hq*2]
    pshufb          m3, m4
    pmulhrsw        m3, m5
    paddw           m3, m6
    vextracti32x4   xm0, m3, 3
    vextracti32x4   xm1, ym3, 1
    vextracti32x4   xm2, m3, 2
    movhps          [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm1
    movhps          [dstq+strideq*2], xm2
    movhps          [dstq+stride3q ], xm3
    add             hq, 8
    jg .end
    lea             dstq, [dstq+strideq*4]
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*1], xm1
    movq            [dstq+strideq*2], xm2
    movq            [dstq+stride3q ], xm3
    lea             dstq, [dstq+strideq*4]
    jl .w4_loop
.end:
    RET
.w8:
    vbroadcasti32x4 m5, [tlq+2] ; top
    movsldup        m4, [ipred_shuf]
    psubw           m5, m6 ; top - bottom
.w8_loop:
    vpbroadcastq    m0, [weightsq+hq*2]
    pshufb          m0, m4
    pmulhrsw        m0, m5
    paddw           m0, m6
    vextracti32x4   [dstq+strideq*0], m0, 3
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    mova            [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    add             hq, 4
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x8 m5, [tlq+2] ; top
    movsldup        m4, [ipred_shuf]
    psubw           m5, m6 ; top - bottom
.w16_loop:
    vpbroadcastd    m0, [weightsq+hq*2+0]
    vpbroadcastd    m1, [weightsq+hq*2+4]
    pshufb          m0, m4
    pshufb          m1, m4
    pmulhrsw        m0, m5
    pmulhrsw        m1, m5
    paddw           m0, m6
    paddw           m1, m6
    vextracti32x8   [dstq+strideq*0], m0, 1
    mova            [dstq+strideq*1], ym0
    vextracti32x8   [dstq+strideq*2], m1, 1
    mova            [dstq+stride3q ], ym1
    lea             dstq, [dstq+strideq*4]
    add             hq, 4
    jl .w16_loop
    RET
.w32:
    movu            m5, [tlq+2]
    psubw           m5, m6
.w32_loop:
    vpbroadcastw    m0, [weightsq+hq*2+0]
    vpbroadcastw    m1, [weightsq+hq*2+2]
    vpbroadcastw    m2, [weightsq+hq*2+4]
    vpbroadcastw    m3, [weightsq+hq*2+6]
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {paddw    x, m6}, m0, m1, m2, m3
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    mova            [dstq+strideq*2], m2
    mova            [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]
    add             hq, 4
    jl .w32_loop
    RET
.w64:
    movu            m4, [tlq+ 2]
    movu            m5, [tlq+66]
    psubw           m4, m6
    psubw           m5, m6
.w64_loop:
    vpbroadcastw    m1, [weightsq+hq*2+0]
    vpbroadcastw    m3, [weightsq+hq*2+2]
    pmulhrsw        m0, m4, m1
    pmulhrsw        m1, m5
    pmulhrsw        m2, m4, m3
    pmulhrsw        m3, m5
    REPX {paddw x, m6}, m0, m1, m2, m3
    mova            [dstq+strideq*0+64*0], m0
    mova            [dstq+strideq*0+64*1], m1
    mova            [dstq+strideq*1+64*0], m2
    mova            [dstq+strideq*1+64*1], m3
    lea             dstq, [dstq+strideq*2]
    add             hq, 2
    jl .w64_loop
    RET

cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
    lea             r6, [$$]
    mov             wd, wm
    movifnidn       hd, hm
    vpbroadcastw    m6, [tlq+wq*2] ; right
    tzcnt           wd, wd
    add             hd, hd
    movsxd          wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
    sub             tlq, hq
    lea             stride3q, [strideq*3]
    lea             wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
    jmp             wq
.w4:
    movsldup        m4, [base+ipred_shuf]
    vpbroadcastq    m5, [base+smooth_weights_1d_16bpc+4*2]
.w4_loop:
    vbroadcasti32x4 m0, [tlq+hq-16] ; left
    pshufb          m0, m4
    psubw           m0, m6 ; left - right
    pmulhrsw        m0, m5
    paddw           m0, m6
    vextracti32x4   xm1, m0, 2
    vextracti32x4   xm2, ym0, 1
    vextracti32x4   xm3, m0, 3
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*1], xm1
    movq            [dstq+strideq*2], xm2
    movq            [dstq+stride3q ], xm3
    sub             hd, 8*2
    jl .end
    lea             dstq, [dstq+strideq*4]
    movhps          [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm1
    movhps          [dstq+strideq*2], xm2
    movhps          [dstq+stride3q ], xm3
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.end:
    RET
.w8:
    movsldup        m4, [base+ipred_shuf]
    vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
.w8_loop:
    vpbroadcastq    m0, [tlq+hq-8] ; left
    pshufb          m0, m4
    psubw           m0, m6 ; left - right
    pmulhrsw        m0, m5
    paddw           m0, m6
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], m0, 2
    vextracti32x4   [dstq+strideq*2], ym0, 1
    vextracti32x4   [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4*2
    jg .w8_loop
    RET
.w16:
    movsldup        m4, [base+ipred_shuf]
    vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastd    m0, [tlq+hq-4]
    vpbroadcastd    m1, [tlq+hq-8]
    pshufb          m0, m4
    pshufb          m1, m4
    psubw           m0, m6
    psubw           m1, m6
    pmulhrsw        m0, m5
    pmulhrsw        m1, m5
    paddw           m0, m6
    paddw           m1, m6
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    mova            [dstq+strideq*2], ym1
    vextracti32x8   [dstq+stride3q ], m1, 1
    lea             dstq, [dstq+strideq*4]
    sub             hq, 4*2
    jg .w16_loop
    RET
.w32:
    movu            m5, [base+smooth_weights_1d_16bpc+32*2]
.w32_loop:
    vpbroadcastq    m3, [tlq+hq-8]
    punpcklwd       m3, m3
    psubw           m3, m6
    pshufd          m0, m3, q3333
    pshufd          m1, m3, q2222
    pshufd          m2, m3, q1111
    pshufd          m3, m3, q0000
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {paddw    x, m6}, m0, m1, m2, m3
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    mova            [dstq+strideq*2], m2
    mova            [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]
    sub             hq, 4*2
    jg .w32_loop
    RET
.w64:
    movu            m4, [base+smooth_weights_1d_16bpc+64*2]
    movu            m5, [base+smooth_weights_1d_16bpc+64*3]
.w64_loop:
    vpbroadcastw    m1, [tlq+hq-2]
    vpbroadcastw    m3, [tlq+hq-4]
    psubw           m1, m6
    psubw           m3, m6
    pmulhrsw        m0, m4, m1
    pmulhrsw        m1, m5
    pmulhrsw        m2, m4, m3
    pmulhrsw        m3, m5
    REPX {paddw x, m6}, m0, m1, m2, m3
    mova            [dstq+strideq*0+64*0], m0
    mova            [dstq+strideq*0+64*1], m1
    mova            [dstq+strideq*1+64*0], m2
    mova            [dstq+strideq*1+64*1], m3
    lea             dstq, [dstq+strideq*2]
    sub             hq, 2*2
    jg .w64_loop
    RET

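; 2-D smooth prediction; per the AV1 spec each output pixel is
; (w_v*top + (256-w_v)*bottom + w_h*left + (256-w_h)*right + 256) >> 9.
; Both dot products are accumulated in dwords with pmaddwd/vpdpwssd,
; smooth_perm then extracts bits 8..23 of every dword (a free >> 8) and
; pavgw against zero supplies the final rounded >> 1.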
cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
    lea             r6, [$$]
    mov             wd, wm
    movifnidn       hd, hm
    vpbroadcastw    m13, [tlq+wq*2] ; right
    tzcnt           wd, wd
    add             hd, hd
    movsxd          wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
    mov             r5d, 0x55555555
    sub             tlq, hq
    mova            m14, [base+smooth_perm]
    kmovd           k1, r5d
    vpbroadcastw    m0, [tlq] ; bottom
    mov             r5, 0x3333333333333333
    pxor            m15, m15
    lea             wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
    kmovq           k2, r5
    lea             v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
    jmp             wq
.w4:
    vpbroadcastq    m5, [tlq+hq+2]
    movshdup        m3, [base+ipred_shuf]
    movsldup        m4, [base+ipred_shuf]
    vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4]
    lea             stride3q, [strideq*3]
    punpcklwd       m5, m0 ; top, bottom
.w4_loop:
    vbroadcasti32x4 m0, [v_weightsq]
    vpbroadcastq    m2, [tlq+hq-8]
    mova            m1, m13
    pshufb          m0, m3
    pmaddwd         m0, m5
    pshufb          m1{k2}, m2, m4 ; left, right
    vpdpwssd        m0, m1, m6
    vpermb          m0, m14, m0
    pavgw           ym0, ym15
    vextracti32x4   xm1, ym0, 1
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*1], xm1
    movhps          [dstq+strideq*2], xm0
    movhps          [dstq+stride3q ], xm1
    lea             dstq, [dstq+strideq*4]
    add             v_weightsq, 4*4
    sub             hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4 ym5, [tlq+hq+2]
    movshdup        m6, [base+ipred_shuf]
    movsldup        m7, [base+ipred_shuf]
    pmovzxwd        m5, ym5
    vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4]
    lea             stride3q, [strideq*3]
    vpblendmw       m5{k1}, m0, m5 ; top, bottom
.w8_loop:
    vpbroadcastq    m0, [v_weightsq+0]
    vpbroadcastq    m1, [v_weightsq+8]
    vpbroadcastd    m3, [tlq+hq-4]
    vpbroadcastd    m4, [tlq+hq-8]
    pshufb          m0, m6
    pmaddwd         m0, m5
    pshufb          m1, m6
    pmaddwd         m1, m5
    mova            m2, m13
    pshufb          m2{k2}, m3, m7 ; left, right
    mova            m3, m13
    pshufb          m3{k2}, m4, m7
    vpdpwssd        m0, m2, m8
    vpdpwssd        m1, m3, m8
    add             v_weightsq, 4*4
    vpermt2b        m0, m14, m1
    pavgw           m0, m15
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    vextracti32x4   [dstq+stride3q ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4*2
    jg .w8_loop
    RET
.w16:
    pmovzxwd        m5, [tlq+hq+2]
    mova            m6, [base+smooth_weights_2d_16bpc+16*4]
    vpblendmw       m5{k1}, m0, m5 ; top, bottom
.w16_loop:
    vpbroadcastd    m0, [v_weightsq+0]
    vpbroadcastd    m1, [v_weightsq+4]
    pmaddwd         m0, m5
    pmaddwd         m1, m5
    mova            m2, m13
    vpbroadcastw    m2{k1}, [tlq+hq-2] ; left, right
    mova            m3, m13
    vpbroadcastw    m3{k1}, [tlq+hq-4]
    vpdpwssd        m0, m2, m6
    vpdpwssd        m1, m3, m6
    add             v_weightsq, 2*4
    vpermt2b        m0, m14, m1
    pavgw           m0, m15
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             hq, 2*2
    jg .w16_loop
    RET
.w32:
    pmovzxwd        m5, [tlq+hq+ 2]
    pmovzxwd        m6, [tlq+hq+34]
    mova            m7, [base+smooth_weights_2d_16bpc+32*4]
    mova            m8, [base+smooth_weights_2d_16bpc+32*6]
    vpblendmw       m5{k1}, m0, m5 ; top, bottom
    vpblendmw       m6{k1}, m0, m6
.w32_loop:
    vpbroadcastd    m2, [v_weightsq+0]
    vpbroadcastd    m3, [v_weightsq+4]
    pmaddwd         m0, m5, m2
    pmaddwd         m2, m6
    pmaddwd         m1, m5, m3
    pmaddwd         m3, m6
    mova            m4, m13
    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
    vpdpwssd        m0, m4, m7
    vpdpwssd        m2, m4, m8
    mova            m4, m13
    vpbroadcastw    m4{k1}, [tlq+hq-4]
    vpdpwssd        m1, m4, m7
    vpdpwssd        m3, m4, m8
    add             v_weightsq, 2*4
    vpermt2b        m0, m14, m2
    vpermt2b        m1, m14, m3
    pavgw           m0, m15
    pavgw           m1, m15
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    lea             dstq, [dstq+strideq*2]
    sub             hq, 2*2
    jg .w32_loop
    RET
.w64:
    pmovzxwd        m5, [tlq+hq+ 2]
    pmovzxwd        m6, [tlq+hq+34]
    pmovzxwd        m7, [tlq+hq+66]
    pmovzxwd        m8, [tlq+hq+98]
    mova            m9, [base+smooth_weights_2d_16bpc+64*4]
    vpblendmw       m5{k1}, m0, m5 ; top, bottom
    mova            m10, [base+smooth_weights_2d_16bpc+64*5]
    vpblendmw       m6{k1}, m0, m6
    mova            m11, [base+smooth_weights_2d_16bpc+64*6]
    vpblendmw       m7{k1}, m0, m7
    mova            m12, [base+smooth_weights_2d_16bpc+64*7]
    vpblendmw       m8{k1}, m0, m8
.w64_loop:
    vpbroadcastd    m3, [v_weightsq]
    mova            m4, m13
    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
    pmaddwd         m0, m5, m3
    pmaddwd         m2, m6, m3
    pmaddwd         m1, m7, m3
    pmaddwd         m3, m8
    vpdpwssd        m0, m4, m9
    vpdpwssd        m2, m4, m10
    vpdpwssd        m1, m4, m11
    vpdpwssd        m3, m4, m12
    add             v_weightsq, 1*4
    vpermt2b        m0, m14, m2
    vpermt2b        m1, m14, m3
    pavgw           m0, m15
    pavgw           m1, m15
    mova            [dstq+64*0], m0
    mova            [dstq+64*1], m1
    add             dstq, strideq
    sub             hd, 1*2
    jg .w64_loop
    RET

%if WIN64
    DECLARE_REG_TMP 4
%else
    DECLARE_REG_TMP 8
%endif

cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
    lea             r7, [z_filter_t0]
    tzcnt           wd, wm
    movifnidn       angled, anglem
    lea             t0, [dr_intra_derivative]
    movsxd          wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4]
    add             tlq, 2
    mov             dxd, angled
    and             dxd, 0x7e
    add             angled, 165 ; ~90
    movzx           dxd, word [t0+dxq]
    lea             wq, [base+ipred_z1_16bpc_avx512icl_table+wq]
    movifnidn       hd, hm
    xor             angled, 0x4ff ; d = 90 - angle
    vpbroadcastd    m15, [base+pw_31806]
    jmp             wq
.w4:
    vpbroadcastw    m5, [tlq+14]
    vinserti32x4    m5, [tlq], 0
    cmp             angleb, 40
    jae .w4_no_upsample
    lea             r3d, [angleq-1024]
    sar             r3d, 7
    add             r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    call .upsample_top
    vpbroadcastq    m0, [base+z_xpos_off1b]
    jmp .w4_main2
.w4_no_upsample:
    test            angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea             r3d, [hq+3]
    vpbroadcastb    xm0, r3d
    vpbroadcastb    xm1, angled
    shr             angled, 8 ; is_sm << 1
    vpcmpeqb        k1, xm0, [base+z_filter_wh]
    vpcmpgtb        k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw           r5d, k1
    test            r5d, r5d
    jz .w4_main
    call .w16_filter
    mov             r2d, 9
    cmp             hd, 4
    cmovne          r3d, r2d
    vpbroadcastw    m6, r3d
    pminuw          m6, [base+pw_0to31]
    vpermw          m5, m6, m5
.w4_main:
    vpbroadcastq    m0, [base+z_xpos_off1a]
.w4_main2:
    movsldup        m3, [base+z_xpos_mul]
    vpbroadcastw    m4, dxd
    lea             r2, [strideq*3]
    pmullw          m3, m4
    vshufi32x4      m6, m5, m5, q3321
    psllw           m4, 3 ; dx*8
    paddsw          m3, m0 ; xpos
    palignr         m6, m5, 2 ; top+1
.w4_loop:
    psrlw           m1, m3, 6 ; base_x
    pand            m2, m15, m3 ; frac
    vpermw          m0, m1, m5 ; top[base_x]
    vpermw          m1, m1, m6 ; top[base_x+1]
    psllw           m2, 9
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m0, m1
    vextracti32x4   xm1, ym0, 1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm1
    movhps          [dstq+r2       ], xm1
    sub             hd, 8
    jl .w4_end
    vextracti32x4   xm1, m0, 2
    paddsw          m3, m4 ; xpos += dx
    lea             dstq, [dstq+strideq*4]
    vextracti32x4   xm0, m0, 3
    movq            [dstq+strideq*0], xm1
    movhps          [dstq+strideq*1], xm1
    movq            [dstq+strideq*2], xm0
    movhps          [dstq+r2       ], xm0
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
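; Edge upsampling (small blocks at sharp angles): doubles the edge resolution
; with the AV1 (-1 9 9 -1)/16 interpolation filter, clamped to
; [0, pixel_max]; dx is doubled to match the finer grid.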
.upsample_top:
    vinserti32x4    m5, [tlq-16], 3
    mova            m3, [base+z_upsample]
    vpbroadcastd    m4, [base+pd_65536]
    add             dxd, dxd
    vpermw          m0, m3, m5
    paddw           m3, m4
    vpermw          m1, m3, m5
    paddw           m3, m4
    vpermw          m2, m3, m5
    paddw           m3, m4
    vpermw          m3, m3, m5
    vpbroadcastw    m5, r9m ; pixel_max
    paddw           m1, m2 ; b+c
    paddw           m0, m3 ; a+d
    psubw           m0, m1, m0
    psraw           m0, 3
    pxor            m2, m2
    paddw           m0, m1
    pmaxsw          m0, m2
    pavgw           m0, m2
    pminsw          m5, m0
    ret
.w8:
    lea             r3d, [angleq+216]
    movu            ym5, [tlq]
    mov             r3b, hb
    movu            m10, [base+pw_0to31]
    cmp             r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea             r3d, [hq+7]
    vpbroadcastw    m6, r3d
    add             r3d, r3d
    pminuw          m6, m10
    vpermw          m5, m6, m5
    call .upsample_top
    vbroadcasti32x4 m0, [base+z_xpos_off1b]
    jmp .w8_main2
.w8_no_upsample:
    lea             r3d, [hq+7]
    vpbroadcastb    ym0, r3d
    and             r3d, 7
    or              r3d, 8 ; imin(h+7, 15)
    vpbroadcastw    m6, r3d
    pminuw          m6, m10
    vpermw          m5, m6, m5
    test            angled, 0x400
    jnz .w8_main
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym0, [base+z_filter_wh]
    mova            xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb        k1{k1}, ym1, ym0
    kmovd           r5d, k1
    test            r5d, r5d
    jz .w8_main
    call .w16_filter
    cmp             hd, r3d
    jl .w8_filter_end
    pminud          m6, m10, [base+pw_17] {1to16}
    add             r3d, 2
.w8_filter_end:
    vpermw          m5, m6, m5
.w8_main:
    vbroadcasti32x4 m0, [base+z_xpos_off1a]
.w8_main2:
    movshdup        m3, [base+z_xpos_mul]
    vpbroadcastw    m4, dxd
    shl             r3d, 6
    lea             r2, [strideq*3]
    pmullw          m3, m4
    vshufi32x4      m6, m5, m5, q3321
    sub             r3d, dxd
    psllw           m4, 2 ; dx*4
    shl             dxd, 2
    paddsw          m3, m0 ; xpos
    palignr         m6, m5, 2 ; top+1
.w8_loop:
    psrlw           m1, m3, 6 ; base_x
    pand            m2, m15, m3 ; frac
    vpermw          m0, m1, m5 ; top[base_x]
    vpermw          m1, m1, m6 ; top[base_x+1]
    psllw           m2, 9
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m0, m1
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    vextracti32x4   [dstq+r2       ], m0, 3
    sub             hd, 4
    jz .w8_end
    paddsw          m3, m4 ; xpos += dx
    lea             dstq, [dstq+strideq*4]
    sub             r3d, dxd
    jg .w8_loop
    vextracti32x4   xm5, m5, 3
.w8_end_loop:
    mova            [dstq+strideq*0], xm5
    mova            [dstq+strideq*1], xm5
    mova            [dstq+strideq*2], xm5
    mova            [dstq+r2       ], xm5
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_end_loop
.w8_end:
    RET
.w16_filter:
    vpbroadcastw    m1, [tlq-2]
    popcnt          r5d, r5d
    valignq         m3, m6, m5, 2
    vpbroadcastd    m7, [base+z_filter_k+(r5-1)*4+12*0]
    valignq         m1, m5, m1, 6
    vpbroadcastd    m8, [base+z_filter_k+(r5-1)*4+12*1]
    palignr         m2, m3, m5, 2
    vpbroadcastd    m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr         m0, m5, m1, 14
    pmullw          m7, m5
    palignr         m3, m5, 4
    paddw           m0, m2
    palignr         m5, m1, 12
    pmullw          m0, m8
    paddw           m5, m3
    pmullw          m5, m9
    pxor            m1, m1
    paddw           m0, m7
    paddw           m5, m0
    psrlw           m5, 3
    pavgw           m5, m1
    ret
.w16:
    lea             r3d, [hq+15]
    vpbroadcastb    ym0, r3d
    and             r3d, 15
    or              r3d, 16 ; imin(h+15, 31)
    vpbroadcastw    m11, r3d
    pminuw          m10, m11, [base+pw_0to31]
    vpbroadcastw    m6, [tlq+r3*2]
    vpermw          m5, m10, [tlq]
    test            angled, 0x400
    jnz .w16_main
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym0, [base+z_filter_wh]
    mova            xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb        k1{k1}, ym1, ym0
    kmovd           r5d, k1
    test            r5d, r5d
    jz .w16_main
    call .w16_filter
    cmp             hd, 16
    jg .w16_filter_h32
    vpermw          m6, m11, m5
    vpermw          m5, m10, m5
    jmp .w16_main
.w16_filter_h32:
    movzx           r3d, word [tlq+62]
    movzx           r2d, word [tlq+60]
    lea             r2d, [r2+r3*8+4]
    sub             r2d, r3d
    mov             r3d, 1
    shr             r2d, 3
    kmovb           k1, r3d
    movd            xm0, r2d
    or              r3d, 32
    vmovdqu16       m6{k1}, m0
.w16_main:
    rorx            r2d, dxd, 23
    mov             r7, rsp
    and             rsp, ~63
    vpbroadcastw    m3, r2d
    sub             rsp, 64*2
    mov             r2d, dxd
    paddw           m4, m3, m3
    mova            [rsp+64*0], m5
    vinserti32x8    m3, ym4, 1
    mova            [rsp+64*1], m6
    shl             r3d, 6
.w16_loop:
    lea             r5d, [r2+dxq]
    shr             r2d, 6
    movu            ym0, [rsp+r2*2]
    movu            ym1, [rsp+r2*2+2]
    lea             r2d, [r5+dxq]
    shr             r5d, 6
    vinserti32x8    m0, [rsp+r5*2], 1
    vinserti32x8    m1, [rsp+r5*2+2], 1
    pand            m2, m15, m3 ; frac << 9
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m0, m1
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    sub             hd, 2
    jz .w16_end
    paddw           m3, m4
    lea             dstq, [dstq+strideq*2]
    cmp             r2d, r3d
    jl .w16_loop
    punpckhqdq      ym6, ym6
.w16_end_loop:
    mova            [dstq+strideq*0], ym6
    mova            [dstq+strideq*1], ym6
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w16_end_loop
.w16_end:
    mov             rsp, r7
    RET
.w32:
    lea             r3d, [hq+31]
    movu            m7, [tlq+64*0]
    and             r3d, 31
    vpbroadcastw    m11, r3d
    or              r3d, 32 ; imin(h+31, 63)
    pminuw          m10, m11, [base+pw_0to31]
    vpbroadcastw    m9, [tlq+r3*2]
    vpermw          m8, m10, [tlq+64*1]
    test            angled, 0x400
    jnz .w32_main
    vpbroadcastd    m5, [base+pw_3]
    mov             r5d, ~1
    movu            m3, [tlq-2]
    kmovd           k1, r5d
    valignq         m2, m8, m7, 6
    paddw           m7, m3
    vmovdqu16       m3{k1}, [tlq-4]
    valignq         m4, m9, m8, 2
    paddw           m3, m5
    paddw           m7, [tlq+2]
    palignr         m1, m8, m2, 14
    pavgw           m3, [tlq+4]
    palignr         m2, m8, m2, 12
    paddw           m7, m3
    palignr         m3, m4, m8, 2
    psrlw           m7, 2
    palignr         m4, m8, 4
    paddw           m8, m1
    paddw           m2, m5
    paddw           m8, m3
    pavgw           m2, m4
    paddw           m8, m2
    psrlw           m8, 2
    cmp             hd, 64
    je .w32_filter_h64
    vpermw          m9, m11, m8
    vpermw          m8, m10, m8
    jmp .w32_main
.w32_filter_h64:
    movzx           r3d, word [tlq+126]
    movzx           r2d, word [tlq+124]
    lea             r2d, [r2+r3*8+4]
    sub             r2d, r3d
    mov             r3d, 65
    shr             r2d, 3
    movd            xm0, r2d
    vpblendmw       m9{k1}, m0, m9
.w32_main:
    rorx            r2d, dxd, 23
    mov             r7, rsp
    and             rsp, ~63
    vpbroadcastw    m5, r2d
    sub             rsp, 64*4
    mov             r2d, dxd
    mova            [rsp+64*0], m7
    shl             r3d, 6
    mova            [rsp+64*1], m8
    mova            m6, m5
    mova            [rsp+64*2], m9
    punpckhqdq      m9, m9
    mova            [rsp+64*3], ym9
.w32_loop:
    lea             r5d, [r2+dxq]
    shr             r2d, 6
    movu            m0, [rsp+r2*2]
    movu            m2, [rsp+r2*2+2]
    lea             r2d, [r5+dxq]
    shr             r5d, 6
    movu            m1, [rsp+r5*2]
    movu            m3, [rsp+r5*2+2]
    pand            m4, m15, m5
    paddw           m5, m6
    psubw           m2, m0
    pmulhrsw        m2, m4
    pand            m4, m15, m5
    psubw           m3, m1
    pmulhrsw        m3, m4
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    sub             hd, 2
    jz .w32_end
    paddw           m5, m6
    lea             dstq, [dstq+strideq*2]
    cmp             r2d, r3d
    jl .w32_loop
.w32_end_loop:
    mova            [dstq+strideq*0], m9
    mova            [dstq+strideq*1], m9
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_end_loop
.w32_end:
    mov             rsp, r7
    RET
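; Filters top[0..63] in place (m7/m8) and returns the filtered-but-unshifted
; sum for top[64..95] in m0; the callers shift and permute it as needed.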
.w64_filter96:
    vpbroadcastd    m4, [base+pw_3]
    mov             r5d, ~1
    movu            m0, [tlq-2]
    kmovd           k1, r5d
    paddw           m7, m0
    vmovdqu16       m0{k1}, [tlq-4]
    paddw           m0, m4
    paddw           m7, [tlq+2]
    pavgw           m0, [tlq+4]
    valignq         m1, m9, m8, 6
    paddw           m8, [tlq+62]
    paddw           m2, m4, [tlq+60]
    valignq         m3, m10, m9, 2
    paddw           m8, [tlq+66]
    pavgw           m2, [tlq+68]
    paddw           m7, m0
    palignr         m0, m9, m1, 14
    paddw           m8, m2
    palignr         m1, m9, m1, 12
    psrlw           m7, 2
    palignr         m2, m3, m9, 2
    psrlw           m8, 2
    palignr         m3, m9, 4
    paddw           m0, m9
    paddw           m1, m4
    paddw           m0, m2
    pavgw           m1, m3
    paddw           m0, m1
    ret
.w64:
    movu            m7, [tlq+64*0]
    lea             r3d, [hq-1]
    movu            m8, [tlq+64*1]
    vpbroadcastw    m11, [tlq+r3*2+128]
    movu            m9, [tlq+64*2]
    cmp             hd, 64
    je .w64_h64
    vpbroadcastw    m13, r3d
    or              r3d, 64
    pminuw          m12, m13, [base+pw_0to31]
    mova            m10, m11
    vpermw          m9, m12, m9
    test            angled, 0x400
    jnz .w64_main
    call .w64_filter96
    psrlw           m0, 2
    vpermw          m9, m12, m0
    vpermw          m10, m13, m0
    mova            m11, m10
    jmp .w64_main
.w64_h64:
    movu            m10, [tlq+64*3]
    or              r3d, 64
    test            angled, 0x400
    jnz .w64_main
    call .w64_filter96
    valignq         m1, m10, m9, 6
    valignq         m3, m11, m10, 2
    vpbroadcastd    m11, [base+pw_63]
    psrlw           m9, m0, 2
    palignr         m0, m10, m1, 14
    palignr         m1, m10, m1, 12
    palignr         m2, m3, m10, 2
    palignr         m3, m10, 4
    paddw           m10, m0
    paddw           m1, m4
    paddw           m10, m2
    pavgw           m1, m3
    paddw           m10, m1
    psrlw           m10, 2
    vpermw          m11, m11, m10
.w64_main:
    rorx            r2d, dxd, 23
    mov             r7, rsp
    and             rsp, ~63
    vpbroadcastw    m5, r2d
    sub             rsp, 64*6
    mova            [rsp+64*0], m7
    mov             r2d, dxd
    mova            [rsp+64*1], m8
    lea             r5, [rsp+r3*2]
    mova            [rsp+64*2], m9
    shl             r3d, 6
    mova            [rsp+64*3], m10
    sub             r2, r3
    mova            [rsp+64*4], m11
    mova            m6, m5
    mova            [rsp+64*5], m11
.w64_loop:
    mov             r3, r2
    sar             r3, 6
    movu            m0, [r5+r3*2+64*0]
    movu            m2, [r5+r3*2+64*0+2]
    movu            m1, [r5+r3*2+64*1]
    movu            m3, [r5+r3*2+64*1+2]
    pand            m4, m15, m5
    psubw           m2, m0
    pmulhrsw        m2, m4
    psubw           m3, m1
    pmulhrsw        m3, m4
    paddw           m0, m2
    paddw           m1, m3
    mova            [dstq+64*0], m0
    mova            [dstq+64*1], m1
    dec             hd
    jz .w64_end
    paddw           m5, m6
    add             dstq, strideq
    add             r2, dxq
    jl .w64_loop
.w64_end_loop:
    mova            [dstq+64*0], m11
    mova            [dstq+64*1], m11
    add             dstq, strideq
    dec             hd
    jg .w64_end_loop
.w64_end:
    mov             rsp, r7
    RET

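; Directional prediction, zone 2 (90-180 degrees): both edges contribute.
; xpos runs negative once a column crosses the top-left corner; the sign
; bits (vpmovw2m) then switch those lanes to the left-edge base_y path.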
cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy
    tzcnt           wd, wm
    movifnidn       angled, anglem
    lea             dxq, [dr_intra_derivative-90]
    movzx           dyd, angleb
    xor             angled, 0x400
    mov             r7, dxq
    sub             dxq, dyq
    movifnidn       hd, hm
    and             dyd, ~1
    vpbroadcastw    m12, [tlq]
    and             dxq, ~1
    movzx           dyd, word [r7+dyq] ; angle - 90
    lea             r7, [z_filter_t0]
    movzx           dxd, word [dxq+270] ; 180 - angle
    mova            m0, [base+pw_31to0]
    movsxd          wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4]
    movu            m4, [tlq+2]
    neg             dyd
    vpermw          m7, m0, [tlq-64*1]
    lea             wq, [base+ipred_z2_16bpc_avx512icl_table+wq]
    vpbroadcastd    m14, [base+pw_31806]
    vpbroadcastd    m15, [base+pw_1]
    jmp             wq
.w4:
    movq            xm3, [tlq]
    vpbroadcastq    m8, [base+pw_1to32]
    test            angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea             r3d, [hq+2]
    add             angled, 1022
    shl             r3d, 6
    test            r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    pshuflw         xm0, xm4, q3321
    sub             angled, 1075 ; angle - 53
    lea             r3d, [hq+3]
    call .upsample_above
    punpcklwd       xm4, xm3, xm4
    palignr         xm3, xm4, xm12, 14
    jmp .w4_main
.w4_upsample_left:
    call .upsample_left
    movsldup        m1, [base+z_xpos_mul]
    paddw           m1, m1
    jmp .w4_main2
.w4_no_upsample_above:
    lea             r3d, [hq+3]
    vpbroadcastd    ym0, [base+pw_3]
    sub             angled, 1112 ; angle - 90
    call .filter_above2
    lea             r3d, [hq+2]
    add             angled, 973 ; angle + 883
    palignr         xm3, xm4, xm12, 14
    shl             r3d, 6
    test            r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    call .filter_left16
.w4_main:
    movsldup        m1, [base+z_xpos_mul]
    psllw           m15, 3
.w4_main2:
    vpbroadcastq    m0, [base+pw_1to32]
    vpbroadcastw    m11, dxd
    movsldup        m2, [base+z_xpos_mul]
    vpbroadcastw    m13, dyd
    vpbroadcastd    m5, [tlq-2]
    psllw           m10, m8, 6
    valignq         m5, m7, m5, 6
    pmullw          m2, m11
    psubw           m10, m2 ; xpos
    pmullw          m13, m0 ; ypos
    palignr         m5, m7, m5, 14
    psrlw           m12, m13, 6
    psllw           m13, 9
    paddw           m12, m1 ; base_y
    pand            m13, m14 ; frac_y << 9
    psllw           m11, 3
    lea             r5, [strideq*3]
.w4_loop:
    psrlw           m1, m10, 6 ; base_x
    pand            m2, m14, m10 ; frac
    vpermw          m0, m1, m3 ; top[base_x]
    vpermw          m1, m1, m4 ; top[base_x+1]
    vpmovw2m        k1, m10 ; base_x < 0
    psllw           m2, 9
    vpermw          m0{k1}, m12, m5 ; left[base_y]
    vpermw          m1{k1}, m12, m7 ; left[base_y+1]
    vmovdqu16       m2{k1}, m13
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m0, m1
    vextracti32x4   xm1, ym0, 1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm1
    movhps          [dstq+r5       ], xm1
    sub             hd, 8
    jl .w4_end
    vextracti32x8   ym0, m0, 1
    psubw           m10, m11 ; base_x -= dx
    lea             dstq, [dstq+strideq*4]
    paddw           m12, m15 ; base_y++
    vextracti32x4   xm1, ym0, 1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm1
    movhps          [dstq+r5       ], xm1
    lea             dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
.upsample_above: ; w4/w8
    mova            ym9, [base+pw_1to32]
    palignr         xm1, xm4, xm12, 12
    paddw           xm3, xm4 ; b+c
    xor             angled, 0x7f ; 180 - angle
    paddw           xm0, xm1 ; a+d
    vpbroadcastw    xm1, r9m ; pixel_max
    vpbroadcastb    xm11, r3d
    psubw           xm0, xm3, xm0
    vpbroadcastb    xm2, angled
    psraw           xm0, 3
    shr             angled, 8
    paddw           xm3, xm0
    pxor            xm0, xm0
    vpcmpeqb        k2, xm11, [base+z_filter_wh]
    pmaxsw          xm3, xm0
    add             dxd, dxd
    pavgw           xm3, xm0
    vpcmpgtb        k2{k2}, xm2, [base+z_filter_t0+angleq*8]
    pminsw          xm3, xm1
    paddw           m8, m8
    jmp .filter_left16b
.upsample_left: ; h4/h8
    lea             r3d, [hq-1]
    palignr         xm2, xm7, xm12, 14
    vpbroadcastw    xm0, r3d
    palignr         xm1, xm7, xm12, 12
    pminuw          xm0, xm9
    paddw           xm2, xm7 ; b+c
    vpermw          xm0, xm0, xm7
    add             dyd, dyd
    paddw           xm0, xm1 ; a+d
    vpbroadcastw    xm1, r9m ; pixel_max
    psubw           xm0, xm2, xm0
    psraw           xm0, 3
    paddw           xm2, xm0
    pxor            xm0, xm0
    pmaxsw          xm2, xm0
    pavgw           xm2, xm0
    pminsw          xm2, xm1
    punpckhwd       xm0, xm2, xm7
    punpcklwd       xm7, xm2, xm7
    vinserti32x4    ym7, xm0, 1
    ret
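; Edge filter helpers for z2. max_w (r7m) and max_h (r8m) bound how many
; edge pixels actually exist; the vpcmpgtw write masks restrict the filtered
; results to those pixels and leave out-of-range lanes untouched.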
.filter_above:
    sub             angled, 90
.filter_above2:
    vpbroadcastb    ym1, r3d
    vpbroadcastb    ym10, angled
    mov             r3d, angled
    shr             r3d, 8
    vpcmpeqb        k2, ym1, [base+z_filter_wh]
    mova            xm11, [base+z_filter_t0+r3*8]
    vpcmpgtb        k1{k2}, ym10, ym11
    mova            m9, [base+pw_1to32]
    kmovd           r3d, k1
    test            r3d, r3d
    jz .filter_end
    pminuw          ym0, ym9
    popcnt          r3d, r3d
    vpbroadcastd    ym6, r7m ; max_w
    kxnorw          k1, k1, k1
    vpbroadcastd    ym5, [base+z_filter_k+(r3-1)*4+12*0]
    kaddw           k1, k1, k1 ; ~1
    vpbroadcastd    ym13, [base+z_filter_k+(r3-1)*4+12*1]
    vpermw          ym2, ym0, ym4 ; +1
    pmullw          ym5, ym4
    paddw           ym1, ym2, ym3
    vmovdqu16       m3{k1}, [tlq-2] ; -2
    vpermw          ym2, ym0, ym2 ; +2
    vpbroadcastd    ym0, [base+z_filter_k+(r3-1)*4+12*2]
    pmullw          ym1, ym13
    movu            m13, [base+pw_0to31]
    paddw           ym2, ym3
    packssdw        ym6, ym6
    pmullw          ym2, ym0
    paddw           ym1, ym5
    vpcmpgtw        k1, ym6, ym13
    paddw           ym1, ym2
    pxor            ym2, ym2
    psrlw           ym1, 3
    pavgw           ym4{k1}, ym1, ym2
.filter_end:
    ret
.filter_left16:
    vpbroadcastd    ym1, [base+pb_90]
    psubb           ym1, ym10
    vpcmpgtb        k2{k2}, ym1, ym11
.filter_left16b:
    kmovd           r3d, k2
    test            r3d, r3d
    jz .filter_end
    lea             r5d, [hq-1]
    vinserti32x4    ym0, ym12, xm7, 1
    vpbroadcastw    ym1, r5d
    popcnt          r3d, r3d
    vpbroadcastd    ym6, r8m ; max_h
    pminuw          ym9, ym1
    vpbroadcastd    ym5, [base+z_filter_k+(r3-1)*4+12*0]
    vpermw          ym2, ym9, ym7 ; +1
    vpbroadcastd    ym10, [base+z_filter_k+(r3-1)*4+12*1]
    palignr         ym1, ym7, ym0, 14 ; -1
    pmullw          ym5, ym7
    palignr         ym0, ym7, ym0, 12 ; -2
    paddw           ym1, ym2
    vpermw          ym2, ym9, ym2 ; +2
    vpbroadcastd    ym9, [base+z_filter_k+(r3-1)*4+12*2]
    pmullw          ym1, ym10
    paddw           ym2, ym0
    packssdw        ym6, ym6
    pmullw          ym2, ym9
    paddw           ym1, ym5
    vpcmpgtw        k1, ym6, [base+pw_0to31]
    paddw           ym1, ym2
    pxor            ym2, ym2
    psrlw           ym1, 3
    pavgw           ym7{k1}, ym1, ym2
    ret
.filter_left:
    cmp             hd, 32
    jl .filter_left16
    vpbroadcastd    m5, [base+pw_3]
    pminud          m0, m9, [base+pw_31] {1to16}
.filter_left32:
    vpbroadcastd    m6, r8m ; max_h
    valignq         m2, m7, m12, 6
    packssdw        m6, m6
    palignr         m1, m7, m2, 14 ; -1
    paddw           m1, m7
    palignr         m2, m7, m2, 12 ; -2
    vpcmpgtw        k1, m6, m13
    paddw           m2, m5
    cmp             hd, 64
    je .filter_left64
    lea             r3d, [hq-1]
    vpbroadcastw    m10, r3d
    pminuw          m0, m10
    vpermw          m10, m0, m7 ; +1
    paddw           m1, m10
    vpermw          m10, m0, m10 ; +2
    pavgw           m2, m10
    paddw           m1, m2
    vpsrlw          m7{k1}, m1, 2
    ret
.filter_left64:
    valignq         m10, m8, m7, 2
    vpaddd          m13, [base+pw_32] {1to16}
    palignr         m11, m10, m7, 2 ; +1
    paddw           m1, m11
    palignr         m11, m10, m7, 4 ; +2
    valignq         m10, m8, m7, 6
    pavgw           m11, m2
    vpermw          m2, m0, m8 ; 32+1
    paddw           m1, m11
    vpsrlw          m7{k1}, m1, 2
    palignr         m1, m8, m10, 14 ; 32-1
    paddw           m1, m8
    palignr         m10, m8, m10, 12 ; 32-2
    paddw           m1, m2
    vpermw          m2, m0, m2 ; 32+2
    paddw           m10, m5
    vpcmpgtw        k1, m6, m13
    pavgw           m2, m10
    paddw           m1, m2
    vpsrlw          m8{k1}, m1, 2
    ret
.w8:
    mova            xm3, [tlq]
    vbroadcasti32x4 m8, [base+pw_1to32]
    test            angled, 0x400
    jnz .w8_main
    lea             r3d, [angleq+126]
    mov             r3b, hb
    cmp             r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    psrldq          xm0, xm4, 2
    sub             angled, 53
    pshufhw         xm0, xm0, q2210
    lea             r3d, [hq+7]
    call .upsample_above
    punpcklwd       xm0, xm3, xm4
    punpckhwd       xm4, xm3, xm4
    vinserti32x4    ym3, ym12, xm0, 1
    vinserti32x4    ym4, ym0, xm4, 1
    palignr         ym3, ym4, ym3, 14
    jmp .w8_main
.w8_upsample_left:
    call .upsample_left
    movshdup        m1, [base+z_xpos_mul]
    psllw           m15, 3
    paddw           m1, m1
    jmp .w8_main2
.w8_no_upsample_above:
    lea             r3d, [hq+7]
    vpbroadcastd    ym0, [base+pw_7]
    call .filter_above
    lea             r3d, [angleq-51]
    mov             r3b, hb
    palignr         xm3, xm4, xm12, 14
    cmp             r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    call .filter_left
.w8_main:
    movshdup        m1, [base+z_xpos_mul]
    psllw           m15, 2
.w8_main2:
    vbroadcasti32x4 m0, [base+pw_1to32]
    vpbroadcastw    m11, dxd
    movshdup        m2, [base+z_xpos_mul]
    vpbroadcastw    m13, dyd
    psllw           m10, m8, 6
    valignq         m5, m7, m12, 6
    pmullw          m2, m11
    psubw           m10, m2 ; xpos
    pmullw          m13, m0 ; ypos
    palignr         m5, m7, m5, 14
    psrlw           m12, m13, 6
    psllw           m13, 9
    mov             r2d, 1<<6
    paddw           m12, m1 ; base_y
    lea             r3d, [dxq-(8<<6)] ; left-only threshold
    pand            m13, m14 ; frac_y << 9
    shl             dxd, 2
    psllw           m11, 2
    lea             r5, [strideq*3]
.w8_loop:
    psrlw           m1, m10, 6
    pand            m2, m14, m10
    vpermw          m0, m1, m3
    vpermw          m1, m1, m4
    psllw           m2, 9
    sub             r2d, dxd
    jge .w8_toponly
    vpmovw2m        k1, m10
    vpermw          m0{k1}, m12, m5
    vpermw          m1{k1}, m12, m7
    vmovdqu16       m2{k1}, m13
.w8_toponly:
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m0, m1
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    vextracti32x4   [dstq+r5       ], m0, 3
    sub             hd, 4
    jz .w8_end
    psubw           m10, m11 ; base_x -= dx
    lea             dstq, [dstq+strideq*4]
    paddw           m12, m15 ; base_y++
    cmp             r2d, r3d
    jge .w8_loop
.w8_leftonly_loop:
    vpermw          m0, m12, m5
    vpermw          m1, m12, m7
    psubw           m1, m0
    pmulhrsw        m1, m13
    paddw           m12, m15
    paddw           m0, m1
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    vextracti32x4   [dstq+r5       ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8_leftonly_loop
.w8_end:
    RET
.w16:
    mova            ym3, [tlq]
    vpermw          m8, m0, [tlq-64*2]
    test            angled, 0x400
    jnz .w16_main
    lea             r3d, [hq+15]
    vpbroadcastd    ym0, [base+pw_15]
    call .filter_above
    call .filter_left
    vinserti32x4    ym3, ym12, xm4, 1
    palignr         ym3, ym4, ym3, 14
.w16_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw    m11, dxd
    vpbroadcastw    m13, dyd
    kxnorw          k2, k2, k2
    psllw           m10, m0, 6
    valignq         m5, m7, m12, 6
    psubw           m10, m11 ; xpos
    valignq         m6, m8, m7, 6
    pmullw          m13, m0 ; ypos
    knotd           k1, k2
    palignr         m5, m7, m5, 14
    palignr         m6, m8, m6, 14
    vpsubw          m10{k1}, m11
    psrlw           m12, m13, 6
    psllw           m13, 9
    mov             r2d, 1<<6
    vpsubw          m12{k2}, m15 ; base_y
    pand            m13, m14 ; frac_y << 9
    lea             r3d, [dxq-(16<<6)]
    paddw           m11, m11
    add             dxd, dxd
    paddw           m15, m15
.w16_loop:
    psrlw           m1, m10, 6
    pand            m2, m14, m10
    vpermw          m0, m1, m3
    vpermw          m1, m1, m4
    psllw           m2, 9
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m12, m15 ; base_y++
    paddw           m0, m1
    sub             r2d, dxd
    jge .w16_toponly
    mova            m1, m5
    vpermt2w        m1, m12, m6
    mova            m2, m7
    vpermt2w        m2, m12, m8
    vpmovw2m        k1, m10
    psubw           m2, m1
    pmulhrsw        m2, m13
    vpaddw          m0{k1}, m1, m2
.w16_toponly:
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    sub             hd, 2
    jz .w16_end
    psubw           m10, m11 ; base_x -= dx
    lea             dstq, [dstq+strideq*2]
    cmp             r2d, r3d
    jge .w16_loop
    paddw           m12, m15
    vpermt2w        m5, m12, m6
    mova            m1, m7
    vpermt2w        m1, m12, m8
    jmp .w16_leftonly_loop_start
.w16_leftonly_loop:
    mova            m1, m7
    vpermt2w        m1, m12, m8
    vshufi32x4      m5, m1, q1032
.w16_leftonly_loop_start:
    psubw           m0, m1, m5
    pmulhrsw        m0, m13
    paddw           m12, m15
    paddw           m0, m5
    mova            m5, m1
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w16_leftonly_loop
.w16_end:
    RET
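; For w32/w64 the (filtered) top edge is staged in a stack buffer, with the
; top-left pixel placed just below it at [rsp+60], so that base_x lookups
; become plain unaligned loads instead of register permutes.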
.w32:
    mova            m3, [tlq]
    vpermw          m8, m0, [tlq-64*2]
    mova            m9, [base+pw_1to32]
    test            angled, 0x400
    jnz .w32_main
    pminud          m0, m9, [base+pw_31] {1to16}
    mov             r3d, ~1
    kmovd           k1, r3d
    vpbroadcastd    m5, [base+pw_3]
    vpbroadcastd    m6, r6m ; max_w
    vpermw          m2, m0, m4 ; +1
    movu            m13, [base+pw_0to31]
    paddw           m1, m4, m3
    vmovdqu16       m3{k1}, [tlq-2] ; -2
    packssdw        m6, m6
    paddw           m1, m2
    vpermw          m2, m0, m2 ; +2
    paddw           m3, m5
    vpcmpgtw        k1, m6, m13
    pavgw           m2, m3
    paddw           m1, m2
    psrlw           m4{k1}, m1, 2
    call .filter_left32
.w32_main:
    sub             rsp, 64*2
    call .w32_main1
    add             rsp, 64*2
    RET
.w32_main1:
    vpbroadcastw    m11, dxd
    movu            [rsp+64], m4
    vpbroadcastw    m4, dyd
    movd            [rsp+60], xm12
    valignq         m5, m7, m12, 6
    psllw           m3, m9, 6 ; xpos
    valignq         m6, m8, m7, 6
    pmullw          m9, m4 ; ypos
    palignr         m5, m7, m5, 14
    mov             r2d, 33<<6
    palignr         m6, m8, m6, 14
    mova            m10, m3
.w32_main2:
    psllw           m13, m9, 9
    sub             r2d, dxd
    psrlw           m12, m9, 6 ; base_y
    mov             r8d, hd
    pand            m13, m14 ; frac_y << 9
.w32_loop:
    mov             r3d, r2d
    shr             r3d, 6
    psubw           m10, m11 ; base_x -= dx
    movu            m0, [rsp+r3*2-2]
    pand            m2, m10, m14 ; frac_x
    movu            m1, [rsp+r3*2]
    psllw           m2, 9
    psubw           m1, m0
    pmulhrsw        m1, m2
    paddw           m12, m15 ; base_y++
    paddw           m0, m1
    cmp             r2d, 32<<6
    jge .w32_toponly
    mova            m1, m5
    vpermt2w        m1, m12, m6
    mova            m2, m7
    vpermt2w        m2, m12, m8
    vpmovw2m        k1, m10
    psubw           m2, m1
    pmulhrsw        m2, m13
    vpaddw          m0{k1}, m1, m2
.w32_toponly:
    mova            [dstq], m0
    dec             r8d
    jz .w32_end
    add             dstq, strideq
    sub             r2d, dxd
    jge .w32_loop
    paddw           m12, m15
    mova            m2, m5
    vpermt2w        m2, m12, m6
.w32_leftonly_loop:
    mova            m1, m7
    vpermt2w        m1, m12, m8
    psubw           m0, m1, m2
    pmulhrsw        m0, m13
    paddw           m12, m15
    paddw           m0, m2
    mova            m2, m1
    mova            [dstq], m0
    add             dstq, strideq
    dec             r8d
    jg .w32_leftonly_loop
.w32_end:
    ret
.w64:
    movu            m3, [tlq+66]
    vpermw          m8, m0, [tlq-64*2]
    mova            m9, [base+pw_1to32]
    test            angled, 0x400
    jnz .w64_main
    mova            m2, [tlq] ; -1
    mov             r3d, ~1
    vpbroadcastd    m5, [base+pw_3]
    kmovd           k1, r3d
    movu            m13, [base+pw_0to31]
    vpbroadcastd    m6, r6m ; max_w
    pminud          m0, m9, [base+pw_31] {1to16}
    paddw           m1, m4, m2
    vmovdqu16       m2{k1}, [tlq-2] ; -2
    packssdw        m6, m6
    paddw           m1, [tlq+4] ; +1
    paddw           m2, m5
    vpcmpgtw        k1, m6, m13
    pavgw           m2, [tlq+6] ; +2
    paddw           m1, m2
    vpermw          m2, m0, m3 ; 32+1
    psrlw           m4{k1}, m1, 2
    paddw           m1, m3, [tlq+64] ; 32-1
    vpaddd          m11, m13, [base+pw_32] {1to16}
    paddw           m1, m2
    vpermw          m2, m0, m2 ; 32+2
    paddw           m10, m5, [tlq+62] ; 32-2
    vpcmpgtw        k1, m6, m11
    pavgw           m2, m10
    paddw           m1, m2
    psrlw           m3{k1}, m1, 2
    call .filter_left32
.w64_main:
    sub             rsp, 64*3
    movu            [rsp+64*2-gprsize], m3
    mov             r5, dstq
    call .w32_main1
    psllw           m4, 5
    mov             r2d, 65<<6
    vpaddd          m10, m3, [base+pw_2048] {1to16} ; xpos
    lea             dstq, [r5+64]
    paddw           m9, m4 ; ypos
    call .w32_main2
    add             rsp, 64*3
    RET

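; Directional prediction, zone 3 (angles > 180 degrees): only the left edge
; is used. The edge is loaded reversed (pw_31to0) and walked downwards in
; dy-sized fixed-point steps, gathering pixels with vpermw.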
cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea             r7, [z_filter_t0]
    tzcnt           wd, wm
    movifnidn       angled, anglem
    lea             t0, [dr_intra_derivative+45*2-1]
    movsxd          wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
    sub             angled, 180
    mov             dyd, angled
    neg             dyd
    xor             angled, 0x400
    or              dyq, ~0x7e
    mova            m0, [base+pw_31to0]
    movzx           dyd, word [t0+dyq]
    lea             wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
    movifnidn       hd, hm
    vpbroadcastd    m14, [base+pw_31806]
    vpbroadcastd    m15, [base+pw_1]
    jmp             wq
.w4:
    lea             r3d, [hq+3]
    xor             r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw    m7, r3d
    pmaxuw          m7, m0
    vpermw          m6, m7, [tlq-64*1]
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    cmp             angleb, 40
    jae .w4_filter
    lea             r3d, [angleq-1024]
    sar             r3d, 7
    add             r3d, hd
    jg .w4_filter ; h > 8 || (h == 8 && is_sm)
    call .upsample
    movsldup        m1, [base+z_ypos_mul]
    paddw           m1, m1
    jmp .w4_main2
.w4_filter:
    lea             r3d, [hq+3]
    call .filter32
.w4_main:
    movsldup        m1, [base+z_ypos_mul]
.w4_main2:
    vpbroadcastq    m0, [base+pw_1to32]
    vpbroadcastw    m4, dyd
    lea             r2d, [hq+4]
    shr             r2d, 3
    pmullw          m4, m0 ; ypos
    vpbroadcastw    m0, r2d
    imul            r2, strideq ; stride * imax(height / 8, 1)
    pmullw          m1, m0
    lea             r3, [r2*3]
    paddd           m1, [base+pw_32736] {1to16}
    psrlw           m2, m4, 6
    psllw           m4, 9
    paddsw          m2, m1 ; base+0
    vpandd          m4, m14 ; frac << 9
    vpermw          m3, m2, m6 ; left[base+0]
.w4_loop:
    paddsw          m2, m15 ; base+1
    vpermw          m1, m2, m6 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddw           m0, m3
    movq            [dstq+r2*0], xm0
    movhps          [dstq+r2*1], xm0
    vextracti32x4   xm3, ym0, 1
    movq            [dstq+r2*2], xm3
    movhps          [dstq+r3  ], xm3
    sub             hd, 8
    jl .w4_end
    lea             r5, [dstq+r2*4]
    vextracti32x8   ym0, m0, 1
    mova            m3, m1
    movq            [r5+r2*0], xm0
    movhps          [r5+r2*1], xm0
    vextracti32x4   xm1, ym0, 1
    movq            [r5+r2*2], xm1
    movhps          [r5+r3  ], xm1
    add             dstq, strideq
    test            hd, hd
    jnz .w4_loop
.w4_end:
    RET
.upsample:
    vinserti32x4    m6, [tlq-14], 3
    mova            m3, [base+z_upsample]
    vpbroadcastd    m4, [base+pd_65536]
    add             dyd, dyd
    vpermw          m0, m3, m6
    paddw           m3, m4
    vpermw          m1, m3, m6
    paddw           m3, m4
    vpermw          m2, m3, m6
    paddw           m3, m4
    vpermw          m3, m3, m6
    vpbroadcastw    m6, r9m ; pixel_max
    paddw           m1, m2 ; b+c
    paddw           m0, m3 ; a+d
    psubw           m0, m1, m0
    psraw           m0, 3
    pxor            m2, m2
    paddw           m0, m1
    pmaxsw          m0, m2
    pavgw           m0, m2
    pminsw          m6, m0
    ret
.w8:
    mova            m6, [tlq-64*1]
    cmp             hd, 32
    je .w8_h32
    mov             r3d, 8
    cmp             hd, 4
    cmove           r3d, hd
    lea             r3d, [r3+hq-1]
    xor             r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw    m1, r3d
    vpermw          m7, m1, m6
    pmaxuw          m1, m0
    vpermw          m6, m1, m6
    test            angled, 0x400
    jnz .w8_main
    lea             r3d, [angleq+216]
    mov             r3b, hb
    cmp             r3d, 8
    ja .w8_filter ; is_sm || d >= 40 || h > 8
    call .upsample
    movshdup        m1, [base+z_ypos_mul]
    paddw           m1, m1
    call .w8_main_setup
.w8_upsample_loop:
    vpermw          m3, m2, m6 ; left[base+0]
    paddw           m2, m15 ; base+1
    vpermw          m1, m2, m6 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddw           m2, m15 ; base+2
    paddw           m0, m3
    mova            m3, m1
    mova            [dstq+r2*0], xm0
    vextracti32x4   [dstq+r2*1], ym0, 1
    vextracti32x4   [dstq+r2*2], m0, 2
    vextracti32x4   [dstq+r3  ], m0, 3
    add             dstq, strideq
    sub             hd, 4
    jg .w8_upsample_loop
    RET
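; Base indices are biased by a large constant (pw_32704 here) whose low bits
; are zero: the vpermw/vpermt2w gathers only see the low bits, while paddsw
; saturates at 32767 so that indices stepping past the end of the edge clamp
; to the last (replicated) pixel instead of wrapping.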
.w8_main_setup:
    vbroadcasti32x4 m0, [base+pw_1to32]
    vpbroadcastw    m4, dyd
    rorx            r2d, hd, 2
    pmullw          m4, m0 ; ypos
    vpbroadcastw    m0, r2d
    imul            r2, strideq ; stride * height / 4
    lea             r3, [r2*3]
    pmullw          m1, m0 ; 0 1 2 3
    paddd           m1, [base+pw_32704] {1to16}
    psrlw           m2, m4, 6
    psllw           m4, 9
    paddsw          m2, m1 ; base+0
    vpandd          m4, m14 ; frac << 9
    ret
.w8_h32:
    pmaxud          m7, m0, [base+pw_24] {1to16}
    vpermw          m6, m0, m6
    vpermw          m7, m7, [tlq-64*2]
    test            angled, 0x400
    jnz .w8_main
    call .filter64
    vpbroadcastd    m0, [base+pw_7]
    pminuw          m0, [base+pw_0to31]
    vpermw          m7, m0, m7
    jmp .w8_main
.w8_filter:
    lea             r3d, [hq+7]
    call .filter32
.w8_main:
    movshdup        m1, [base+z_ypos_mul]
    call .w8_main_setup
    mova            m3, m6
    vpermt2w        m3, m2, m7 ; left[base+0]
.w8_loop:
    paddsw          m2, m15 ; base+1
    mova            m1, m6
    vpermt2w        m1, m2, m7 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddw           m0, m3
    mova            m3, m1
    mova            [dstq+r2*0], xm0
    vextracti32x4   [dstq+r2*1], ym0, 1
    vextracti32x4   [dstq+r2*2], m0, 2
    vextracti32x4   [dstq+r3  ], m0, 3
    add             dstq, strideq
    sub             hd, 4
    jg .w8_loop
    RET
.filter32:
    vpbroadcastb    ym10, r3d
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym10, [base+z_filter_wh]
    mova            xm2, [base+z_filter_t0+angleq*8]
    vpcmpgtb        k1{k1}, ym1, ym2
    kmovd           r5d, k1
    test            r5d, r5d
    jz .filter32_end
    vpbroadcastw    m2, [tlq]
    popcnt          r5d, r5d
    vpbroadcastd    m5, [base+z_filter_k+(r5-1)*4+12*0]
    valignq         m2, m6, m2, 6
    vpbroadcastd    m8, [base+z_filter_k+(r5-1)*4+12*1]
    valignq         m4, m7, m6, 2
    vpbroadcastd    m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr         m1, m6, m2, 14
    pmullw          m5, m6
    palignr         m3, m4, m6, 2
    paddw           m1, m3
    palignr         m2, m6, m2, 12
    pmullw          m1, m8
    palignr         m4, m6, 4
    paddw           m2, m4
    pmullw          m2, m9
    pmovzxbw        m10, ym10
    pxor            m6, m6
    paddw           m5, m1
    pminuw          m1, m10, [base+pw_0to31]
    paddw           m5, m2
    psrlw           m5, 3
    pavgw           m6, m5
    vpermw          m7, m10, m6
    vpermw          m6, m1, m6
.filter32_end:
    ret
.w16:
    mova            m6, [tlq-64*1]
    cmp             hd, 32
    jl .w16_h16
    pmaxud          m8, m0, [base+pw_16] {1to16}
    mova            m7, [tlq-64*2]
    vpermw          m6, m0, m6
    jg .w16_h64
    vpermw          m7, m8, m7
    test            angled, 0x400
    jnz .w16_main
    call .filter64
    vpbroadcastd    m0, [base+pw_15]
    vinserti32x8    m0, [base+pw_0to31], 0
    vpermw          m7, m0, m7
    jmp .w16_main
.w16_h16:
    lea             r3d, [hq*2-1]
    xor             r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw    m1, r3d
    vpermw          m7, m1, m6
    pmaxuw          m1, m0
    vpermw          m6, m1, m6
    test            angled, 0x400
    jnz .w16_main
    lea             r3d, [hq+15]
    call .filter32
.w16_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw    m4, dyd
    rorx            r2d, hd, 1
    pmullw          m4, m0 ; ypos
    vpbroadcastw    ym1, r2d
    imul            r2, strideq ; stride * height / 2
    paddd           m1, [base+pw_32704] {1to16}
    lea             r3, [r2+strideq]
    psrlw           m2, m4, 6
    psllw           m4, 9
    paddsw          m2, m1 ; base+0
    vpandd          m4, m14 ; frac << 9
    mova            m3, m6
    vpermt2w        m3, m2, m7 ; left[base+0]
.w16_loop:
    paddsw          m1, m2, m15 ; base+1
    paddsw          m2, m1, m15 ; base+2
    vpermi2w        m1, m6, m7 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddw           m0, m3
    mova            m3, m6
    vpermt2w        m3, m2, m7 ; left[base+2]
    vextracti32x8   [dstq+strideq*0], m0, 1
    mova            [dstq+r2       ], ym0
    psubw           m0, m3, m1
    pmulhrsw        m0, m4
    paddw           m0, m1
    vextracti32x8   [dstq+strideq*1], m0, 1
    mova            [dstq+r3       ], ym0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 4
    jg .w16_loop
    RET
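; h64 needs a third 32-pixel left-edge register (m8): base indices above the
; pw_32735 threshold come from m8 via the k1 merge masks, while the rest are
; gathered from the m6/m7 pair with vpermt2w.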
.w16_h64:
    vpermw          m7, m0, m7
    vpermw          m8, m8, [tlq-64*3]
    test            angled, 0x400
    jnz .w16_h64_main
    valignq         m11, m8, m7, 6
    call .filter64
    vshufi32x4      m2, m8, m8, q3321
    vpbroadcastd    m0, [base+pw_15]
    palignr         ym3, ym8, ym11, 12
    vinserti32x8    m0, [base+pw_0to31], 0
    palignr         ym4, ym8, ym11, 14
    palignr         ym1, ym2, ym8, 4
    paddw           ym3, ym5
    palignr         ym2, ym8, 2
    paddw           ym8, ym4
    pavgw           ym3, ym1
    paddw           ym8, ym2
    paddw           ym8, ym3
    psrlw           ym8, 2
    vpermw          m8, m0, m8
.w16_h64_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw    m4, dyd
    pmullw          m4, m0 ; ypos
    vpbroadcastd    ym1, [base+pw_32]
    paddd           m1, [base+pw_32672] {1to16}
    mov             r2, strideq
    shl             r2, 5 ; stride*32
    vpbroadcastd    m9, [base+pw_32735]
    lea             r3, [r2+strideq]
    psrlw           m2, m4, 6
    psllw           m4, 9
    paddsw          m2, m1 ; base+0
    vpandd          m4, m14 ; frac << 9
    mova            m3, m7
    vpermt2w        m3, m2, m6
    vpcmpgtw        k1, m2, m9
    vpermw          m3{k1}, m2, m8 ; left[base+0]
.w16_h64_loop:
    paddsw          m2, m15 ; base+1
    mova            m1, m7
    vpermt2w        m1, m2, m6
    vpcmpgtw        k1, m2, m9
    vpermw          m1{k1}, m2, m8 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddsw          m2, m15 ; base+2
    paddw           m0, m3
    mova            m3, m7
    vpermt2w        m3, m2, m6
    vpcmpgtw        k1, m2, m9
    vpermw          m3{k1}, m2, m8 ; left[base+2]
    vextracti32x8   [dstq+strideq*0], m0, 1
    mova            [dstq+r2       ], ym0
    psubw           m0, m3, m1
    pmulhrsw        m0, m4
    paddw           m0, m1
    vextracti32x8   [dstq+strideq*1], m0, 1
    mova            [dstq+r3       ], ym0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 4
    jg .w16_h64_loop
    RET
.filter64:
    vpbroadcastw    m2, [tlq]
    vpbroadcastd    m5, [base+pw_3]
    valignq         m2, m6, m2, 6
    valignq         m4, m7, m6, 2
    valignq         m10, m7, m6, 6
    palignr         m1, m6, m2, 12
    palignr         m2, m6, m2, 14
    palignr         m3, m4, m6, 4
    paddw           m1, m5
    palignr         m4, m6, 2
    paddw           m6, m2
    valignq         m2, m8, m7, 2
    pavgw           m1, m3
    palignr         m3, m7, m10, 12
    paddw           m6, m4
    palignr         m4, m7, m10, 14
    paddw           m6, m1
    palignr         m1, m2, m7, 4
    psrlw           m6, 2
    palignr         m2, m7, 2
    paddw           m3, m5
    paddw           m7, m4
    pavgw           m3, m1
    paddw           m7, m2
    paddw           m7, m3
    psrlw           m7, 2
    ret
.w32:
    mova            m6, [tlq-64*1]
    cmp             hd, 32
    jl .w32_h16
    mova            m8, [tlq-64*2]
    vpermw          m6, m0, m6
    vpermw          m7, m0, m8
    jg .w32_h64
    test            angled, 0x400
    jnz .w32_main
    vpbroadcastw    xm8, xm8
    jmp .w32_filter
.w32_h16:
    lea             r3d, [hq*2-1]
    xor             r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw    m1, r3d
    vpermw          m7, m1, m6
    pmaxuw          m1, m0
    vpermw          m6, m1, m6
    test            angled, 0x400
    jnz .w32_main
    vextracti32x4   xm8, m7, 3
.w32_filter:
    call .filter64
.w32_main:
    vpbroadcastw    m4, dyd
    vpbroadcastd    m1, [base+pw_32704]
    pmullw          m4, [base+pw_1to32] ; ypos
    psrlw           m2, m4, 6
    psllw           m4, 9
    paddsw          m2, m1 ; base+0
    vpandd          m4, m14 ; frac << 9
    mova            m3, m6
    vpermt2w        m3, m2, m7 ; left[base+0]
.w32_loop:
    paddsw          m1, m2, m15 ; base+1
    paddsw          m2, m1, m15 ; base+2
    vpermi2w        m1, m6, m7 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddw           m0, m3
    mova            m3, m6
    vpermt2w        m3, m2, m7 ; left[base+2]
    mova            [dstq+strideq*0], m0
    psubw           m0, m3, m1
    pmulhrsw        m0, m4
    paddw           m0, m1
    mova            [dstq+strideq*1], m0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_loop
    RET
.w32_h64:
    mova            m9, [tlq-64*3]
    vpermw          m8, m0, m9
    test            angled, 0x400
    jnz .w32_h64_main
    vpbroadcastw    xm9, xm9
    call .filter96
.w32_h64_main:
    vpbroadcastw    m4, dyd
    vpbroadcastd    m1, [base+pw_32672]
    pmullw          m4, [base+pw_1to32] ; ypos
    vpbroadcastd    m9, [base+pw_32735]
    psrlw           m2, m4, 6
    psllw           m4, 9
    paddsw          m2, m1 ; base+0
    vpandd          m4, m14 ; frac << 9
    mova            m3, m7
    vpermt2w        m3, m2, m6
    vpcmpgtw        k1, m2, m9
    vpermw          m3{k1}, m2, m8 ; left[base+0]
.w32_h64_loop:
    paddsw          m2, m15 ; base+1
    mova            m1, m7
    vpermt2w        m1, m2, m6
    vpcmpgtw        k1, m2, m9
    vpermw          m1{k1}, m2, m8 ; left[base+1]
    psubw           m0, m1, m3
    pmulhrsw        m0, m4
    paddsw          m2, m15 ; base+2
    paddw           m0, m3
    mova            m3, m7
    vpermt2w        m3, m2, m6
    vpcmpgtw        k1, m2, m9
    vpermw          m3{k1}, m2, m8 ; left[base+2]
    mova            [dstq+strideq*0], m0
    psubw           m0, m3, m1
    pmulhrsw        m0, m4
    paddw           m0, m1
    mova            [dstq+strideq*1], m0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32_h64_loop
    RET
.filter96:
    valignq         m11, m8, m7, 6
    call .filter64
    valignq         m2, m9, m8, 2
    palignr         m3, m8, m11, 12
    palignr         m4, m8, m11, 14
    palignr         m1, m2, m8, 4
    paddw           m3, m5
    palignr         m2, m8, 2
    paddw           m8, m4
    pavgw           m3, m1
    paddw           m8, m2
    paddw           m8, m3
    psrlw           m8, 2
    ret
.w64:
    mova            m7, [tlq-64*1]
    vpermw          m6, m0, m7
    cmp             hd, 32
    jl .w64_h16
    mova            m8, [tlq-64*2]
    vpermw          m7, m0, m8
    jg .w64_h64
    test            angled, 0x400
    jnz .w64_main
    vpbroadcastw    m8, xm8
    mova            m9, m8
    call .filter96
    vshufi32x4      m9, m8, m8, q3333
    jmp .w64_h64_main
.w64_h16:
    vpbroadcastw    m7, xm7
    test            angled, 0x400
    jnz .w64_main
    mova            m8, m7
    call .filter64
.w64_main:
    vpbroadcastw    m11, dyd
    vpbroadcastd    m1, [base+pw_32704]
    pmullw          m10, m11, [base+pw_1to32] ; ypos
    psllw           m11, 5
    psrlw           m8, m10, 6
    paddw           m11, m10
    psllw           m10, 9
    psrlw           m9, m11, 6
    psllw           m11, 9
    psubw           m9, m8
    paddsw          m8, m1 ; base+0
    vpandd          m10, m14 ; frac << 9
    vpandd          m11, m14 ; frac << 9
    mova            m4, m6
    vpermt2w        m4, m8, m7 ; left[base+0] ( 0..31)
    paddsw          m5, m8, m9
    vpermi2w        m5, m6, m7 ; left[base+0] (32..63)
.w64_loop:
    paddsw          m8, m15 ; base+1 ( 0..31)
    mova            m2, m6
    vpermt2w        m2, m8, m7 ; left[base+1] ( 0..31)
    paddsw          m3, m8, m9 ; base+1 (32..63)
    vpermi2w        m3, m6, m7 ; left[base+1] (32..63)
    psubw           m0, m2, m4
    psubw           m1, m3, m5
    pmulhrsw        m0, m10
    pmulhrsw        m1, m11
    paddw           m0, m4
    paddw           m1, m5
    mova            m4, m2
    mova            [dstq+64*0], m0
    mova            m5, m3
    mova            [dstq+64*1], m1
    add             dstq, strideq
    dec             hd
    jg .w64_loop
    RET
.w64_h64:
    vpermw          m8, m0, [tlq-64*3]
    mova            m13, [tlq-64*4]
    vpermw          m9, m0, m13
    test            angled, 0x400
    jnz .w64_h64_main
    valignq         m12, m9, m8, 6
    call .filter96
    vpbroadcastw    xm2, xm13
    valignq         m2, m9, 2
    palignr         m3, m9, m12, 12
    palignr         m4, m9, m12, 14
    palignr         m1, m2, m9, 4
    paddw           m3, m5
    palignr         m2, m9, 2
    paddw           m9, m4
    pavgw           m3, m1
    paddw           m9, m2
    paddw           m9, m3
    psrlw           m9, 2
.w64_h64_main:
    vpbroadcastw    m11, dyd
    vpbroadcastd    m1, [base+pw_32640]
    pmullw          m10, m11, [base+pw_1to32] ; ypos
    psllw           m11, 5
    psrlw           m12, m10, 6
    paddw           m11, m10
    psllw           m10, 9
    psrlw           m13, m11, 6
    psllw           m11, 9
    psubw           m13, m12
    paddsw          m12, m1 ; base+0
    vpandd          m10, m14 ; frac << 9
    vpandd          m11, m14 ; frac << 9
    vpbroadcastd    m14, [base+pw_64]
    mova            m4, m6
    vpermt2w        m4, m12, m7
    vptestmw        k1, m12, m14
    mova            m0, m8
    vpermt2w        m0, m12, m9
    paddsw          m1, m12, m13
    mova            m5, m6
    vpermt2w        m5, m1, m7
    vptestmw        k2, m1, m14
    vpermi2w        m1, m8, m9
    vmovdqu16       m4{k1}, m0 ; left[base+0] ( 0..31)
    vmovdqu16       m5{k2}, m1 ; left[base+0] (32..63)
.w64_h64_loop:
    paddsw          m12, m15 ; base+1
    mova            m2, m6
    vpermt2w        m2, m12, m7
    vptestmw        k1, m12, m14
    mova            m0, m8
    vpermt2w        m0, m12, m9
    paddsw          m1, m12, m13
    mova            m3, m6
    vpermt2w        m3, m1, m7
    vptestmw        k2, m1, m14
    vpermi2w        m1, m8, m9
    vmovdqu16       m2{k1}, m0 ; left[base+1] ( 0..31)
    vmovdqu16       m3{k2}, m1 ; left[base+1] (32..63)
    psubw           m0, m2, m4
    psubw           m1, m3, m5
    pmulhrsw        m0, m10
    pmulhrsw        m1, m11
    paddw           m0, m4
    paddw           m1, m5
    mova            m4, m2
    mova            [dstq+64*0], m0
    mova            m5, m3
    mova            [dstq+64*1], m1
    add             dstq, strideq
    dec             hd
    jg .w64_h64_loop
    RET

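; Palette prediction: indices are stored packed two per byte; vpmultishiftqb
; with the bit offsets from pal_unpack expands them to one index per word,
; and vpermw then maps each one through the 8-entry palette broadcast in m6.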
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
    lea                  r6, [pal_pred_16bpc_avx512icl_table]
    tzcnt                wd, wm
    mova                 m3, [pal_pred_perm]
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]
    vpbroadcastq         m4, [pal_unpack+0]
    vpbroadcastq         m5, [pal_unpack+8]
    add                  wq, r6
    vbroadcasti32x4      m6, [palq]
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    pmovzxbd            ym0, [idxq]
    add                idxq, 8
    vpmultishiftqb      ym0, ym4, ym0
    vpermw              ym0, ym0, ym6
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    pmovzxbd             m0, [idxq]
    add                idxq, 16
    vpmultishiftqb       m0, m4, m0
    vpermw               m0, m0, m6
    mova   [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    movu                ym1, [idxq]
    add                idxq, 32
    vpermb               m1, m3, m1
    vpmultishiftqb       m1, m4, m1
    vpermw               m0, m1, m6
    psrlw                m1, 8
    vpermw               m1, m1, m6
    mova   [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova   [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32:
    vpermb               m2, m3, [idxq]
    add                idxq, 64
    vpmultishiftqb       m1, m4, m2
    vpmultishiftqb       m2, m5, m2
    vpermw               m0, m1, m6
    psrlw                m1, 8
    vpermw               m1, m1, m6
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    vpermw               m0, m2, m6
    psrlw                m2, 8
    vpermw               m1, m2, m6
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32
    RET
.w64:
    vpermb               m2, m3, [idxq]
    add                idxq, 64
    vpmultishiftqb       m1, m4, m2
    vpmultishiftqb       m2, m5, m2
    vpermw               m0, m1, m6
    psrlw                m1, 8
    vpermw               m1, m1, m6
    mova          [dstq+ 0], m0
    mova          [dstq+64], m1
    vpermw               m0, m2, m6
    psrlw                m2, 8
    vpermw               m1, m2, m6
    mova  [dstq+strideq+ 0], m0
    mova  [dstq+strideq+64], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET

; The ipred_filter SIMD processes 4x2 blocks in the following order which
; increases parallelism compared to doing things row by row.
;     w4     w8       w16             w32
;     1     1 2     1 2 5 6     1 2 5 6 9 a d e
;     2     2 3     2 3 6 7     2 3 6 7 a b e f
;     3     3 4     3 4 7 8     3 4 7 8 b c f g
;     4     4 5     4 5 8 9     4 5 8 9 c d g h
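
; For reference, each 4x2 block is predicted from 7 neighbors: the top-left
; pixel p0, the four top pixels p1-p4, and the two left pixels p5-p6, using
; the per-mode signed coefficients from filter_intra_taps. A minimal C
; sketch of the per-block arithmetic (illustrative only; the taps[][]
; layout is an assumption of the sketch, and iclip stands for a generic
; clamp):
;
;   const int p[7] = { p0, p1, p2, p3, p4, p5, p6 };
;   for (int i = 0; i < 2; i++)       // row within the 4x2 block
;       for (int j = 0; j < 4; j++) { // column within the 4x2 block
;           int s = 0;
;           for (int k = 0; k < 7; k++)
;               s += taps[k][i * 4 + j] * p[k];
;           out[i][j] = iclip((s + 8) >> 4, 0, bitdepth_max);
;       }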

cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
%define base r6-$$
    lea                  r6, [$$]
%ifidn filterd, filterm
    movzx           filterd, filterb
%else
    movzx           filterd, byte filterm
%endif
    shl             filterd, 6
    movifnidn            hd, hm
    movu                xm0, [tlq-6]
    pmovsxbw             m7, [base+filter_intra_taps+filterq+32*0]
    pmovsxbw             m8, [base+filter_intra_taps+filterq+32*1]
    mov                 r5d, r8m ; bitdepth_max
    movsldup             m9, [base+filter_permA]
    movshdup            m10, [base+filter_permA]
    shr                 r5d, 11 ; is_12bpc
    jnz .12bpc
    psllw                m7, 2 ; upshift multipliers so that packusdw
    psllw                m8, 2 ; will perform clipping for free
.12bpc:
    vpbroadcastd         m5, [base+filter_rnd+r5*8]
    vpbroadcastd         m6, [base+filter_shift+r5*8]
    sub                  wd, 8
    jl .w4
.w8:
    call .main4
    movsldup            m11, [filter_permB]
    lea                 r5d, [hq*2+2]
    movshdup            m12, [filter_permB]
    lea                topq, [tlq+2]
    mova                m13, [filter_permC]
    sub                  hd, 4
    vinserti32x4        ym0, [topq], 1 ; a0 b0 t0 t1
    sub                 tlq, r5
%if WIN64
    push                 r7
    push                 r8
%endif
    mov                  r7, dstq
    mov                 r8d, hd
.w8_loop:
    movlps              xm4, xm0, [tlq+hq*2]
    call .main8
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jge .w8_loop
    test                 wd, wd
    jz .end
    mov                 r2d, 0x0d
    kmovb                k1, r2d
    lea                  r2, [strideq*3]
.w16:
    movd               xmm0, [r7+strideq*1+12]
    vpblendd           xmm0, [topq+8], 0x0e ; t1 t2
    pinsrw              xm4, xmm0, [r7+strideq*0+14], 2
    call .main8
    add                  r7, 16
    vinserti32x4        ym0, [topq+16], 1 ; a2 b2 t2 t3
    mov                  hd, r8d
    mov                dstq, r7
    add                topq, 16
.w16_loop:
    movd               xmm1, [dstq+strideq*2-4]
    punpcklwd           xm4, xmm1, xmm0
    movd               xmm0, [dstq+r2-4]
    shufps          xm4{k1}, xmm0, xm0, q3210
    call .main8
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jge .w16_loop
    sub                  wd, 8
    jg .w16
.end:
    vpermb               m2, m11, m0
    mova                ym1, ym5
    vpdpwssd             m1, m2, m7
    vpermb               m2, m12, m0
    vpdpwssd             m1, m2, m8
%if WIN64
    pop                  r8
    pop                  r7
%endif
    vextracti32x8       ym2, m1, 1
    paddd               ym1, ym2
    packusdw            ym1, ym1
    vpsrlvw             ym1, ym6
    vpermt2q             m0, m13, m1
    vextracti32x4 [dstq+strideq*0], m0, 2
    vextracti32x4 [dstq+strideq*1], ym0, 1
    RET
.w4_loop:
    movlps              xm0, [tlq-10]
    lea                dstq, [dstq+strideq*2]
    sub                 tlq, 4
.w4:
    call .main4
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    sub                  hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.main4:
    vpermb               m2, m9, m0
    mova                ym1, ym5
    vpdpwssd             m1, m2, m7
    vpermb               m0, m10, m0
    vpdpwssd             m1, m0, m8
    vextracti32x8       ym0, m1, 1
    paddd               ym0, ym1
    vextracti32x4       xm1, ym0, 1
    packusdw            xm0, xm1 ; clip
    vpsrlvw             xm0, xm6
    ret
ALIGN function_align
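; .main8 is the two-block variant of .main4: it accumulates the filter dot
; products for two 4x2 blocks in separate registers (ym1/ym2, each seeded
; with the rounding constant from m5), merges them with packusdw (which
; also performs the clip), applies the bitdepth-dependent downshift from
; m6, and shuffles the results into store order via the permute in m13.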
.main8:
    vpermb               m3, m11, m0
    mova                ym2, ym5
    vpdpwssd             m2, m3, m7
    vpermb               m3, m9, m4
    mova                ym1, ym5
    vpdpwssd             m1, m3, m7
    vpermb               m3, m12, m0
    vpdpwssd             m2, m3, m8
    vpermb               m3, m10, m4
    vpdpwssd             m1, m3, m8
    vextracti32x8       ym4, m2, 1
    vextracti32x8       ym3, m1, 1
    paddd               ym2, ym4
    paddd               ym1, ym3
    packusdw            ym1, ym2 ; clip
    vpsrlvw             ym1, ym6
    vpermt2q             m0, m13, m1 ; c0 d0 b0 b1 a0 a1
    vextracti32x4 [dstq+strideq*0], m0, 2
    vextracti32x4 [dstq+strideq*1], ym0, 1
    ret

%endif