;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 10
                     times 8 dw 6
                     times 16 dw 8
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 2
                     times 8 dw 14

SECTION .text

; int aom_highbd_sub_pixel_varianceNxh(const uint16_t *src,
;                                      ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *dst,
;                                      ptrdiff_t dst_stride,
;                                      int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) in the given pointer.

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw     %3, %4
  psubw     %1, %2
  mova      %4, %3 ; make copies to manipulate to calc sum
  mova      %2, %1 ; use originals for calc sse
  pmaddwd   %3, %3
  paddw     %4, %2
  pmaddwd   %1, %1
  movhlps   %2, %4
  paddd     %6, %3
  paddw     %4, %2
  pxor      %2, %2
  pcmpgtw   %2, %4 ; mask for 0 > %4 (sum)
  punpcklwd %4, %2 ; sign-extend word to dword
  paddd     %6, %1
  paddd     %5, %4
%endmacro

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
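  ; The reduction below is a standard SSE2 horizontal add: movhlps folds
  ; the high quadword of each accumulator onto the low one, pshufd with
  ; immediate 0x1 folds the remaining dword pair, and the totals for sse
  ; (m7) and sum (m6) end up in the low dword of each register.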
  movhlps   m3, m7
  movhlps   m4, m6
  paddd     m7, m3
  paddd     m6, m4
  pshufd    m3, m7, 0x1
  pshufd    m4, m6, 0x1
  paddd     m7, m3
  paddd     m6, m4
  mov       r1, ssem ; r1 = unsigned int *sse
  movd      [r1], m7 ; store sse
  movd      eax, m6  ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE 0
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
  add       srcq, src_stridemp
  add       srcq, src_stridemp
%else
  lea       srcq, [srcq+src_strideq*2]
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%if AOM_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                               x_offset, y_offset, \
                                               dst, dst_stride, \
                                               sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                           x_offset, y_offset, \
                                           dst, dst_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst, dst_stride, \
                                                 sec, sec_stride, height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store the bilin_filter and pw_8 locations on the stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4 ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1 ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst, dst_stride, \
                                                 sec, sec_stride, height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT    %1 <= 16 ; m6 overflows if w > 16
  pxor      m6, m6   ; sum
  pxor      m7, m7   ; sse

%if %1 < 16
  sar       block_height, 1
%endif
%if %2 == 1 ; avg
  shl       sec_str, 1
%endif

  ; FIXME(rbultje) replace by jumptable?
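  ; The code below specializes on (x_offset, y_offset): each offset is
  ; either 0 (copy), 8 (the half-pel position, handled with pavgw), or
  ; anything else (full bilinear filtering via bilin_filter_m), giving
  ; nine cases in total.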
  test      x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test      y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m2, [srcq+16]
  mova      m1, [dstq]
  mova      m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m2, [secq+16]
%endif
  SUM_SSE   m0, m1, m2, m3, m6, m7

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+src_strideq*2]
  mova      m1, [dstq]
  mova      m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m2, [secq]
%endif
  SUM_SSE   m0, m1, m2, m3, m6, m7

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp       y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m4, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*2+16]
  mova      m2, [dstq]
  mova      m3, [dstq+16]
  pavgw     m0, m4
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*4]
  mova      m2, [dstq]
  mova      m3, [dstq+dst_strideq*2]
  pavgw     m0, m1
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m1, [secq]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+y_offsetq]
  mova      m9, [bilin_filter+y_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add       y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m4, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*2+16]
  mova      m2, [dstq]
  mova      m3, [dstq+16]
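  ; Per 16-bit lane, the filtering below computes (a C-like sketch):
  ;   out = (row0*filter_y_a + row1*filter_y_b + 8) >> 4
  ; filter_y_a + filter_y_b == 16, so a 12-bit sample yields at most
  ; 4095*16 + 8 = 65528, which still fits in an unsigned word before psrlw.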
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw    m1, filter_y_a
  pmullw    m5, filter_y_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_y_a
  pmullw    m4, filter_y_b
  paddw     m0, filter_rnd
  paddw     m1, m5
  paddw     m0, m4
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*4]
  mova      m4, m1
  mova      m2, [dstq]
  mova      m3, [dstq+dst_strideq*2]
  pmullw    m1, filter_y_a
  pmullw    m5, filter_y_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_y_a
  pmullw    m4, filter_y_b
  paddw     m0, filter_rnd
  paddw     m1, m5
  paddw     m0, m4
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m1, [secq]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp       x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test      y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m4, [srcq+2]
  movu      m5, [srcq+18]
  mova      m2, [dstq]
  mova      m3, [dstq+16]
  pavgw     m0, m4
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m4, [srcq+2]
  movu      m5, [srcq+src_strideq*2+2]
  mova      m2, [dstq]
  mova      m3, [dstq+dst_strideq*2]
  pavgw     m0, m4
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m1, [secq]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp       y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  lea       srcq, [srcq+src_strideq*2]
  pavgw     m0, m2
  pavgw     m1, m3
.x_half_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+16]
  movu      m4, [srcq+2]
  movu      m5, [srcq+18]
  pavgw     m2, m4
  pavgw     m3, m5
  pavgw     m0, m2
  pavgw     m1, m3
  mova      m4, [dstq]
  mova      m5, [dstq+16]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7
  mova      m0, m2
  mova      m1, m3

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  lea       srcq, [srcq+src_strideq*2]
  pavgw     m0, m2
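  ; The loop below carries the previous row's horizontal half-pel average
  ; in m0, so each source row is loaded and averaged only once; pavgw then
  ; blends it with the following row to form the vertical half-pel value.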
.x_half_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+src_strideq*2]
  movu      m4, [srcq+2]
  movu      m5, [srcq+src_strideq*2+2]
  pavgw     m2, m4
  pavgw     m3, m5
  pavgw     m0, m2
  pavgw     m2, m3
  mova      m4, [dstq]
  mova      m5, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m2, [secq]
%endif
  SUM_SSE   m0, m4, m2, m5, m6, m7
  mova      m0, m3

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+y_offsetq]
  mova      m9, [bilin_filter+y_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg.
%define tempq x_offsetq
  add       y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  lea       srcq, [srcq+src_strideq*2]
  pavgw     m0, m2
  pavgw     m1, m3
.x_half_y_other_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+16]
  movu      m4, [srcq+2]
  movu      m5, [srcq+18]
  pavgw     m2, m4
  pavgw     m3, m5
  mova      m4, m2
  mova      m5, m3
  pmullw    m1, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m1, filter_rnd
  paddw     m1, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  psrlw     m1, 4
  paddw     m0, m2
  mova      m2, [dstq]
  psrlw     m0, 4
  mova      m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7
  mova      m0, m4
  mova      m1, m5

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  lea       srcq, [srcq+src_strideq*2]
  pavgw     m0, m2
.x_half_y_other_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+src_strideq*2]
  movu      m4, [srcq+2]
  movu      m5, [srcq+src_strideq*2+2]
  pavgw     m2, m4
  pavgw     m3, m5
  mova      m4, m2
  mova      m5, m3
  pmullw    m4, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m4, filter_rnd
  paddw     m4, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  psrlw     m4, 4
  paddw     m0, m2
  mova      m2, [dstq]
  psrlw     m0, 4
  mova      m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m4, [secq]
%endif
  SUM_SSE   m0, m2, m4, m3, m6, m7
  mova      m0, m5

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test      y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
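  ; Note on table layout: entries in bilin_filter_m are 32 bytes apart
  ; (hence filter_idx_shift == 5), with the first 16 bytes holding the
  ; "a" tap and the next 16 bytes the "b" tap, each already replicated
  ; across 8 words so pmullw can use them without further shuffling.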
%if AOM_ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+x_offsetq]
  mova      m9, [bilin_filter+x_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add       x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  mova      m4, [dstq]
  mova      m5, [dstq+16]
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m1, m3
  paddw     m0, m2
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m2, [srcq+2]
  movu      m3, [srcq+src_strideq*2+2]
  mova      m4, [dstq]
  mova      m5, [dstq+dst_strideq*2]
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m1, m3
  paddw     m0, m2
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m1, [secq]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp       y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if AOM_ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       x_offsetd, filter_idx_shift
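  ; This case filters horizontally with filter_x_a/filter_x_b, then
  ; averages the filtered rows vertically with pavgw, reusing the previous
  ; filtered row so each source row is filtered only once.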
%if AOM_ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+x_offsetq]
  mova      m9, [bilin_filter+x_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add       x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  paddw     m0, m2
  paddw     m1, m3
  psrlw     m0, 4
  psrlw     m1, 4
  lea       srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+16]
  movu      m4, [srcq+2]
  movu      m5, [srcq+18]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  mova      m4, [dstq]
  mova      m5, [dstq+16]
  psrlw     m2, 4
  psrlw     m3, 4
  pavgw     m0, m2
  pavgw     m1, m3
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7
  mova      m0, m2
  mova      m1, m3

  lea       srcq, [srcq+src_strideq*2]
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m0, m2
  psrlw     m0, 4
  lea       srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+src_strideq*2]
  movu      m4, [srcq+2]
  movu      m5, [srcq+src_strideq*2+2]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  mova      m4, [dstq]
  mova      m5, [dstq+dst_strideq*2]
  psrlw     m2, 4
  psrlw     m3, 4
  pavgw     m0, m2
  pavgw     m2, m3
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m2, [secq]
%endif
  SUM_SSE   m0, m4, m2, m5, m6, m7
  mova      m0, m3

  lea       srcq, [srcq+src_strideq*4]
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
  ; Load the filters - this is the same as in the 8-bit case.
%if AOM_ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl       y_offsetd, filter_idx_shift
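  ; Full bilinear case, as a C-like sketch of the arithmetic:
  ;   h(r, c)   = (s(r, c)*filter_x_a + s(r, c+1)*filter_x_b + 8) >> 4
  ;   out(r, c) = (h(r, c)*filter_y_a + h(r+1, c)*filter_y_b + 8) >> 4
  ; Only one horizontally filtered row is kept live between iterations.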
%if AOM_ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+x_offsetq]
  mova      m9, [bilin_filter+x_offsetq+16]
  mova      m10, [bilin_filter+y_offsetq]
  mova      m11, [bilin_filter+y_offsetq+16]
  mova      m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; In this case there is NO unused register: we use the src_stride register,
; so src_stride has to be reloaded from the stack whenever it is needed.
%define tempq src_strideq
  mov       tempq, g_bilin_filterm
  add       x_offsetq, tempq
  add       y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       x_offsetq, bilin_filter
  add       y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  movu      m1, [srcq+16]
  movu      m3, [srcq+18]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  paddw     m0, m2
  paddw     m1, m3
  psrlw     m0, 4
  psrlw     m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu      m2, [srcq]
  movu      m4, [srcq+2]
  movu      m3, [srcq+16]
  movu      m5, [srcq+18]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  psrlw     m2, 4
  psrlw     m3, 4
  mova      m4, m2
  mova      m5, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  pmullw    m1, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m0, m2
  paddw     m1, filter_rnd
  mova      m2, [dstq]
  paddw     m1, m3
  psrlw     m0, 4
  psrlw     m1, 4
  mova      m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  pavgw     m1, [secq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7
  mova      m0, m4
  mova      m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea       dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m0, m2
  psrlw     m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu      m2, [srcq]
  movu      m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu      m3, [srcq]
  movu      m5, [srcq+2]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  psrlw     m2, 4
  psrlw     m3, 4
  mova      m4, m2
  mova      m5, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  pmullw    m4, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m0, m2
  paddw     m4, filter_rnd
  mova      m2, [dstq]
  paddw     m4, m3
  psrlw     m0, 4
  psrlw     m4, 4
  mova      m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [secq]
  add       secq, sec_str
  pavgw     m4, [secq]
%endif
  SUM_SSE   m0, m2, m4, m3, m6, m7
  mova      m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea       dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add       secq, sec_str
%endif
%endif
  dec       block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
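; Each SUBPEL_VARIANCE invocation above expands to one function: the plain
; variants highbd_sub_pixel_variance{8,16}xh, and (with the trailing ", 1")
; the avg variants highbd_sub_pixel_avg_variance{8,16}xh, which take the
; extra sec/sec_stride arguments and pavgw against the second predictor.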