;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5 when only the plain strides
;        are needed, 7 when the stride*3 helper registers are also used
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                                  ref, ref_stride, \
                                                  second_pred, \
                                                  src_stride3, ref_stride3
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows, so double the stride
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8]
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
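
; All of the SAD kernels in this file follow the same pattern: load a row (or
; a group of rows) of ref, optionally average it with second_pred via pavgb
; (avg variants), accumulate |src - ref| with psadbw, and fold the two 64-bit
; halves of the accumulator together at the end. For reference, a minimal C
; model of what the plain (non-avg, non-skip) kernels compute is sketched
; below; the name and signature are illustrative, not the actual prototypes
; generated by cglobal:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   /* Illustrative C model of aom_sadWxH_sse2. */
;   static unsigned int sad_c(const uint8_t *src, int src_stride,
;                             const uint8_t *ref, int ref_stride,
;                             int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; ++y) {
;       for (int x = 0; x < width; ++x) sad += abs(src[x] - ref[x]);
;       src += src_stride;
;       ref += ref_stride;
;     }
;     return sad;
;   }
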
INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad128x128_skip_sse2
SAD128XN  64     ; sad128x64_sse2
SAD128XN  64, 1  ; sad128x64_avg_sse2
SAD128XN  64, 2  ; sad128x64_skip_sse2


; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad64x128_skip_sse2
SAD64XN  64, 2  ; sad64x64_skip_sse2
SAD64XN  32, 2  ; sad64x32_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN  16     ; sad64x16_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN  16, 2  ; sad64x16_skip_sse2
%endif

; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64     ; sad32x64_sse2
SAD32XN 32     ; sad32x32_sse2
SAD32XN 16     ; sad32x16_sse2
SAD32XN 64, 1  ; sad32x64_avg_sse2
SAD32XN 32, 1  ; sad32x32_avg_sse2
SAD32XN 16, 1  ; sad32x16_avg_sse2
SAD32XN 64, 2  ; sad32x64_skip_sse2
SAD32XN 32, 2  ; sad32x32_skip_sse2
SAD32XN 16, 2  ; sad32x16_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN  8     ; sad32x8_sse2
SAD32XN  8, 1  ; sad32x8_avg_sse2
SAD32XN  8, 2  ; sad32x8_skip_sse2
%endif
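
; The skip variants trade accuracy for speed: SAD_FN doubles both strides,
; the row count set into n_rowsd is halved, so only every other row is
; visited, and the final "pslld m0, 1" in each macro scales the half-sum
; back up. A hedged C model of the idea (name and signature illustrative,
; same headers as the sketch above):
;
;   /* Illustrative C model of aom_sad_skip_WxH_sse2. */
;   static unsigned int sad_skip_c(const uint8_t *src, int src_stride,
;                                  const uint8_t *ref, int ref_stride,
;                                  int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; y += 2) {  /* even rows only */
;       for (int x = 0; x < width; ++x) sad += abs(src[x] - ref[x]);
;       src += 2 * src_stride;  /* doubled stride, as in SAD_FN */
;       ref += 2 * ref_stride;
;     }
;     return 2 * sad;  /* approximate the full-block SAD */
;   }
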
; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32     ; sad16x32_sse2
SAD16XN 16     ; sad16x16_sse2
SAD16XN  8     ; sad16x8_sse2
SAD16XN 32, 1  ; sad16x32_avg_sse2
SAD16XN 16, 1  ; sad16x16_avg_sse2
SAD16XN  8, 1  ; sad16x8_avg_sse2
SAD16XN 32, 2  ; sad16x32_skip_sse2
SAD16XN 16, 2  ; sad16x16_skip_sse2
SAD16XN  8, 2  ; sad16x8_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64     ; sad16x64_sse2
SAD16XN  4     ; sad16x4_sse2
SAD16XN 64, 1  ; sad16x64_avg_sse2
SAD16XN  4, 1  ; sad16x4_avg_sse2
SAD16XN 64, 2  ; sad16x64_skip_sse2
%endif

; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16     ; sad8x16_sse2
SAD8XN  8     ; sad8x8_sse2
SAD8XN  4     ; sad8x4_sse2
SAD8XN 16, 1  ; sad8x16_avg_sse2
SAD8XN  8, 1  ; sad8x8_avg_sse2
SAD8XN  4, 1  ; sad8x4_avg_sse2
SAD8XN 16, 2  ; sad8x16_skip_sse2
SAD8XN  8, 2  ; sad8x8_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32     ; sad8x32_sse2
SAD8XN 32, 1  ; sad8x32_avg_sse2
SAD8XN 32, 2  ; sad8x32_skip_sse2
%endif
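
; The avg variants compare src against the rounded average of ref and a
; second predictor, matching pavgb's (a + b + 1) >> 1 rounding; second_predq
; advances by the block width each row because the second predictor is stored
; contiguously, with a stride equal to the block width. A hedged C model
; (name and signature illustrative, same headers as the first sketch):
;
;   /* Illustrative C model of aom_sadWxH_avg_sse2. */
;   static unsigned int sad_avg_c(const uint8_t *src, int src_stride,
;                                 const uint8_t *ref, int ref_stride,
;                                 const uint8_t *second_pred,
;                                 int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; ++y) {
;       for (int x = 0; x < width; ++x) {
;         const int avg = (ref[x] + second_pred[x] + 1) >> 1;  /* pavgb */
;         sad += abs(src[x] - avg);
;       }
;       src += src_stride;
;       ref += ref_stride;
;       second_pred += width;  /* second_pred rows are packed */
;     }
;     return sad;
;   }
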
; unsigned int aom_sad4x{4,8}_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN  8     ; sad4x8_sse2
SAD4XN  4     ; sad4x4_sse2
SAD4XN  8, 1  ; sad4x8_avg_sse2
SAD4XN  4, 1  ; sad4x4_avg_sse2
SAD4XN  8, 2  ; sad4x8_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16     ; sad4x16_sse2
SAD4XN 16, 1  ; sad4x16_avg_sse2
SAD4XN 16, 2  ; sad4x16_skip_sse2
%endif
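
; Note on the epilogue shared by all of the macros above: psadbw writes one
; 16-bit row sum into the low word of each 64-bit half of the accumulator,
; and paddd keeps two 32-bit partial sums in the low dwords of those halves;
; the movhlps/paddd pair folds the high partial onto the low one, and movd
; returns the 32-bit total in eax, matching the unsigned int return type of
; the aom_sad* functions.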