;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2019 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%include "config/aom_config.asm"

%ifndef private_prefix
    %define private_prefix aom
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if AOM_ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if AOM_ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

; Set PREFIX for libaom builds.
%if FORMAT_ELF
    %undef PREFIX
%elif WIN64
    %undef PREFIX
%else
    %define PREFIX
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; In some instances macho32 tables get misaligned when using .rodata.
; When looking at the disassembly it appears that the offset is either
; correct or consistently off by 90. Placing them in the .text section
; works around the issue.
; It appears to be specific to the way libaom handles the tables.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %elifidn __OUTPUT_FORMAT__,macho32
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; PIC macros from aom_ports/x86_abi_support.asm.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif

%if ABI_IS_32BIT
  %if CONFIG_PIC=1
    %ifidn __OUTPUT_FORMAT__,elf32
      %define GET_GOT_DEFINED 1
      %define WRT_PLT wrt ..plt
      %macro GET_GOT 1
        extern _GLOBAL_OFFSET_TABLE_
        push %1
        call %%get_got
        %%sub_offset:
        jmp %%exitGG
        %%get_got:
        mov %1, [esp]
        add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
        ret
        %%exitGG:
        %undef GLOBAL
        %define GLOBAL(x) x + %1 wrt ..gotoff
        %undef RESTORE_GOT
        %define RESTORE_GOT pop %1
      %endmacro
    %elifidn __OUTPUT_FORMAT__,macho32
      %define GET_GOT_DEFINED 1
      %macro GET_GOT 1
        push %1
        call %%get_got
        %%get_got:
        pop %1
        %undef GLOBAL
        %define GLOBAL(x) x + %1 - %%get_got
        %undef RESTORE_GOT
        %define RESTORE_GOT pop %1
      %endmacro
    %else
      %define GET_GOT_DEFINED 0
    %endif
  %endif

  %if AOM_ARCH_X86_64 == 0
    %undef PIC
  %endif

%else
  %macro GET_GOT 1
  %endmacro
  %define GLOBAL(x) rel x
  %define WRT_PLT wrt ..plt

  %if WIN64
    %define PIC
  %elifidn __OUTPUT_FORMAT__,macho64
    %define PIC
  %elif CONFIG_PIC
    %define PIC
  %endif
%endif

%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif
%ifndef WRT_PLT
    %define WRT_PLT
%endif

%ifdef PIC
    default rel
%endif

%ifndef GET_GOT_DEFINED
    %define GET_GOT_DEFINED 0
%endif
; End PIC macros from aom_ports/x86_abi_support.asm.

; libaom explicitly sets visibility in shared object builds. Avoid setting
; visibility to hidden as it may break builds that split sources on, e.g.,
; directory boundaries.
%ifdef CHROMIUM
    %define VISIBILITY hidden
    %define HAVE_PRIVATE_EXTERN 1
%else
    %define VISIBILITY
    %define HAVE_PRIVATE_EXTERN 0
%endif

%ifdef __NASM_VER__
    %use smartalign
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size.
;      If the required stack alignment is larger than the known stack alignment
;      the stack will be manually aligned and an extra register will be
;      allocated to hold the original stack pointer (to not invalidate r0m
;      etc.). To prevent the use of an extra register as stack pointer,
;      request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif AOM_ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if AOM_ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if AOM_ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro LEA 2
%if AOM_ARCH_X86_64
    lea %1, [%2]
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPUs
    pop %1
    add %1, (%2)-$+1
%else
    mov %1, %2
%endif
%endmacro

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
%rotate 1 376 %endrep 377%endmacro 378 379%macro POP_IF_USED 1-* 380 %rep %0 381 %if %1 < regs_used 382 pop r%1 383 %endif 384 %rotate 1 385 %endrep 386%endmacro 387 388%macro LOAD_IF_USED 1-* 389 %rep %0 390 %if %1 < num_args 391 mov r%1, r %+ %1 %+ mp 392 %endif 393 %rotate 1 394 %endrep 395%endmacro 396 397%macro SUB 2 398 sub %1, %2 399 %ifidn %1, rstk 400 %assign stack_offset stack_offset+(%2) 401 %endif 402%endmacro 403 404%macro ADD 2 405 add %1, %2 406 %ifidn %1, rstk 407 %assign stack_offset stack_offset-(%2) 408 %endif 409%endmacro 410 411%macro movifnidn 2 412 %ifnidn %1, %2 413 mov %1, %2 414 %endif 415%endmacro 416 417%if AOM_ARCH_X86_64 == 0 418 %define movsxd movifnidn 419%endif 420 421%macro movsxdifnidn 2 422 %ifnidn %1, %2 423 movsxd %1, %2 424 %endif 425%endmacro 426 427%macro ASSERT 1 428 %if (%1) == 0 429 %error assertion ``%1'' failed 430 %endif 431%endmacro 432 433%macro DEFINE_ARGS 0-* 434 %ifdef n_arg_names 435 %assign %%i 0 436 %rep n_arg_names 437 CAT_UNDEF arg_name %+ %%i, q 438 CAT_UNDEF arg_name %+ %%i, d 439 CAT_UNDEF arg_name %+ %%i, w 440 CAT_UNDEF arg_name %+ %%i, h 441 CAT_UNDEF arg_name %+ %%i, b 442 CAT_UNDEF arg_name %+ %%i, m 443 CAT_UNDEF arg_name %+ %%i, mp 444 CAT_UNDEF arg_name, %%i 445 %assign %%i %%i+1 446 %endrep 447 %endif 448 449 %xdefine %%stack_offset stack_offset 450 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine 451 %assign %%i 0 452 %rep %0 453 %xdefine %1q r %+ %%i %+ q 454 %xdefine %1d r %+ %%i %+ d 455 %xdefine %1w r %+ %%i %+ w 456 %xdefine %1h r %+ %%i %+ h 457 %xdefine %1b r %+ %%i %+ b 458 %xdefine %1m r %+ %%i %+ m 459 %xdefine %1mp r %+ %%i %+ mp 460 CAT_XDEFINE arg_name, %%i, %1 461 %assign %%i %%i+1 462 %rotate 1 463 %endrep 464 %xdefine stack_offset %%stack_offset 465 %assign n_arg_names %0 466%endmacro 467 468%define required_stack_alignment ((mmsize + 15) & ~15) 469%define vzeroupper_required (mmsize > 16 && (AOM_ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) 470%define high_mm_regs (16*cpuflag(avx512)) 471 472%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) 473 %ifnum %1 474 %if %1 != 0 475 %assign %%pad 0 476 %assign stack_size %1 477 %if stack_size < 0 478 %assign stack_size -stack_size 479 %endif 480 %if WIN64 481 %assign %%pad %%pad + 32 ; shadow space 482 %if mmsize != 8 483 %assign xmm_regs_used %2 484 %if xmm_regs_used > 8 485 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers 486 %endif 487 %endif 488 %endif 489 %if required_stack_alignment <= STACK_ALIGNMENT 490 ; maintain the current stack alignment 491 %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) 492 SUB rsp, stack_size_padded 493 %else 494 %assign %%reg_num (regs_used - 1) 495 %xdefine rstk r %+ %%reg_num 496 ; align stack, and save original stack location directly above 497 ; it, i.e. in [rsp+stack_size_padded], so we can restore the 498 ; stack in a single instruction (i.e. 
mov rsp, rstk or mov 499 ; rsp, [rsp+stack_size_padded]) 500 %if %1 < 0 ; need to store rsp on stack 501 %xdefine rstkm [rsp + stack_size + %%pad] 502 %assign %%pad %%pad + gprsize 503 %else ; can keep rsp in rstk during whole function 504 %xdefine rstkm rstk 505 %endif 506 %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) 507 mov rstk, rsp 508 and rsp, ~(required_stack_alignment-1) 509 sub rsp, stack_size_padded 510 movifnidn rstkm, rstk 511 %endif 512 WIN64_PUSH_XMM 513 %endif 514 %endif 515%endmacro 516 517%macro SETUP_STACK_POINTER 1 518 %ifnum %1 519 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT 520 %if %1 > 0 521 ; Reserve an additional register for storing the original stack pointer, but avoid using 522 ; eax/rax for this purpose since it can potentially get overwritten as a return value. 523 %assign regs_used (regs_used + 1) 524 %if AOM_ARCH_X86_64 && regs_used == 7 525 %assign regs_used 8 526 %elif AOM_ARCH_X86_64 == 0 && regs_used == 1 527 %assign regs_used 2 528 %endif 529 %endif 530 %if AOM_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 531 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) 532 ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. 533 %assign regs_used 5 + UNIX64 * 3 534 %endif 535 %endif 536 %endif 537%endmacro 538 539%macro DEFINE_ARGS_INTERNAL 3+ 540 %ifnum %2 541 DEFINE_ARGS %3 542 %elif %1 == 4 543 DEFINE_ARGS %2 544 %elif %1 > 4 545 DEFINE_ARGS %2, %3 546 %endif 547%endmacro 548 549%if WIN64 ; Windows x64 ;================================================= 550 551DECLARE_REG 0, rcx 552DECLARE_REG 1, rdx 553DECLARE_REG 2, R8 554DECLARE_REG 3, R9 555DECLARE_REG 4, R10, 40 556DECLARE_REG 5, R11, 48 557DECLARE_REG 6, rax, 56 558DECLARE_REG 7, rdi, 64 559DECLARE_REG 8, rsi, 72 560DECLARE_REG 9, rbx, 80 561DECLARE_REG 10, rbp, 88 562DECLARE_REG 11, R14, 96 563DECLARE_REG 12, R15, 104 564DECLARE_REG 13, R12, 112 565DECLARE_REG 14, R13, 120 566 567%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 568 %assign num_args %1 569 %assign regs_used %2 570 ASSERT regs_used >= num_args 571 SETUP_STACK_POINTER %4 572 ASSERT regs_used <= 15 573 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 574 ALLOC_STACK %4, %3 575 %if mmsize != 8 && stack_size == 0 576 WIN64_SPILL_XMM %3 577 %endif 578 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 579 DEFINE_ARGS_INTERNAL %0, %4, %5 580%endmacro 581 582%macro WIN64_PUSH_XMM 0 583 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 584 %if xmm_regs_used > 6 + high_mm_regs 585 movaps [rstk + stack_offset + 8], xmm6 586 %endif 587 %if xmm_regs_used > 7 + high_mm_regs 588 movaps [rstk + stack_offset + 24], xmm7 589 %endif 590 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 591 %if %%xmm_regs_on_stack > 0 592 %assign %%i 8 593 %rep %%xmm_regs_on_stack 594 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i 595 %assign %%i %%i+1 596 %endrep 597 %endif 598%endmacro 599 600%macro WIN64_SPILL_XMM 1 601 %assign xmm_regs_used %1 602 ASSERT xmm_regs_used <= 16 + high_mm_regs 603 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 604 %if %%xmm_regs_on_stack > 0 605 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
606 %assign %%pad %%xmm_regs_on_stack*16 + 32 607 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) 608 SUB rsp, stack_size_padded 609 %endif 610 WIN64_PUSH_XMM 611%endmacro 612 613%macro WIN64_RESTORE_XMM_INTERNAL 0 614 %assign %%pad_size 0 615 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 616 %if %%xmm_regs_on_stack > 0 617 %assign %%i xmm_regs_used - high_mm_regs 618 %rep %%xmm_regs_on_stack 619 %assign %%i %%i-1 620 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] 621 %endrep 622 %endif 623 %if stack_size_padded > 0 624 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT 625 mov rsp, rstkm 626 %else 627 add rsp, stack_size_padded 628 %assign %%pad_size stack_size_padded 629 %endif 630 %endif 631 %if xmm_regs_used > 7 + high_mm_regs 632 movaps xmm7, [rsp + stack_offset - %%pad_size + 24] 633 %endif 634 %if xmm_regs_used > 6 + high_mm_regs 635 movaps xmm6, [rsp + stack_offset - %%pad_size + 8] 636 %endif 637%endmacro 638 639%macro WIN64_RESTORE_XMM 0 640 WIN64_RESTORE_XMM_INTERNAL 641 %assign stack_offset (stack_offset-stack_size_padded) 642 %assign stack_size_padded 0 643 %assign xmm_regs_used 0 644%endmacro 645 646%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs 647 648%macro RET 0 649 WIN64_RESTORE_XMM_INTERNAL 650 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 651 %if vzeroupper_required 652 vzeroupper 653 %endif 654 AUTO_REP_RET 655%endmacro 656 657%elif AOM_ARCH_X86_64 ; *nix x64 ;============================================= 658 659DECLARE_REG 0, rdi 660DECLARE_REG 1, rsi 661DECLARE_REG 2, rdx 662DECLARE_REG 3, rcx 663DECLARE_REG 4, R8 664DECLARE_REG 5, R9 665DECLARE_REG 6, rax, 8 666DECLARE_REG 7, R10, 16 667DECLARE_REG 8, R11, 24 668DECLARE_REG 9, rbx, 32 669DECLARE_REG 10, rbp, 40 670DECLARE_REG 11, R14, 48 671DECLARE_REG 12, R15, 56 672DECLARE_REG 13, R12, 64 673DECLARE_REG 14, R13, 72 674 675%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 676 %assign num_args %1 677 %assign regs_used %2 678 %assign xmm_regs_used %3 679 ASSERT regs_used >= num_args 680 SETUP_STACK_POINTER %4 681 ASSERT regs_used <= 15 682 PUSH_IF_USED 9, 10, 11, 12, 13, 14 683 ALLOC_STACK %4 684 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 685 DEFINE_ARGS_INTERNAL %0, %4, %5 686%endmacro 687 688%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required 689 690%macro RET 0 691 %if stack_size_padded > 0 692 %if required_stack_alignment > STACK_ALIGNMENT 693 mov rsp, rstkm 694 %else 695 add rsp, stack_size_padded 696 %endif 697 %endif 698 POP_IF_USED 14, 13, 12, 11, 10, 9 699 %if vzeroupper_required 700 vzeroupper 701 %endif 702 AUTO_REP_RET 703%endmacro 704 705%else ; X86_32 ;============================================================== 706 707DECLARE_REG 0, eax, 4 708DECLARE_REG 1, ecx, 8 709DECLARE_REG 2, edx, 12 710DECLARE_REG 3, ebx, 16 711DECLARE_REG 4, esi, 20 712DECLARE_REG 5, edi, 24 713DECLARE_REG 6, ebp, 28 714%define rsp esp 715 716%macro DECLARE_ARG 1-* 717 %rep %0 718 %define r%1m [rstk + stack_offset + 4*%1 + 4] 719 %define r%1mp dword r%1m 720 %rotate 1 721 %endrep 722%endmacro 723 724DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 725 726%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 0
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
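; e.g. (illustrative, hypothetical name) with INIT_XMM sse2 in effect,
;   cglobal foo, 2, 3, 5, dst, src, tmp
; emits the function under the mangled name of aom_foo_sse2 (assuming the
; default private_prefix) and invokes PROLOGUE 2, 3, 5, dst, src, tmp;
; later uses of "foo" via the call macro defined further below resolve to
; the same suffixed, mangled symbol.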
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function VISIBILITY
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function VISIBILITY
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data VISIBILITY
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
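; e.g. (illustrative) if the final function in a file ends with a bare "jmp"
; instead of RET/REP_RET/TAIL_CALL, it can invoke annotate_function_size
; explicitly after its last instruction so the size directive is still emitted.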
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
%assign cpuflags_avx      (1<<14)| cpuflags_sse42
%assign cpuflags_xop      (1<<15)| cpuflags_avx
%assign cpuflags_fma4     (1<<16)| cpuflags_avx
%assign cpuflags_fma3     (1<<17)| cpuflags_avx
%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL

%assign cpuflags_cache32  (1<<22)
%assign cpuflags_cache64  (1<<23)
%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<25)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if AOM_ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VER__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VER__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
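; e.g. (illustrative) after INIT_YMM avx2, "mova m0, [r0]" assembles as
; vmovdqa ymm0, [r0] and xm0 refers to xmm0 (the low half of the same
; register), whereas after INIT_XMM sse2, m0, xm0 and ym0 all map to xmm0.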
1026 1027%macro CAT_XDEFINE 3 1028 %xdefine %1%2 %3 1029%endmacro 1030 1031%macro CAT_UNDEF 2 1032 %undef %1%2 1033%endmacro 1034 1035%macro DEFINE_MMREGS 1 ; mmtype 1036 %assign %%prev_mmregs 0 1037 %ifdef num_mmregs 1038 %assign %%prev_mmregs num_mmregs 1039 %endif 1040 1041 %assign num_mmregs 8 1042 %if AOM_ARCH_X86_64 && mmsize >= 16 1043 %assign num_mmregs 16 1044 %if cpuflag(avx512) || mmsize == 64 1045 %assign num_mmregs 32 1046 %endif 1047 %endif 1048 1049 %assign %%i 0 1050 %rep num_mmregs 1051 CAT_XDEFINE m, %%i, %1 %+ %%i 1052 CAT_XDEFINE nn%1, %%i, %%i 1053 %assign %%i %%i+1 1054 %endrep 1055 %if %%prev_mmregs > num_mmregs 1056 %rep %%prev_mmregs - num_mmregs 1057 CAT_UNDEF m, %%i 1058 CAT_UNDEF nn %+ mmtype, %%i 1059 %assign %%i %%i+1 1060 %endrep 1061 %endif 1062 %xdefine mmtype %1 1063%endmacro 1064 1065; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper 1066%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg 1067 %if AOM_ARCH_X86_64 && cpuflag(avx512) 1068 %assign %%i %1 1069 %rep 16-%1 1070 %assign %%i_high %%i+16 1071 SWAP %%i, %%i_high 1072 %assign %%i %%i+1 1073 %endrep 1074 %endif 1075%endmacro 1076 1077%macro INIT_MMX 0-1+ 1078 %assign avx_enabled 0 1079 %define RESET_MM_PERMUTATION INIT_MMX %1 1080 %define mmsize 8 1081 %define mova movq 1082 %define movu movq 1083 %define movh movd 1084 %define movnta movntq 1085 INIT_CPUFLAGS %1 1086 DEFINE_MMREGS mm 1087%endmacro 1088 1089%macro INIT_XMM 0-1+ 1090 %assign avx_enabled 0 1091 %define RESET_MM_PERMUTATION INIT_XMM %1 1092 %define mmsize 16 1093 %define mova movdqa 1094 %define movu movdqu 1095 %define movh movq 1096 %define movnta movntdq 1097 INIT_CPUFLAGS %1 1098 DEFINE_MMREGS xmm 1099 %if WIN64 1100 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers 1101 %endif 1102%endmacro 1103 1104%macro INIT_YMM 0-1+ 1105 %assign avx_enabled 1 1106 %define RESET_MM_PERMUTATION INIT_YMM %1 1107 %define mmsize 32 1108 %define mova movdqa 1109 %define movu movdqu 1110 %undef movh 1111 %define movnta movntdq 1112 INIT_CPUFLAGS %1 1113 DEFINE_MMREGS ymm 1114 AVX512_MM_PERMUTATION 1115%endmacro 1116 1117%macro INIT_ZMM 0-1+ 1118 %assign avx_enabled 1 1119 %define RESET_MM_PERMUTATION INIT_ZMM %1 1120 %define mmsize 64 1121 %define mova movdqa 1122 %define movu movdqu 1123 %undef movh 1124 %define movnta movntdq 1125 INIT_CPUFLAGS %1 1126 DEFINE_MMREGS zmm 1127 AVX512_MM_PERMUTATION 1128%endmacro 1129 1130INIT_XMM 1131 1132%macro DECLARE_MMCAST 1 1133 %define mmmm%1 mm%1 1134 %define mmxmm%1 mm%1 1135 %define mmymm%1 mm%1 1136 %define mmzmm%1 mm%1 1137 %define xmmmm%1 mm%1 1138 %define xmmxmm%1 xmm%1 1139 %define xmmymm%1 xmm%1 1140 %define xmmzmm%1 xmm%1 1141 %define ymmmm%1 mm%1 1142 %define ymmxmm%1 xmm%1 1143 %define ymmymm%1 ymm%1 1144 %define ymmzmm%1 ymm%1 1145 %define zmmmm%1 mm%1 1146 %define zmmxmm%1 xmm%1 1147 %define zmmymm%1 ymm%1 1148 %define zmmzmm%1 zmm%1 1149 %define xm%1 xmm %+ m%1 1150 %define ym%1 ymm %+ m%1 1151 %define zm%1 zmm %+ m%1 1152%endmacro 1153 1154%assign i 0 1155%rep 32 1156 DECLARE_MMCAST i 1157 %assign i i+1 1158%endrep 1159 1160; I often want to use macros that permute their arguments. e.g. there's no 1161; efficient way to implement butterfly or transpose or dct without swapping some 1162; arguments. 1163; 1164; I would like to not have to manually keep track of the permutations: 1165; If I insert a permutation in the middle of a function, it should automatically 1166; change everything that follows. 
For more complex macros I may also have multiple 1167; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. 1168; 1169; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that 1170; permutes its arguments. It's equivalent to exchanging the contents of the 1171; registers, except that this way you exchange the register names instead, so it 1172; doesn't cost any cycles. 1173 1174%macro PERMUTE 2-* ; takes a list of pairs to swap 1175 %rep %0/2 1176 %xdefine %%tmp%2 m%2 1177 %rotate 2 1178 %endrep 1179 %rep %0/2 1180 %xdefine m%1 %%tmp%2 1181 CAT_XDEFINE nn, m%1, %1 1182 %rotate 2 1183 %endrep 1184%endmacro 1185 1186%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) 1187 %ifnum %1 ; SWAP 0, 1, ... 1188 SWAP_INTERNAL_NUM %1, %2 1189 %else ; SWAP m0, m1, ... 1190 SWAP_INTERNAL_NAME %1, %2 1191 %endif 1192%endmacro 1193 1194%macro SWAP_INTERNAL_NUM 2-* 1195 %rep %0-1 1196 %xdefine %%tmp m%1 1197 %xdefine m%1 m%2 1198 %xdefine m%2 %%tmp 1199 CAT_XDEFINE nn, m%1, %1 1200 CAT_XDEFINE nn, m%2, %2 1201 %rotate 1 1202 %endrep 1203%endmacro 1204 1205%macro SWAP_INTERNAL_NAME 2-* 1206 %xdefine %%args nn %+ %1 1207 %rep %0-1 1208 %xdefine %%args %%args, nn %+ %2 1209 %rotate 1 1210 %endrep 1211 SWAP_INTERNAL_NUM %%args 1212%endmacro 1213 1214; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later 1215; calls to that function will automatically load the permutation, so values can 1216; be returned in mmregs. 1217%macro SAVE_MM_PERMUTATION 0-1 1218 %if %0 1219 %xdefine %%f %1_m 1220 %else 1221 %xdefine %%f current_function %+ _m 1222 %endif 1223 %assign %%i 0 1224 %rep num_mmregs 1225 %xdefine %%tmp m %+ %%i 1226 CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp 1227 %assign %%i %%i+1 1228 %endrep 1229%endmacro 1230 1231%macro LOAD_MM_PERMUTATION 0-1 ; name to load from 1232 %if %0 1233 %xdefine %%f %1_m 1234 %else 1235 %xdefine %%f current_function %+ _m 1236 %endif 1237 %xdefine %%tmp %%f %+ 0 1238 %ifnum %%tmp 1239 RESET_MM_PERMUTATION 1240 %assign %%i 0 1241 %rep num_mmregs 1242 %xdefine %%tmp %%f %+ %%i 1243 CAT_XDEFINE %%m, %%i, m %+ %%tmp 1244 %assign %%i %%i+1 1245 %endrep 1246 %rep num_mmregs 1247 %assign %%i %%i-1 1248 CAT_XDEFINE m, %%i, %%m %+ %%i 1249 CAT_XDEFINE nn, m %+ %%i, %%i 1250 %endrep 1251 %endif 1252%endmacro 1253 1254; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't 1255%macro call 1 1256 %ifid %1 1257 call_internal %1 %+ SUFFIX, %1 1258 %else 1259 call %1 1260 %endif 1261%endmacro 1262%macro call_internal 2 1263 %xdefine %%i %2 1264 %ifndef cglobaled_%2 1265 %ifdef cglobaled_%1 1266 %xdefine %%i %1 1267 %endif 1268 %endif 1269 call %%i 1270 LOAD_MM_PERMUTATION %%i 1271%endmacro 1272 1273; Substitutions that reduce instruction size but are functionally equivalent 1274%macro add 2 1275 %ifnum %2 1276 %if %2==128 1277 sub %1, -128 1278 %else 1279 add %1, %2 1280 %endif 1281 %else 1282 add %1, %2 1283 %endif 1284%endmacro 1285 1286%macro sub 2 1287 %ifnum %2 1288 %if %2==128 1289 add %1, -128 1290 %else 1291 sub %1, %2 1292 %endif 1293 %else 1294 sub %1, %2 1295 %endif 1296%endmacro 1297 1298;============================================================================= 1299; AVX abstraction layer 1300;============================================================================= 1301 1302%assign i 0 1303%rep 32 1304 %if i < 8 1305 CAT_XDEFINE sizeofmm, i, 8 1306 CAT_XDEFINE regnumofmm, i, i 1307 %endif 1308 CAT_XDEFINE sizeofxmm, i, 16 1309 CAT_XDEFINE 
sizeofymm, i, 32 1310 CAT_XDEFINE sizeofzmm, i, 64 1311 CAT_XDEFINE regnumofxmm, i, i 1312 CAT_XDEFINE regnumofymm, i, i 1313 CAT_XDEFINE regnumofzmm, i, i 1314 %assign i i+1 1315%endrep 1316%undef i 1317 1318%macro CHECK_AVX_INSTR_EMU 3-* 1319 %xdefine %%opcode %1 1320 %xdefine %%dst %2 1321 %rep %0-2 1322 %ifidn %%dst, %3 1323 %error non-avx emulation of ``%%opcode'' is not supported 1324 %endif 1325 %rotate 1 1326 %endrep 1327%endmacro 1328 1329;%1 == instruction 1330;%2 == minimal instruction set 1331;%3 == 1 if float, 0 if int 1332;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) 1333;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not 1334;%6+: operands 1335%macro RUN_AVX_INSTR 6-9+ 1336 %ifnum sizeof%7 1337 %assign __sizeofreg sizeof%7 1338 %elifnum sizeof%6 1339 %assign __sizeofreg sizeof%6 1340 %else 1341 %assign __sizeofreg mmsize 1342 %endif 1343 %assign __emulate_avx 0 1344 %if avx_enabled && __sizeofreg >= 16 1345 %xdefine __instr v%1 1346 %else 1347 %xdefine __instr %1 1348 %if %0 >= 8+%4 1349 %assign __emulate_avx 1 1350 %endif 1351 %endif 1352 %ifnidn %2, fnord 1353 %ifdef cpuname 1354 %if notcpuflag(%2) 1355 %error use of ``%1'' %2 instruction in cpuname function: current_function 1356 %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) 1357 %error use of ``%1'' sse2 instruction in cpuname function: current_function 1358 %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) 1359 %error use of ``%1'' avx2 instruction in cpuname function: current_function 1360 %elif __sizeofreg == 16 && notcpuflag(sse) 1361 %error use of ``%1'' sse instruction in cpuname function: current_function 1362 %elif __sizeofreg == 32 && notcpuflag(avx) 1363 %error use of ``%1'' avx instruction in cpuname function: current_function 1364 %elif __sizeofreg == 64 && notcpuflag(avx512) 1365 %error use of ``%1'' avx512 instruction in cpuname function: current_function 1366 %elifidn %1, pextrw ; special case because the base instruction is mmx2, 1367 %ifnid %6 ; but sse4 is required for memory operands 1368 %if notcpuflag(sse4) 1369 %error use of ``%1'' sse4 instruction in cpuname function: current_function 1370 %endif 1371 %endif 1372 %endif 1373 %endif 1374 %endif 1375 1376 %if __emulate_avx 1377 %xdefine __src1 %7 1378 %xdefine __src2 %8 1379 %if %5 && %4 == 0 1380 %ifnidn %6, %7 1381 %ifidn %6, %8 1382 %xdefine __src1 %8 1383 %xdefine __src2 %7 1384 %elifnnum sizeof%8 1385 ; 3-operand AVX instructions with a memory arg can only have it in src2, 1386 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). 1387 ; So, if the instruction is commutative with a memory arg, swap them. 
1388 %xdefine __src1 %8 1389 %xdefine __src2 %7 1390 %endif 1391 %endif 1392 %endif 1393 %ifnidn %6, __src1 1394 %if %0 >= 9 1395 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 1396 %else 1397 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 1398 %endif 1399 %if __sizeofreg == 8 1400 MOVQ %6, __src1 1401 %elif %3 1402 MOVAPS %6, __src1 1403 %else 1404 MOVDQA %6, __src1 1405 %endif 1406 %endif 1407 %if %0 >= 9 1408 %1 %6, __src2, %9 1409 %else 1410 %1 %6, __src2 1411 %endif 1412 %elif %0 >= 9 1413 __instr %6, %7, %8, %9 1414 %elif %0 == 8 1415 %if avx_enabled && %5 1416 %xdefine __src1 %7 1417 %xdefine __src2 %8 1418 %ifnum regnumof%7 1419 %ifnum regnumof%8 1420 %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 1421 ; Most VEX-encoded instructions require an additional byte to encode when 1422 ; src2 is a high register (e.g. m8..15). If the instruction is commutative 1423 ; we can swap src1 and src2 when doing so reduces the instruction length. 1424 %xdefine __src1 %8 1425 %xdefine __src2 %7 1426 %endif 1427 %endif 1428 %endif 1429 __instr %6, __src1, __src2 1430 %else 1431 __instr %6, %7, %8 1432 %endif 1433 %elif %0 == 7 1434 %if avx_enabled && %5 1435 %xdefine __src1 %6 1436 %xdefine __src2 %7 1437 %ifnum regnumof%6 1438 %ifnum regnumof%7 1439 %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 1440 %xdefine __src1 %7 1441 %xdefine __src2 %6 1442 %endif 1443 %endif 1444 %endif 1445 __instr %6, __src1, __src2 1446 %else 1447 __instr %6, %7 1448 %endif 1449 %else 1450 __instr %6 1451 %endif 1452%endmacro 1453 1454;%1 == instruction 1455;%2 == minimal instruction set 1456;%3 == 1 if float, 0 if int 1457;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) 1458;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not 1459%macro AVX_INSTR 1-5 fnord, 0, 255, 0 1460 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 1461 %ifidn %2, fnord 1462 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 1463 %elifidn %3, fnord 1464 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 1465 %elifidn %4, fnord 1466 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 1467 %elifidn %5, fnord 1468 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 1469 %else 1470 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 1471 %endif 1472 %endmacro 1473%endmacro 1474 1475; Instructions with both VEX/EVEX and legacy encodings 1476; Non-destructive instructions are written without parameters 1477AVX_INSTR addpd, sse2, 1, 0, 1 1478AVX_INSTR addps, sse, 1, 0, 1 1479AVX_INSTR addsd, sse2, 1, 0, 0 1480AVX_INSTR addss, sse, 1, 0, 0 1481AVX_INSTR addsubpd, sse3, 1, 0, 0 1482AVX_INSTR addsubps, sse3, 1, 0, 0 1483AVX_INSTR aesdec, aesni, 0, 0, 0 1484AVX_INSTR aesdeclast, aesni, 0, 0, 0 1485AVX_INSTR aesenc, aesni, 0, 0, 0 1486AVX_INSTR aesenclast, aesni, 0, 0, 0 1487AVX_INSTR aesimc, aesni 1488AVX_INSTR aeskeygenassist, aesni 1489AVX_INSTR andnpd, sse2, 1, 0, 0 1490AVX_INSTR andnps, sse, 1, 0, 0 1491AVX_INSTR andpd, sse2, 1, 0, 1 1492AVX_INSTR andps, sse, 1, 0, 1 1493AVX_INSTR blendpd, sse4, 1, 1, 0 1494AVX_INSTR blendps, sse4, 1, 1, 0 1495AVX_INSTR blendvpd, sse4 ; can't be emulated 1496AVX_INSTR blendvps, sse4 ; can't be emulated 1497AVX_INSTR cmpeqpd, sse2, 1, 0, 1 1498AVX_INSTR cmpeqps, sse, 1, 0, 1 1499AVX_INSTR cmpeqsd, sse2, 1, 0, 0 1500AVX_INSTR cmpeqss, sse, 1, 0, 0 1501AVX_INSTR cmplepd, sse2, 1, 0, 0 1502AVX_INSTR cmpleps, sse, 1, 0, 0 1503AVX_INSTR cmplesd, sse2, 1, 0, 0 1504AVX_INSTR cmpless, sse, 1, 0, 0 1505AVX_INSTR cmpltpd, sse2, 1, 0, 0 1506AVX_INSTR cmpltps, sse, 1, 0, 0 1507AVX_INSTR cmpltsd, sse2, 1, 0, 0 1508AVX_INSTR cmpltss, sse, 1, 0, 0 1509AVX_INSTR cmpneqpd, sse2, 1, 0, 1 1510AVX_INSTR cmpneqps, sse, 1, 0, 1 1511AVX_INSTR cmpneqsd, sse2, 1, 0, 0 1512AVX_INSTR cmpneqss, sse, 1, 0, 0 1513AVX_INSTR cmpnlepd, sse2, 1, 0, 0 1514AVX_INSTR cmpnleps, sse, 1, 0, 0 1515AVX_INSTR cmpnlesd, sse2, 1, 0, 0 1516AVX_INSTR cmpnless, sse, 1, 0, 0 1517AVX_INSTR cmpnltpd, sse2, 1, 0, 0 1518AVX_INSTR cmpnltps, sse, 1, 0, 0 1519AVX_INSTR cmpnltsd, sse2, 1, 0, 0 1520AVX_INSTR cmpnltss, sse, 1, 0, 0 1521AVX_INSTR cmpordpd, sse2 1, 0, 1 1522AVX_INSTR cmpordps, sse 1, 0, 1 1523AVX_INSTR cmpordsd, sse2 1, 0, 0 1524AVX_INSTR cmpordss, sse 1, 0, 0 1525AVX_INSTR cmppd, sse2, 1, 1, 0 1526AVX_INSTR cmpps, sse, 1, 1, 0 1527AVX_INSTR cmpsd, sse2, 1, 1, 0 1528AVX_INSTR cmpss, sse, 1, 1, 0 1529AVX_INSTR cmpunordpd, sse2, 1, 0, 1 1530AVX_INSTR cmpunordps, sse, 1, 0, 1 1531AVX_INSTR cmpunordsd, sse2, 1, 0, 0 1532AVX_INSTR cmpunordss, sse, 1, 0, 0 1533AVX_INSTR comisd, sse2, 1 1534AVX_INSTR comiss, sse, 1 1535AVX_INSTR cvtdq2pd, sse2, 1 1536AVX_INSTR cvtdq2ps, sse2, 1 1537AVX_INSTR cvtpd2dq, sse2, 1 1538AVX_INSTR cvtpd2ps, sse2, 1 1539AVX_INSTR cvtps2dq, sse2, 1 1540AVX_INSTR cvtps2pd, sse2, 1 1541AVX_INSTR cvtsd2si, sse2, 1 1542AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 1543AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 1544AVX_INSTR cvtsi2ss, sse, 1, 0, 0 1545AVX_INSTR cvtss2sd, sse2, 1, 0, 0 1546AVX_INSTR cvtss2si, sse, 1 1547AVX_INSTR cvttpd2dq, sse2, 1 1548AVX_INSTR cvttps2dq, sse2, 1 1549AVX_INSTR cvttsd2si, sse2, 1 1550AVX_INSTR cvttss2si, sse, 1 1551AVX_INSTR divpd, sse2, 1, 0, 0 1552AVX_INSTR divps, sse, 1, 0, 0 1553AVX_INSTR divsd, sse2, 1, 0, 0 1554AVX_INSTR divss, sse, 1, 0, 0 1555AVX_INSTR dppd, sse4, 1, 1, 0 1556AVX_INSTR dpps, 
sse4, 1, 1, 0 1557AVX_INSTR extractps, sse4, 1 1558AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 1559AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 1560AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 1561AVX_INSTR haddpd, sse3, 1, 0, 0 1562AVX_INSTR haddps, sse3, 1, 0, 0 1563AVX_INSTR hsubpd, sse3, 1, 0, 0 1564AVX_INSTR hsubps, sse3, 1, 0, 0 1565AVX_INSTR insertps, sse4, 1, 1, 0 1566AVX_INSTR lddqu, sse3 1567AVX_INSTR ldmxcsr, sse, 1 1568AVX_INSTR maskmovdqu, sse2 1569AVX_INSTR maxpd, sse2, 1, 0, 1 1570AVX_INSTR maxps, sse, 1, 0, 1 1571AVX_INSTR maxsd, sse2, 1, 0, 0 1572AVX_INSTR maxss, sse, 1, 0, 0 1573AVX_INSTR minpd, sse2, 1, 0, 1 1574AVX_INSTR minps, sse, 1, 0, 1 1575AVX_INSTR minsd, sse2, 1, 0, 0 1576AVX_INSTR minss, sse, 1, 0, 0 1577AVX_INSTR movapd, sse2, 1 1578AVX_INSTR movaps, sse, 1 1579AVX_INSTR movd, mmx 1580AVX_INSTR movddup, sse3, 1 1581AVX_INSTR movdqa, sse2 1582AVX_INSTR movdqu, sse2 1583AVX_INSTR movhlps, sse, 1, 0, 0 1584AVX_INSTR movhpd, sse2, 1, 0, 0 1585AVX_INSTR movhps, sse, 1, 0, 0 1586AVX_INSTR movlhps, sse, 1, 0, 0 1587AVX_INSTR movlpd, sse2, 1, 0, 0 1588AVX_INSTR movlps, sse, 1, 0, 0 1589AVX_INSTR movmskpd, sse2, 1 1590AVX_INSTR movmskps, sse, 1 1591AVX_INSTR movntdq, sse2 1592AVX_INSTR movntdqa, sse4 1593AVX_INSTR movntpd, sse2, 1 1594AVX_INSTR movntps, sse, 1 1595AVX_INSTR movq, mmx 1596AVX_INSTR movsd, sse2, 1, 0, 0 1597AVX_INSTR movshdup, sse3, 1 1598AVX_INSTR movsldup, sse3, 1 1599AVX_INSTR movss, sse, 1, 0, 0 1600AVX_INSTR movupd, sse2, 1 1601AVX_INSTR movups, sse, 1 1602AVX_INSTR mpsadbw, sse4, 0, 1, 0 1603AVX_INSTR mulpd, sse2, 1, 0, 1 1604AVX_INSTR mulps, sse, 1, 0, 1 1605AVX_INSTR mulsd, sse2, 1, 0, 0 1606AVX_INSTR mulss, sse, 1, 0, 0 1607AVX_INSTR orpd, sse2, 1, 0, 1 1608AVX_INSTR orps, sse, 1, 0, 1 1609AVX_INSTR pabsb, ssse3 1610AVX_INSTR pabsd, ssse3 1611AVX_INSTR pabsw, ssse3 1612AVX_INSTR packsswb, mmx, 0, 0, 0 1613AVX_INSTR packssdw, mmx, 0, 0, 0 1614AVX_INSTR packuswb, mmx, 0, 0, 0 1615AVX_INSTR packusdw, sse4, 0, 0, 0 1616AVX_INSTR paddb, mmx, 0, 0, 1 1617AVX_INSTR paddw, mmx, 0, 0, 1 1618AVX_INSTR paddd, mmx, 0, 0, 1 1619AVX_INSTR paddq, sse2, 0, 0, 1 1620AVX_INSTR paddsb, mmx, 0, 0, 1 1621AVX_INSTR paddsw, mmx, 0, 0, 1 1622AVX_INSTR paddusb, mmx, 0, 0, 1 1623AVX_INSTR paddusw, mmx, 0, 0, 1 1624AVX_INSTR palignr, ssse3, 0, 1, 0 1625AVX_INSTR pand, mmx, 0, 0, 1 1626AVX_INSTR pandn, mmx, 0, 0, 0 1627AVX_INSTR pavgb, mmx2, 0, 0, 1 1628AVX_INSTR pavgw, mmx2, 0, 0, 1 1629AVX_INSTR pblendvb, sse4 ; can't be emulated 1630AVX_INSTR pblendw, sse4, 0, 1, 0 1631AVX_INSTR pclmulqdq, fnord, 0, 1, 0 1632AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 1633AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 1634AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 1635AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 1636AVX_INSTR pcmpestri, sse42 1637AVX_INSTR pcmpestrm, sse42 1638AVX_INSTR pcmpistri, sse42 1639AVX_INSTR pcmpistrm, sse42 1640AVX_INSTR pcmpeqb, mmx, 0, 0, 1 1641AVX_INSTR pcmpeqw, mmx, 0, 0, 1 1642AVX_INSTR pcmpeqd, mmx, 0, 0, 1 1643AVX_INSTR pcmpeqq, sse4, 0, 0, 1 1644AVX_INSTR pcmpgtb, mmx, 0, 0, 0 1645AVX_INSTR pcmpgtw, mmx, 0, 0, 0 1646AVX_INSTR pcmpgtd, mmx, 0, 0, 0 1647AVX_INSTR pcmpgtq, sse42, 0, 0, 0 1648AVX_INSTR pextrb, sse4 1649AVX_INSTR pextrd, sse4 1650AVX_INSTR pextrq, sse4 1651AVX_INSTR pextrw, mmx2 1652AVX_INSTR phaddw, ssse3, 0, 0, 0 1653AVX_INSTR phaddd, ssse3, 0, 0, 0 1654AVX_INSTR phaddsw, ssse3, 0, 0, 0 1655AVX_INSTR phminposuw, sse4 1656AVX_INSTR phsubw, ssse3, 0, 0, 0 1657AVX_INSTR phsubd, ssse3, 0, 0, 0 1658AVX_INSTR phsubsw, ssse3, 0, 0, 0 1659AVX_INSTR pinsrb, sse4, 0, 1, 0 1660AVX_INSTR 
pinsrd, sse4, 0, 1, 0 1661AVX_INSTR pinsrq, sse4, 0, 1, 0 1662AVX_INSTR pinsrw, mmx2, 0, 1, 0 1663AVX_INSTR pmaddwd, mmx, 0, 0, 1 1664AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 1665AVX_INSTR pmaxsb, sse4, 0, 0, 1 1666AVX_INSTR pmaxsw, mmx2, 0, 0, 1 1667AVX_INSTR pmaxsd, sse4, 0, 0, 1 1668AVX_INSTR pmaxub, mmx2, 0, 0, 1 1669AVX_INSTR pmaxuw, sse4, 0, 0, 1 1670AVX_INSTR pmaxud, sse4, 0, 0, 1 1671AVX_INSTR pminsb, sse4, 0, 0, 1 1672AVX_INSTR pminsw, mmx2, 0, 0, 1 1673AVX_INSTR pminsd, sse4, 0, 0, 1 1674AVX_INSTR pminub, mmx2, 0, 0, 1 1675AVX_INSTR pminuw, sse4, 0, 0, 1 1676AVX_INSTR pminud, sse4, 0, 0, 1 1677AVX_INSTR pmovmskb, mmx2 1678AVX_INSTR pmovsxbw, sse4 1679AVX_INSTR pmovsxbd, sse4 1680AVX_INSTR pmovsxbq, sse4 1681AVX_INSTR pmovsxwd, sse4 1682AVX_INSTR pmovsxwq, sse4 1683AVX_INSTR pmovsxdq, sse4 1684AVX_INSTR pmovzxbw, sse4 1685AVX_INSTR pmovzxbd, sse4 1686AVX_INSTR pmovzxbq, sse4 1687AVX_INSTR pmovzxwd, sse4 1688AVX_INSTR pmovzxwq, sse4 1689AVX_INSTR pmovzxdq, sse4 1690AVX_INSTR pmuldq, sse4, 0, 0, 1 1691AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 1692AVX_INSTR pmulhuw, mmx2, 0, 0, 1 1693AVX_INSTR pmulhw, mmx, 0, 0, 1 1694AVX_INSTR pmullw, mmx, 0, 0, 1 1695AVX_INSTR pmulld, sse4, 0, 0, 1 1696AVX_INSTR pmuludq, sse2, 0, 0, 1 1697AVX_INSTR por, mmx, 0, 0, 1 1698AVX_INSTR psadbw, mmx2, 0, 0, 1 1699AVX_INSTR pshufb, ssse3, 0, 0, 0 1700AVX_INSTR pshufd, sse2 1701AVX_INSTR pshufhw, sse2 1702AVX_INSTR pshuflw, sse2 1703AVX_INSTR psignb, ssse3, 0, 0, 0 1704AVX_INSTR psignw, ssse3, 0, 0, 0 1705AVX_INSTR psignd, ssse3, 0, 0, 0 1706AVX_INSTR psllw, mmx, 0, 0, 0 1707AVX_INSTR pslld, mmx, 0, 0, 0 1708AVX_INSTR psllq, mmx, 0, 0, 0 1709AVX_INSTR pslldq, sse2, 0, 0, 0 1710AVX_INSTR psraw, mmx, 0, 0, 0 1711AVX_INSTR psrad, mmx, 0, 0, 0 1712AVX_INSTR psrlw, mmx, 0, 0, 0 1713AVX_INSTR psrld, mmx, 0, 0, 0 1714AVX_INSTR psrlq, mmx, 0, 0, 0 1715AVX_INSTR psrldq, sse2, 0, 0, 0 1716AVX_INSTR psubb, mmx, 0, 0, 0 1717AVX_INSTR psubw, mmx, 0, 0, 0 1718AVX_INSTR psubd, mmx, 0, 0, 0 1719AVX_INSTR psubq, sse2, 0, 0, 0 1720AVX_INSTR psubsb, mmx, 0, 0, 0 1721AVX_INSTR psubsw, mmx, 0, 0, 0 1722AVX_INSTR psubusb, mmx, 0, 0, 0 1723AVX_INSTR psubusw, mmx, 0, 0, 0 1724AVX_INSTR ptest, sse4 1725AVX_INSTR punpckhbw, mmx, 0, 0, 0 1726AVX_INSTR punpckhwd, mmx, 0, 0, 0 1727AVX_INSTR punpckhdq, mmx, 0, 0, 0 1728AVX_INSTR punpckhqdq, sse2, 0, 0, 0 1729AVX_INSTR punpcklbw, mmx, 0, 0, 0 1730AVX_INSTR punpcklwd, mmx, 0, 0, 0 1731AVX_INSTR punpckldq, mmx, 0, 0, 0 1732AVX_INSTR punpcklqdq, sse2, 0, 0, 0 1733AVX_INSTR pxor, mmx, 0, 0, 1 1734AVX_INSTR rcpps, sse, 1 1735AVX_INSTR rcpss, sse, 1, 0, 0 1736AVX_INSTR roundpd, sse4, 1 1737AVX_INSTR roundps, sse4, 1 1738AVX_INSTR roundsd, sse4, 1, 1, 0 1739AVX_INSTR roundss, sse4, 1, 1, 0 1740AVX_INSTR rsqrtps, sse, 1 1741AVX_INSTR rsqrtss, sse, 1, 0, 0 1742AVX_INSTR shufpd, sse2, 1, 1, 0 1743AVX_INSTR shufps, sse, 1, 1, 0 1744AVX_INSTR sqrtpd, sse2, 1 1745AVX_INSTR sqrtps, sse, 1 1746AVX_INSTR sqrtsd, sse2, 1, 0, 0 1747AVX_INSTR sqrtss, sse, 1, 0, 0 1748AVX_INSTR stmxcsr, sse, 1 1749AVX_INSTR subpd, sse2, 1, 0, 0 1750AVX_INSTR subps, sse, 1, 0, 0 1751AVX_INSTR subsd, sse2, 1, 0, 0 1752AVX_INSTR subss, sse, 1, 0, 0 1753AVX_INSTR ucomisd, sse2, 1 1754AVX_INSTR ucomiss, sse, 1 1755AVX_INSTR unpckhpd, sse2, 1, 0, 0 1756AVX_INSTR unpckhps, sse, 1, 0, 0 1757AVX_INSTR unpcklpd, sse2, 1, 0, 0 1758AVX_INSTR unpcklps, sse, 1, 0, 0 1759AVX_INSTR xorpd, sse2, 1, 0, 1 1760AVX_INSTR xorps, sse, 1, 0, 1 1761 1762; 3DNow instructions, for sharing code between AVX, SSE and 3DN 1763AVX_INSTR pfadd, 3dnow, 1, 0, 1 
1764AVX_INSTR pfsub, 3dnow, 1, 0, 0 1765AVX_INSTR pfmul, 3dnow, 1, 0, 1 1766 1767;%1 == instruction 1768;%2 == minimal instruction set 1769%macro GPR_INSTR 2 1770 %macro %1 2-5 fnord, %1, %2 1771 %ifdef cpuname 1772 %if notcpuflag(%5) 1773 %error use of ``%4'' %5 instruction in cpuname function: current_function 1774 %endif 1775 %endif 1776 %ifidn %3, fnord 1777 %4 %1, %2 1778 %else 1779 %4 %1, %2, %3 1780 %endif 1781 %endmacro 1782%endmacro 1783 1784GPR_INSTR andn, bmi1 1785GPR_INSTR bextr, bmi1 1786GPR_INSTR blsi, bmi1 1787GPR_INSTR blsr, bmi1 1788GPR_INSTR blsmsk, bmi1 1789GPR_INSTR bzhi, bmi2 1790GPR_INSTR mulx, bmi2 1791GPR_INSTR pdep, bmi2 1792GPR_INSTR pext, bmi2 1793GPR_INSTR popcnt, sse42 1794GPR_INSTR rorx, bmi2 1795GPR_INSTR sarx, bmi2 1796GPR_INSTR shlx, bmi2 1797GPR_INSTR shrx, bmi2 1798 1799; base-4 constants for shuffles 1800%assign i 0 1801%rep 256 1802 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) 1803 %if j < 10 1804 CAT_XDEFINE q000, j, i 1805 %elif j < 100 1806 CAT_XDEFINE q00, j, i 1807 %elif j < 1000 1808 CAT_XDEFINE q0, j, i 1809 %else 1810 CAT_XDEFINE q, j, i 1811 %endif 1812 %assign i i+1 1813%endrep 1814%undef i 1815%undef j 1816 1817%macro FMA_INSTR 3 1818 %macro %1 4-7 %1, %2, %3 1819 %if cpuflag(xop) 1820 v%5 %1, %2, %3, %4 1821 %elifnidn %1, %4 1822 %6 %1, %2, %3 1823 %7 %1, %4 1824 %else 1825 %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported 1826 %endif 1827 %endmacro 1828%endmacro 1829 1830FMA_INSTR pmacsww, pmullw, paddw 1831FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation 1832FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation 1833FMA_INSTR pmadcswd, pmaddwd, paddd 1834 1835; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. 1836; FMA3 is only possible if dst is the same as one of the src registers. 1837; Either src2 or src3 can be a memory operand. 1838%macro FMA4_INSTR 2-* 1839 %push fma4_instr 1840 %xdefine %$prefix %1 1841 %rep %0 - 1 1842 %macro %$prefix%2 4-6 %$prefix, %2 1843 %if notcpuflag(fma3) && notcpuflag(fma4) 1844 %error use of ``%5%6'' fma instruction in cpuname function: current_function 1845 %elif cpuflag(fma4) 1846 v%5%6 %1, %2, %3, %4 1847 %elifidn %1, %2 1848 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 1849 %ifnum sizeof%3 1850 v%{5}213%6 %2, %3, %4 1851 %else 1852 v%{5}132%6 %2, %4, %3 1853 %endif 1854 %elifidn %1, %3 1855 v%{5}213%6 %3, %2, %4 1856 %elifidn %1, %4 1857 v%{5}231%6 %4, %2, %3 1858 %else 1859 %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported 1860 %endif 1861 %endmacro 1862 %rotate 1 1863 %endrep 1864 %pop 1865%endmacro 1866 1867FMA4_INSTR fmadd, pd, ps, sd, ss 1868FMA4_INSTR fmaddsub, pd, ps 1869FMA4_INSTR fmsub, pd, ps, sd, ss 1870FMA4_INSTR fmsubadd, pd, ps 1871FMA4_INSTR fnmadd, pd, ps, sd, ss 1872FMA4_INSTR fnmsub, pd, ps, sd, ss 1873 1874; Macros for converting VEX instructions to equivalent EVEX ones. 
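; The EVEX form is used when an operand is a zmm register or one of
; xmm/ymm16-31, or when the EVEX variant is marked as preferred on AVX-512
; targets; otherwise the shorter VEX encoding is kept. e.g. (illustrative)
; "vmovdqa ymm20, [r0]" is rewritten as vmovdqa32 while "vmovdqa ymm0, [r0]"
; keeps the VEX encoding.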
1875%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex 1876 %macro %1 2-7 fnord, fnord, %1, %2, %3 1877 %ifidn %3, fnord 1878 %define %%args %1, %2 1879 %elifidn %4, fnord 1880 %define %%args %1, %2, %3 1881 %else 1882 %define %%args %1, %2, %3, %4 1883 %endif 1884 %assign %%evex_required cpuflag(avx512) & %7 1885 %ifnum regnumof%1 1886 %if regnumof%1 >= 16 || sizeof%1 > 32 1887 %assign %%evex_required 1 1888 %endif 1889 %endif 1890 %ifnum regnumof%2 1891 %if regnumof%2 >= 16 || sizeof%2 > 32 1892 %assign %%evex_required 1 1893 %endif 1894 %endif 1895 %ifnum regnumof%3 1896 %if regnumof%3 >= 16 || sizeof%3 > 32 1897 %assign %%evex_required 1 1898 %endif 1899 %endif 1900 %if %%evex_required 1901 %6 %%args 1902 %else 1903 %5 %%args ; Prefer VEX over EVEX due to shorter instruction length 1904 %endif 1905 %endmacro 1906%endmacro 1907 1908EVEX_INSTR vbroadcastf128, vbroadcastf32x4 1909EVEX_INSTR vbroadcasti128, vbroadcasti32x4 1910EVEX_INSTR vextractf128, vextractf32x4 1911EVEX_INSTR vextracti128, vextracti32x4 1912EVEX_INSTR vinsertf128, vinsertf32x4 1913EVEX_INSTR vinserti128, vinserti32x4 1914EVEX_INSTR vmovdqa, vmovdqa32 1915EVEX_INSTR vmovdqu, vmovdqu32 1916EVEX_INSTR vpand, vpandd 1917EVEX_INSTR vpandn, vpandnd 1918EVEX_INSTR vpor, vpord 1919EVEX_INSTR vpxor, vpxord 1920EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision 1921EVEX_INSTR vrcpss, vrcp14ss, 1 1922EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 1923EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 1924
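; Minimal usage sketch (illustrative only, not part of the abstraction layer).
; The function name and arguments are hypothetical; it assumes size is a
; positive multiple of mmsize and that both buffers are 16-byte aligned.
;
;   INIT_XMM sse2
;   cglobal add_bytes, 3, 3, 1, dst, src, size
;       add          dstq, sizeq
;       add          srcq, sizeq
;       neg          sizeq
;   .loop:
;       mova           m0, [dstq+sizeq]
;       paddb          m0, [srcq+sizeq]
;       mova [dstq+sizeq], m0
;       add          sizeq, mmsize
;       jl .loop
;       RET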