1;***************************************************************************** 2;* x86inc.asm: x86 abstraction layer 3;***************************************************************************** 4;* Copyright (C) 2005-2024 x264 project 5;* 6;* Authors: Loren Merritt <lorenm@u.washington.edu> 7;* Henrik Gramner <henrik@gramner.com> 8;* Anton Mitrofanov <BugMaster@narod.ru> 9;* Fiona Glaser <fiona@x264.com> 10;* 11;* Permission to use, copy, modify, and/or distribute this software for any 12;* purpose with or without fee is hereby granted, provided that the above 13;* copyright notice and this permission notice appear in all copies. 14;* 15;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 16;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 17;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 18;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 20;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 21;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 22;***************************************************************************** 23 24; This is a header file for the x86inc.asm assembly language, which uses 25; NASM/YASM syntax combined with a large number of macros to provide easy 26; abstraction between different calling conventions (x86_32, win64, linux64). 27; It also has various other useful features to simplify writing the kind of 28; DSP functions that are most often used. 29 30%ifndef private_prefix 31 %error private_prefix not defined 32%endif 33 34%ifndef public_prefix 35 %define public_prefix private_prefix 36%endif 37 38%ifndef STACK_ALIGNMENT 39 %if ARCH_X86_64 40 %define STACK_ALIGNMENT 16 41 %else 42 %define STACK_ALIGNMENT 4 43 %endif 44%endif 45 46%define WIN64 0 47%define UNIX64 0 48%if ARCH_X86_64 49 %ifidn __OUTPUT_FORMAT__,win32 50 %define WIN64 1 51 %elifidn __OUTPUT_FORMAT__,win64 52 %define WIN64 1 53 %elifidn __OUTPUT_FORMAT__,x64 54 %define WIN64 1 55 %else 56 %define UNIX64 1 57 %endif 58%endif 59 60%define FORMAT_ELF 0 61%define FORMAT_MACHO 0 62%ifidn __OUTPUT_FORMAT__,elf 63 %define FORMAT_ELF 1 64%elifidn __OUTPUT_FORMAT__,elf32 65 %define FORMAT_ELF 1 66%elifidn __OUTPUT_FORMAT__,elf64 67 %define FORMAT_ELF 1 68%elifidn __OUTPUT_FORMAT__,macho 69 %define FORMAT_MACHO 1 70%elifidn __OUTPUT_FORMAT__,macho32 71 %define FORMAT_MACHO 1 72%elifidn __OUTPUT_FORMAT__,macho64 73 %define FORMAT_MACHO 1 74%endif 75 76%ifdef PREFIX 77 %define mangle(x) _ %+ x 78%else 79 %define mangle(x) x 80%endif 81 82; Use VEX-encoding even in non-AVX functions 83%ifndef FORCE_VEX_ENCODING 84 %define FORCE_VEX_ENCODING 0 85%endif 86 87%macro SECTION_RODATA 0-1 16 88 %ifidn __OUTPUT_FORMAT__,win32 89 SECTION .rdata align=%1 90 %elif WIN64 91 SECTION .rdata align=%1 92 %else 93 SECTION .rodata align=%1 94 %endif 95%endmacro 96 97%if ARCH_X86_64 98 %define PIC 1 ; always use PIC on x86-64 99 default rel 100%elifidn __OUTPUT_FORMAT__,win32 101 %define PIC 0 ; PIC isn't used on 32-bit Windows 102%elifndef PIC 103 %define PIC 0 104%endif 105 106%define HAVE_PRIVATE_EXTERN 1 107%ifdef __NASM_VERSION_ID__ 108 %use smartalign 109 %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 110 %define HAVE_PRIVATE_EXTERN 0 111 %endif 112%endif 113 114; Macros to eliminate most code duplication between x86_32 and x86_64: 115; Currently this works only for leaf functions which load all their arguments 116; into 
registers at the start, and make no other use of the stack. Luckily that 117; covers most use cases. 118 119; PROLOGUE: 120; %1 = number of arguments. loads them from stack if needed. 121; %2 = number of registers used. pushes callee-saved regs if needed. 122; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. 123; %4 = (optional) stack size to be allocated. The stack will be aligned before 124; allocating the specified stack size. If the required stack alignment is 125; larger than the known stack alignment the stack will be manually aligned 126; and an extra register will be allocated to hold the original stack 127; pointer (to not invalidate r0m etc.). To prevent the use of an extra 128; register as stack pointer, request a negative stack size. 129; %4+/%5+ = list of names to define to registers 130; PROLOGUE can also be invoked by adding the same options to cglobal 131 132; e.g. 133; cglobal foo, 2,3,7,0x40, dst, src, tmp 134; declares a function (foo) that automatically loads two arguments (dst and 135; src) into registers, uses one additional register (tmp) plus 7 vector 136; registers (m0-m6) and allocates 0x40 bytes of stack space. 137 138; TODO Some functions can use some args directly from the stack. If they're the 139; last args then you can just not declare them, but if they're in the middle 140; we need more flexible macro. 141 142; RET: 143; Pops anything that was pushed by PROLOGUE, and returns. 144 145; REP_RET: 146; Use this instead of RET if it's a branch target. 147 148; registers: 149; rN and rNq are the native-size register holding function argument N 150; rNd, rNw, rNb are dword, word, and byte size 151; rNh is the high 8 bits of the word size 152; rNm is the original location of arg N (a register or on the stack), dword 153; rNmp is native size 154 155%macro DECLARE_REG 2-3 156 %define r%1q %2 157 %define r%1d %2d 158 %define r%1w %2w 159 %define r%1b %2b 160 %define r%1h %2h 161 %define %2q %2 162 %if %0 == 2 163 %define r%1m %2d 164 %define r%1mp %2 165 %elif ARCH_X86_64 ; memory 166 %define r%1m [rstk + stack_offset + %3] 167 %define r%1mp qword r %+ %1 %+ m 168 %else 169 %define r%1m [rstk + stack_offset + %3] 170 %define r%1mp dword r %+ %1 %+ m 171 %endif 172 %define r%1 %2 173%endmacro 174 175%macro DECLARE_REG_SIZE 3 176 %define r%1q r%1 177 %define e%1q r%1 178 %define r%1d e%1 179 %define e%1d e%1 180 %define r%1w %1 181 %define e%1w %1 182 %define r%1h %3 183 %define e%1h %3 184 %define r%1b %2 185 %define e%1b %2 186 %if ARCH_X86_64 == 0 187 %define r%1 e%1 188 %endif 189%endmacro 190 191DECLARE_REG_SIZE ax, al, ah 192DECLARE_REG_SIZE bx, bl, bh 193DECLARE_REG_SIZE cx, cl, ch 194DECLARE_REG_SIZE dx, dl, dh 195DECLARE_REG_SIZE si, sil, null 196DECLARE_REG_SIZE di, dil, null 197DECLARE_REG_SIZE bp, bpl, null 198 199; t# defines for when per-arch register allocation is more complex than just function arguments 200 201%macro DECLARE_REG_TMP 1-* 202 %assign %%i 0 203 %rep %0 204 CAT_XDEFINE t, %%i, r%1 205 %assign %%i %%i+1 206 %rotate 1 207 %endrep 208%endmacro 209 210%macro DECLARE_REG_TMP_SIZE 0-* 211 %rep %0 212 %define t%1q t%1 %+ q 213 %define t%1d t%1 %+ d 214 %define t%1w t%1 %+ w 215 %define t%1h t%1 %+ h 216 %define t%1b t%1 %+ b 217 %rotate 1 218 %endrep 219%endmacro 220 221DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 222 223%if ARCH_X86_64 224 %define gprsize 8 225%else 226 %define gprsize 4 227%endif 228 229%macro LEA 2 230%if ARCH_X86_64 231 lea %1, [%2] 232%elif PIC 233 call $+5 ; special-cased to not affect 
the RSB on most CPU:s 234 pop %1 235 add %1, -$+1+%2 236%else 237 mov %1, %2 238%endif 239%endmacro 240 241; Repeats an instruction/operation for multiple arguments. 242; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" 243%macro REPX 2-* ; operation, args 244 %xdefine %%f(x) %1 245 %rep %0 - 1 246 %rotate 1 247 %%f(%1) 248 %endrep 249%endmacro 250 251%macro PUSH 1 252 push %1 253 %ifidn rstk, rsp 254 %assign stack_offset stack_offset+gprsize 255 %endif 256%endmacro 257 258%macro POP 1 259 pop %1 260 %ifidn rstk, rsp 261 %assign stack_offset stack_offset-gprsize 262 %endif 263%endmacro 264 265%macro PUSH_IF_USED 1-* 266 %rep %0 267 %if %1 < regs_used 268 PUSH r%1 269 %endif 270 %rotate 1 271 %endrep 272%endmacro 273 274%macro POP_IF_USED 1-* 275 %rep %0 276 %if %1 < regs_used 277 pop r%1 278 %endif 279 %rotate 1 280 %endrep 281%endmacro 282 283%macro LOAD_IF_USED 1-* 284 %rep %0 285 %if %1 < num_args 286 mov r%1, r %+ %1 %+ mp 287 %endif 288 %rotate 1 289 %endrep 290%endmacro 291 292%macro SUB 2 293 sub %1, %2 294 %ifidn %1, rstk 295 %assign stack_offset stack_offset+(%2) 296 %endif 297%endmacro 298 299%macro ADD 2 300 add %1, %2 301 %ifidn %1, rstk 302 %assign stack_offset stack_offset-(%2) 303 %endif 304%endmacro 305 306%macro movifnidn 2 307 %ifnidn %1, %2 308 mov %1, %2 309 %endif 310%endmacro 311 312%if ARCH_X86_64 == 0 313 %define movsxd movifnidn 314%endif 315 316%macro movsxdifnidn 2 317 %ifnidn %1, %2 318 movsxd %1, %2 319 %endif 320%endmacro 321 322%macro ASSERT 1 323 %if (%1) == 0 324 %error assertion ``%1'' failed 325 %endif 326%endmacro 327 328%macro DEFINE_ARGS 0-* 329 %ifdef n_arg_names 330 %assign %%i 0 331 %rep n_arg_names 332 CAT_UNDEF arg_name %+ %%i, q 333 CAT_UNDEF arg_name %+ %%i, d 334 CAT_UNDEF arg_name %+ %%i, w 335 CAT_UNDEF arg_name %+ %%i, h 336 CAT_UNDEF arg_name %+ %%i, b 337 CAT_UNDEF arg_name %+ %%i, m 338 CAT_UNDEF arg_name %+ %%i, mp 339 CAT_UNDEF arg_name, %%i 340 %assign %%i %%i+1 341 %endrep 342 %endif 343 344 %xdefine %%stack_offset stack_offset 345 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine 346 %assign %%i 0 347 %rep %0 348 %xdefine %1q r %+ %%i %+ q 349 %xdefine %1d r %+ %%i %+ d 350 %xdefine %1w r %+ %%i %+ w 351 %xdefine %1h r %+ %%i %+ h 352 %xdefine %1b r %+ %%i %+ b 353 %xdefine %1m r %+ %%i %+ m 354 %xdefine %1mp r %+ %%i %+ mp 355 CAT_XDEFINE arg_name, %%i, %1 356 %assign %%i %%i+1 357 %rotate 1 358 %endrep 359 %xdefine stack_offset %%stack_offset 360 %assign n_arg_names %0 361%endmacro 362 363%define required_stack_alignment ((mmsize + 15) & ~15) 364%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) 365%define high_mm_regs (16*cpuflag(avx512)) 366 367; Large stack allocations on Windows need to use stack probing in order 368; to guarantee that all stack memory is committed before accessing it. 369; This is done by ensuring that the guard page(s) at the end of the 370; currently committed pages are touched prior to any pages beyond that. 
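; For illustration (the size is an arbitrary example, not from the original
; comments): with the WIN64 probe interval of 8192 bytes defined below,
; "PROBE_STACK 0x5000" expands to
;     mov eax, [rsp-8192]
;     mov eax, [rsp-16384]
; reading the soon-to-be-allocated region at probe-interval steps, in order,
; before ALLOC_STACK actually lowers rsp.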
371%if WIN64 372 %assign STACK_PROBE_SIZE 8192 373%elifidn __OUTPUT_FORMAT__, win32 374 %assign STACK_PROBE_SIZE 4096 375%else 376 %assign STACK_PROBE_SIZE 0 377%endif 378 379%macro PROBE_STACK 1 ; stack_size 380 %if STACK_PROBE_SIZE 381 %assign %%i STACK_PROBE_SIZE 382 %rep %1 / STACK_PROBE_SIZE 383 mov eax, [rsp-%%i] 384 %assign %%i %%i+STACK_PROBE_SIZE 385 %endrep 386 %endif 387%endmacro 388 389%macro RESET_STACK_STATE 0 390 %ifidn rstk, rsp 391 %assign stack_offset stack_offset - stack_size_padded 392 %else 393 %xdefine rstk rsp 394 %endif 395 %assign stack_size 0 396 %assign stack_size_padded 0 397 %assign xmm_regs_used 0 398%endmacro 399 400%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs 401 RESET_STACK_STATE 402 %ifnum %2 403 %if mmsize != 8 404 %assign xmm_regs_used %2 405 %endif 406 %endif 407 %ifnum %1 408 %if %1 != 0 409 %assign %%pad 0 410 %assign stack_size %1 411 %if stack_size < 0 412 %assign stack_size -stack_size 413 %endif 414 %if WIN64 415 %assign %%pad %%pad + 32 ; shadow space 416 %if xmm_regs_used > 8 417 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers 418 %endif 419 %endif 420 %if required_stack_alignment <= STACK_ALIGNMENT 421 ; maintain the current stack alignment 422 %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) 423 PROBE_STACK stack_size_padded 424 SUB rsp, stack_size_padded 425 %else 426 %assign %%reg_num (regs_used - 1) 427 %xdefine rstk r %+ %%reg_num 428 ; align stack, and save original stack location directly above 429 ; it, i.e. in [rsp+stack_size_padded], so we can restore the 430 ; stack in a single instruction (i.e. mov rsp, rstk or mov 431 ; rsp, [rsp+stack_size_padded]) 432 %if %1 < 0 ; need to store rsp on stack 433 %xdefine rstkm [rsp + stack_size + %%pad] 434 %assign %%pad %%pad + gprsize 435 %else ; can keep rsp in rstk during whole function 436 %xdefine rstkm rstk 437 %endif 438 %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) 439 PROBE_STACK stack_size_padded 440 mov rstk, rsp 441 and rsp, ~(required_stack_alignment-1) 442 sub rsp, stack_size_padded 443 movifnidn rstkm, rstk 444 %endif 445 WIN64_PUSH_XMM 446 %endif 447 %endif 448%endmacro 449 450%macro SETUP_STACK_POINTER 0-1 0 451 %ifnum %1 452 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT 453 %if %1 > 0 454 ; Reserve an additional register for storing the original stack pointer, but avoid using 455 ; eax/rax for this purpose since it can potentially get overwritten as a return value. 456 %assign regs_used (regs_used + 1) 457 %if ARCH_X86_64 && regs_used == 7 458 %assign regs_used 8 459 %elif ARCH_X86_64 == 0 && regs_used == 1 460 %assign regs_used 2 461 %endif 462 %endif 463 %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 464 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) 465 ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. 
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

; Push XMM registers to the stack. If no argument is specified all used registers
; will be pushed, otherwise only push previously unpushed registers.
%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
    %if mmsize != 8
        %if %0 == 2
            %assign %%pushed %2
            %assign xmm_regs_used %1
        %elif %0 == 1
            %assign %%pushed xmm_regs_used
            %assign xmm_regs_used %1
        %else
            %assign %%pushed 0
        %endif
        ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
        %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
            movaps [rstk + stack_offset + 8], xmm6
        %endif
        %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
            movaps [rstk + stack_offset + 24], xmm7
        %endif
        %assign %%pushed %%pushed - high_mm_regs - 8
        %if %%pushed < 0
            %assign %%pushed 0
        %endif
        %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
        %if %%regs_to_push > 0
            ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
            %assign %%i %%pushed + 8
            %rep %%regs_to_push
                movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
                %assign %%i %%i+1
            %endrep
        %endif
    %endif
%endmacro

; Allocate stack space for XMM registers and push all, or a subset, of those
%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
    RESET_STACK_STATE
    %if mmsize != 8
        %assign xmm_regs_used %1
        ASSERT xmm_regs_used <= 16 + high_mm_regs
        %if %0 == 2
            ASSERT %2 >= %1
            %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
        %else
            %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
        %endif
        %if %%xmm_regs_on_stack > 0
            ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
563 %assign %%pad %%xmm_regs_on_stack*16 + 32 564 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) 565 SUB rsp, stack_size_padded 566 %endif 567 WIN64_PUSH_XMM 568 %endif 569%endmacro 570 571%macro WIN64_RESTORE_XMM_INTERNAL 0 572 %assign %%pad_size 0 573 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 574 %if %%xmm_regs_on_stack > 0 575 %assign %%i xmm_regs_used - high_mm_regs 576 %rep %%xmm_regs_on_stack 577 %assign %%i %%i-1 578 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] 579 %endrep 580 %endif 581 %if stack_size_padded > 0 582 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT 583 mov rsp, rstkm 584 %else 585 add rsp, stack_size_padded 586 %assign %%pad_size stack_size_padded 587 %endif 588 %endif 589 %if xmm_regs_used > 7 + high_mm_regs 590 movaps xmm7, [rsp + stack_offset - %%pad_size + 24] 591 %endif 592 %if xmm_regs_used > 6 + high_mm_regs 593 movaps xmm6, [rsp + stack_offset - %%pad_size + 8] 594 %endif 595%endmacro 596 597%macro WIN64_RESTORE_XMM 0 598 WIN64_RESTORE_XMM_INTERNAL 599 RESET_STACK_STATE 600%endmacro 601 602%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs 603 604%macro RET 0 605 WIN64_RESTORE_XMM_INTERNAL 606 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 607 %if vzeroupper_required 608 vzeroupper 609 %endif 610 AUTO_REP_RET 611%endmacro 612 613%elif ARCH_X86_64 ; *nix x64 ;============================================= 614 615DECLARE_REG 0, rdi 616DECLARE_REG 1, rsi 617DECLARE_REG 2, rdx 618DECLARE_REG 3, rcx 619DECLARE_REG 4, R8 620DECLARE_REG 5, R9 621DECLARE_REG 6, rax, 8 622DECLARE_REG 7, R10, 16 623DECLARE_REG 8, R11, 24 624DECLARE_REG 9, rbx, 32 625DECLARE_REG 10, rbp, 40 626DECLARE_REG 11, R14, 48 627DECLARE_REG 12, R15, 56 628DECLARE_REG 13, R12, 64 629DECLARE_REG 14, R13, 72 630 631%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 632 %assign num_args %1 633 %assign regs_used %2 634 ASSERT regs_used >= num_args 635 SETUP_STACK_POINTER %4 636 ASSERT regs_used <= 15 637 PUSH_IF_USED 9, 10, 11, 12, 13, 14 638 ALLOC_STACK %4, %3 639 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 640 %if %0 > 4 641 %ifnum %4 642 DEFINE_ARGS %5 643 %else 644 DEFINE_ARGS %4, %5 645 %endif 646 %elifnnum %4 647 DEFINE_ARGS %4 648 %endif 649%endmacro 650 651%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required 652 653%macro RET 0 654 %if stack_size_padded > 0 655 %if required_stack_alignment > STACK_ALIGNMENT 656 mov rsp, rstkm 657 %else 658 add rsp, stack_size_padded 659 %endif 660 %endif 661 POP_IF_USED 14, 13, 12, 11, 10, 9 662 %if vzeroupper_required 663 vzeroupper 664 %endif 665 AUTO_REP_RET 666%endmacro 667 668%else ; X86_32 ;============================================================== 669 670DECLARE_REG 0, eax, 4 671DECLARE_REG 1, ecx, 8 672DECLARE_REG 2, edx, 12 673DECLARE_REG 3, ebx, 16 674DECLARE_REG 4, esi, 20 675DECLARE_REG 5, edi, 24 676DECLARE_REG 6, ebp, 28 677%define rsp esp 678 679%macro DECLARE_ARG 1-* 680 %rep %0 681 %define r%1m [rstk + stack_offset + 4*%1 + 4] 682 %define r%1mp dword r%1m 683 %rotate 1 684 %endrep 685%endmacro 686 687DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 688 689%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
690 %assign num_args %1 691 %assign regs_used %2 692 ASSERT regs_used >= num_args 693 %if num_args > 7 694 %assign num_args 7 695 %endif 696 %if regs_used > 7 697 %assign regs_used 7 698 %endif 699 SETUP_STACK_POINTER %4 700 ASSERT regs_used <= 7 701 PUSH_IF_USED 3, 4, 5, 6 702 ALLOC_STACK %4, %3 703 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 704 %if %0 > 4 705 %ifnum %4 706 DEFINE_ARGS %5 707 %else 708 DEFINE_ARGS %4, %5 709 %endif 710 %elifnnum %4 711 DEFINE_ARGS %4 712 %endif 713%endmacro 714 715%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required 716 717%macro RET 0 718 %if stack_size_padded > 0 719 %if required_stack_alignment > STACK_ALIGNMENT 720 mov rsp, rstkm 721 %else 722 add rsp, stack_size_padded 723 %endif 724 %endif 725 POP_IF_USED 6, 5, 4, 3 726 %if vzeroupper_required 727 vzeroupper 728 %endif 729 AUTO_REP_RET 730%endmacro 731 732%endif ;====================================================================== 733 734%if WIN64 == 0 735 %macro WIN64_SPILL_XMM 1-2 736 RESET_STACK_STATE 737 %if mmsize != 8 738 %assign xmm_regs_used %1 739 %endif 740 %endmacro 741 %macro WIN64_RESTORE_XMM 0 742 RESET_STACK_STATE 743 %endmacro 744 %macro WIN64_PUSH_XMM 0-2 745 %if mmsize != 8 && %0 >= 1 746 %assign xmm_regs_used %1 747 %endif 748 %endmacro 749%endif 750 751; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either 752; a branch or a branch target. So switch to a 2-byte form of ret in that case. 753; We can automatically detect "follows a branch", but not a branch target. 754; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) 755%macro REP_RET 0 756 %if has_epilogue || cpuflag(ssse3) 757 RET 758 %else 759 rep ret 760 %endif 761 annotate_function_size 762%endmacro 763 764%define last_branch_adr $$ 765%macro AUTO_REP_RET 0 766 %if notcpuflag(ssse3) 767 times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. 768 %endif 769 ret 770 annotate_function_size 771%endmacro 772 773%macro BRANCH_INSTR 0-* 774 %rep %0 775 %macro %1 1-2 %1 776 %2 %1 777 %if notcpuflag(ssse3) 778 %%branch_instr equ $ 779 %xdefine last_branch_adr %%branch_instr 780 %endif 781 %endmacro 782 %rotate 1 783 %endrep 784%endmacro 785 786BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp 787 788%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent 789 %if has_epilogue 790 call %1 791 RET 792 %elif %2 793 jmp %1 794 %endif 795 annotate_function_size 796%endmacro 797 798;============================================================================= 799; arch-independent part 800;============================================================================= 801 802%assign function_align 16 803 804; Begin a function. 805; Applies any symbol mangling needed for C linkage, and sets up a define such that 806; subsequent uses of the function name automatically refer to the mangled version. 807; Appends cpuflags to the function name if cpuflags has been specified. 808; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX 809; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
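; For illustration (x264 is just an example value of private_prefix): with
; private_prefix defined as x264, the sequence
;     INIT_XMM sse2
;     cglobal foo, 2,3,8, dst, src, tmp
; assembles the function as the global symbol x264_foo_sse2 (with a leading
; underscore if PREFIX is defined), runs PROLOGUE with those arguments so that
; r0-r2 are named dst/src/tmp, and a later "call foo" in the same file resolves
; to that mangled, cpu-suffixed symbol via the call macro defined further down.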
810%macro cglobal 1-2+ "" ; name, [PROLOGUE args] 811 cglobal_internal 1, %1 %+ SUFFIX, %2 812%endmacro 813%macro cvisible 1-2+ "" ; name, [PROLOGUE args] 814 cglobal_internal 0, %1 %+ SUFFIX, %2 815%endmacro 816%macro cglobal_internal 2-3+ 817 annotate_function_size 818 %ifndef cglobaled_%2 819 %if %1 820 %xdefine %2 mangle(private_prefix %+ _ %+ %2) 821 %else 822 %xdefine %2 mangle(public_prefix %+ _ %+ %2) 823 %endif 824 %xdefine %2.skip_prologue %2 %+ .skip_prologue 825 CAT_XDEFINE cglobaled_, %2, 1 826 %endif 827 %xdefine current_function %2 828 %xdefine current_function_section __SECT__ 829 %if FORMAT_ELF 830 %if %1 831 global %2:function hidden 832 %else 833 global %2:function 834 %endif 835 %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 836 global %2:private_extern 837 %else 838 global %2 839 %endif 840 align function_align 841 %2: 842 RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer 843 %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required 844 %assign stack_offset 0 ; stack pointer offset relative to the return address 845 %assign stack_size 0 ; amount of stack space that can be freely used inside a function 846 %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding 847 %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper 848 %ifnidn %3, "" 849 PROLOGUE %3 850 %endif 851%endmacro 852 853; Create a global symbol from a local label with the correct name mangling and type 854%macro cglobal_label 1 855 %if FORMAT_ELF 856 global current_function %+ %1:function hidden 857 %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN 858 global current_function %+ %1:private_extern 859 %else 860 global current_function %+ %1 861 %endif 862 %1: 863%endmacro 864 865%macro cextern 1 866 %xdefine %1 mangle(private_prefix %+ _ %+ %1) 867 CAT_XDEFINE cglobaled_, %1, 2 868 extern %1 869%endmacro 870 871; Like cextern, but without the prefix. This should be used for symbols from external libraries. 872%macro cextern_naked 1 873 %ifdef PREFIX 874 %xdefine %1 mangle(%1) 875 %endif 876 CAT_XDEFINE cglobaled_, %1, 3 877 extern %1 878%endmacro 879 880%macro const 1-2+ 881 %xdefine %1 mangle(private_prefix %+ _ %+ %1) 882 %if FORMAT_ELF 883 global %1:data hidden 884 %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN 885 global %1:private_extern 886 %else 887 global %1 888 %endif 889 %1: %2 890%endmacro 891 892%if FORMAT_ELF 893 ; The GNU linker assumes the stack is executable by default. 894 [SECTION .note.GNU-stack noalloc noexec nowrite progbits] 895 896 %ifdef __NASM_VERSION_ID__ 897 %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03 898 %if ARCH_X86_64 899 ; Control-flow Enforcement Technology (CET) properties. 900 [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize] 901 dd 0x00000004 ; n_namesz 902 dd gprsize + 8 ; n_descsz 903 dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0 904 db "GNU",0 ; n_name 905 dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND 906 dd 0x00000004 ; pr_datasz 907 dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK 908 dd 0x00000000 ; pr_padding 909 %endif 910 %endif 911 %endif 912%endif 913 914; Tell debuggers how large the function was. 915; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. 
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx       (1<<0)
%assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
%assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
%assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
%assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
%assign cpuflags_sse2      (1<<5)  | cpuflags_sse
%assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
%assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
%assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
%assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
%assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42     (1<<11) | cpuflags_sse4
%assign cpuflags_aesni     (1<<12) | cpuflags_sse42
%assign cpuflags_clmul     (1<<13) | cpuflags_sse42
%assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
%assign cpuflags_avx       (1<<15) | cpuflags_sse42
%assign cpuflags_xop       (1<<16) | cpuflags_avx
%assign cpuflags_fma4      (1<<17) | cpuflags_avx
%assign cpuflags_fma3      (1<<18) | cpuflags_avx
%assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
%assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ

%assign cpuflags_cache32   (1<<24)
%assign cpuflags_cache64   (1<<25)
%assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
%assign cpuflags_atom      (1<<27)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
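; For example (an illustrative pattern, not part of the original comments):
;     INIT_XMM sse2
;     cglobal foo, 2,2    ; assembled as foo_sse2, cpuflag(sse2) is true here
;     INIT_YMM avx2
;     cglobal foo, 2,2    ; assembled as foo_avx2, mmsize == 32
; cpuflag(x) and notcpuflag(x) can then be used in %if blocks to emit slightly
; different code for each cpu variant from the same macro source.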
971%macro INIT_CPUFLAGS 0-* 972 %xdefine SUFFIX 973 %undef cpuname 974 %assign cpuflags 0 975 976 %if %0 >= 1 977 %rep %0 978 %ifdef cpuname 979 %xdefine cpuname cpuname %+ _%1 980 %else 981 %xdefine cpuname %1 982 %endif 983 %assign cpuflags cpuflags | cpuflags_%1 984 %rotate 1 985 %endrep 986 %xdefine SUFFIX _ %+ cpuname 987 988 %if cpuflag(avx) 989 %assign avx_enabled 1 990 %endif 991 %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) 992 %define mova movaps 993 %define movu movups 994 %define movnta movntps 995 %endif 996 %if cpuflag(aligned) 997 %define movu mova 998 %elif cpuflag(sse3) && notcpuflag(ssse3) 999 %define movu lddqu 1000 %endif 1001 %endif 1002 1003 %if ARCH_X86_64 || cpuflag(sse2) 1004 %ifdef __NASM_VERSION_ID__ 1005 ALIGNMODE p6 1006 %else 1007 CPU amdnop 1008 %endif 1009 %else 1010 %ifdef __NASM_VERSION_ID__ 1011 ALIGNMODE nop 1012 %else 1013 CPU basicnop 1014 %endif 1015 %endif 1016%endmacro 1017 1018; Merge mmx, sse*, and avx* 1019; m# is a simd register of the currently selected size 1020; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# 1021; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# 1022; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# 1023; (All 4 remain in sync through SWAP.) 1024 1025%macro CAT_XDEFINE 3 1026 %xdefine %1%2 %3 1027%endmacro 1028 1029%macro CAT_UNDEF 2 1030 %undef %1%2 1031%endmacro 1032 1033%macro DEFINE_MMREGS 1 ; mmtype 1034 %assign %%prev_mmregs 0 1035 %ifdef num_mmregs 1036 %assign %%prev_mmregs num_mmregs 1037 %endif 1038 1039 %assign num_mmregs 8 1040 %if ARCH_X86_64 && mmsize >= 16 1041 %assign num_mmregs 16 1042 %if cpuflag(avx512) || mmsize == 64 1043 %assign num_mmregs 32 1044 %endif 1045 %endif 1046 1047 %assign %%i 0 1048 %rep num_mmregs 1049 CAT_XDEFINE m, %%i, %1 %+ %%i 1050 CAT_XDEFINE nn%1, %%i, %%i 1051 %assign %%i %%i+1 1052 %endrep 1053 %if %%prev_mmregs > num_mmregs 1054 %rep %%prev_mmregs - num_mmregs 1055 CAT_UNDEF m, %%i 1056 CAT_UNDEF nn %+ mmtype, %%i 1057 %assign %%i %%i+1 1058 %endrep 1059 %endif 1060 %xdefine mmtype %1 1061%endmacro 1062 1063; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper 1064%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg 1065 %if ARCH_X86_64 && cpuflag(avx512) 1066 %assign %%i %1 1067 %rep 16-%1 1068 %assign %%i_high %%i+16 1069 SWAP %%i, %%i_high 1070 %assign %%i %%i+1 1071 %endrep 1072 %endif 1073%endmacro 1074 1075%macro INIT_MMX 0-1+ 1076 %assign avx_enabled 0 1077 %define RESET_MM_PERMUTATION INIT_MMX %1 1078 %define mmsize 8 1079 %define mova movq 1080 %define movu movq 1081 %define movh movd 1082 %define movnta movntq 1083 INIT_CPUFLAGS %1 1084 DEFINE_MMREGS mm 1085%endmacro 1086 1087%macro INIT_XMM 0-1+ 1088 %assign avx_enabled FORCE_VEX_ENCODING 1089 %define RESET_MM_PERMUTATION INIT_XMM %1 1090 %define mmsize 16 1091 %define mova movdqa 1092 %define movu movdqu 1093 %define movh movq 1094 %define movnta movntdq 1095 INIT_CPUFLAGS %1 1096 DEFINE_MMREGS xmm 1097 %if WIN64 1098 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers 1099 %endif 1100 %xdefine bcstw 1to8 1101 %xdefine bcstd 1to4 1102 %xdefine bcstq 1to2 1103%endmacro 1104 1105%macro INIT_YMM 0-1+ 1106 %assign avx_enabled 1 1107 %define RESET_MM_PERMUTATION INIT_YMM %1 1108 %define mmsize 32 1109 %define mova movdqa 1110 %define movu movdqu 1111 %undef movh 1112 %define movnta movntdq 1113 INIT_CPUFLAGS %1 1114 DEFINE_MMREGS ymm 1115 AVX512_MM_PERMUTATION 1116 
%xdefine bcstw 1to16 1117 %xdefine bcstd 1to8 1118 %xdefine bcstq 1to4 1119%endmacro 1120 1121%macro INIT_ZMM 0-1+ 1122 %assign avx_enabled 1 1123 %define RESET_MM_PERMUTATION INIT_ZMM %1 1124 %define mmsize 64 1125 %define mova movdqa 1126 %define movu movdqu 1127 %undef movh 1128 %define movnta movntdq 1129 INIT_CPUFLAGS %1 1130 DEFINE_MMREGS zmm 1131 AVX512_MM_PERMUTATION 1132 %xdefine bcstw 1to32 1133 %xdefine bcstd 1to16 1134 %xdefine bcstq 1to8 1135%endmacro 1136 1137INIT_XMM 1138 1139%macro DECLARE_MMCAST 1 1140 %define mmmm%1 mm%1 1141 %define mmxmm%1 mm%1 1142 %define mmymm%1 mm%1 1143 %define mmzmm%1 mm%1 1144 %define xmmmm%1 mm%1 1145 %define xmmxmm%1 xmm%1 1146 %define xmmymm%1 xmm%1 1147 %define xmmzmm%1 xmm%1 1148 %define ymmmm%1 mm%1 1149 %define ymmxmm%1 xmm%1 1150 %define ymmymm%1 ymm%1 1151 %define ymmzmm%1 ymm%1 1152 %define zmmmm%1 mm%1 1153 %define zmmxmm%1 xmm%1 1154 %define zmmymm%1 ymm%1 1155 %define zmmzmm%1 zmm%1 1156 %define xm%1 xmm %+ m%1 1157 %define ym%1 ymm %+ m%1 1158 %define zm%1 zmm %+ m%1 1159%endmacro 1160 1161%assign i 0 1162%rep 32 1163 DECLARE_MMCAST i 1164 %assign i i+1 1165%endrep 1166 1167; I often want to use macros that permute their arguments. e.g. there's no 1168; efficient way to implement butterfly or transpose or dct without swapping some 1169; arguments. 1170; 1171; I would like to not have to manually keep track of the permutations: 1172; If I insert a permutation in the middle of a function, it should automatically 1173; change everything that follows. For more complex macros I may also have multiple 1174; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. 1175; 1176; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that 1177; permutes its arguments. It's equivalent to exchanging the contents of the 1178; registers, except that this way you exchange the register names instead, so it 1179; doesn't cost any cycles. 1180 1181%macro PERMUTE 2-* ; takes a list of pairs to swap 1182 %rep %0/2 1183 %xdefine %%tmp%2 m%2 1184 %rotate 2 1185 %endrep 1186 %rep %0/2 1187 %xdefine m%1 %%tmp%2 1188 CAT_XDEFINE nn, m%1, %1 1189 %rotate 2 1190 %endrep 1191%endmacro 1192 1193%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) 1194 %ifnum %1 ; SWAP 0, 1, ... 1195 SWAP_INTERNAL_NUM %1, %2 1196 %else ; SWAP m0, m1, ... 1197 SWAP_INTERNAL_NAME %1, %2 1198 %endif 1199%endmacro 1200 1201%macro SWAP_INTERNAL_NUM 2-* 1202 %rep %0-1 1203 %xdefine %%tmp m%1 1204 %xdefine m%1 m%2 1205 %xdefine m%2 %%tmp 1206 CAT_XDEFINE nn, m%1, %1 1207 CAT_XDEFINE nn, m%2, %2 1208 %rotate 1 1209 %endrep 1210%endmacro 1211 1212%macro SWAP_INTERNAL_NAME 2-* 1213 %xdefine %%args nn %+ %1 1214 %rep %0-1 1215 %xdefine %%args %%args, nn %+ %2 1216 %rotate 1 1217 %endrep 1218 SWAP_INTERNAL_NUM %%args 1219%endmacro 1220 1221; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later 1222; calls to that function will automatically load the permutation, so values can 1223; be returned in mmregs. 
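; For example (a sketch with a hypothetical helper): a function that ends with
;     SWAP 0, 2
;     SAVE_MM_PERMUTATION
;     RET
; records that its m0 now lives in physical register 2; a subsequent
; "call helper" (through the call macro below) runs LOAD_MM_PERMUTATION, so
; the caller's m0 refers to that same physical register and the value is
; effectively returned in m0 without any extra moves.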
1224%macro SAVE_MM_PERMUTATION 0-1 1225 %if %0 1226 %xdefine %%f %1_m 1227 %else 1228 %xdefine %%f current_function %+ _m 1229 %endif 1230 %assign %%i 0 1231 %rep num_mmregs 1232 %xdefine %%tmp m %+ %%i 1233 CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp 1234 %assign %%i %%i+1 1235 %endrep 1236%endmacro 1237 1238%macro LOAD_MM_PERMUTATION 0-1 ; name to load from 1239 %if %0 1240 %xdefine %%f %1_m 1241 %else 1242 %xdefine %%f current_function %+ _m 1243 %endif 1244 %xdefine %%tmp %%f %+ 0 1245 %ifnum %%tmp 1246 DEFINE_MMREGS mmtype 1247 %assign %%i 0 1248 %rep num_mmregs 1249 %xdefine %%tmp %%f %+ %%i 1250 CAT_XDEFINE %%m, %%i, m %+ %%tmp 1251 %assign %%i %%i+1 1252 %endrep 1253 %rep num_mmregs 1254 %assign %%i %%i-1 1255 CAT_XDEFINE m, %%i, %%m %+ %%i 1256 CAT_XDEFINE nn, m %+ %%i, %%i 1257 %endrep 1258 %endif 1259%endmacro 1260 1261; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't 1262%macro call 1 1263 %ifid %1 1264 call_internal %1 %+ SUFFIX, %1 1265 %else 1266 call %1 1267 %endif 1268%endmacro 1269%macro call_internal 2 1270 %xdefine %%i %2 1271 %define %%j %%i 1272 %ifndef cglobaled_%2 1273 %ifdef cglobaled_%1 1274 %xdefine %%i %1 1275 %endif 1276 %elif FORMAT_ELF 1277 %if ARCH_X86_64 1278 %if cglobaled_%2 >= 2 1279 ; Always emit PLT relocations when calling external functions, 1280 ; the linker will eliminate unnecessary PLT indirections anyway. 1281 %define %%j %%i wrt ..plt 1282 %endif 1283 %elif PIC && cglobaled_%2 == 3 1284 ; Go through the GOT for functions declared using cextern_naked with 1285 ; PIC, as such functions presumably exists in external libraries. 1286 extern _GLOBAL_OFFSET_TABLE_ 1287 LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc 1288 %define %%j [eax+%%i wrt ..got] 1289 %endif 1290 %endif 1291 call %%j 1292 LOAD_MM_PERMUTATION %%i 1293%endmacro 1294 1295; Substitutions that reduce instruction size but are functionally equivalent 1296%macro add 2 1297 %ifnum %2 1298 %if %2==128 1299 sub %1, -128 1300 %else 1301 add %1, %2 1302 %endif 1303 %else 1304 add %1, %2 1305 %endif 1306%endmacro 1307 1308%macro sub 2 1309 %ifnum %2 1310 %if %2==128 1311 add %1, -128 1312 %else 1313 sub %1, %2 1314 %endif 1315 %else 1316 sub %1, %2 1317 %endif 1318%endmacro 1319 1320;============================================================================= 1321; AVX abstraction layer 1322;============================================================================= 1323 1324%assign i 0 1325%rep 32 1326 %if i < 8 1327 CAT_XDEFINE sizeofmm, i, 8 1328 CAT_XDEFINE regnumofmm, i, i 1329 %endif 1330 CAT_XDEFINE sizeofxmm, i, 16 1331 CAT_XDEFINE sizeofymm, i, 32 1332 CAT_XDEFINE sizeofzmm, i, 64 1333 CAT_XDEFINE regnumofxmm, i, i 1334 CAT_XDEFINE regnumofymm, i, i 1335 CAT_XDEFINE regnumofzmm, i, i 1336 %assign i i+1 1337%endrep 1338%undef i 1339 1340%macro CHECK_AVX_INSTR_EMU 3-* 1341 %xdefine %%opcode %1 1342 %xdefine %%dst %2 1343 %rep %0-2 1344 %ifidn %%dst, %3 1345 %error non-avx emulation of ``%%opcode'' is not supported 1346 %endif 1347 %rotate 1 1348 %endrep 1349%endmacro 1350 1351;%1 == instruction 1352;%2 == minimal instruction set 1353;%3 == 1 if float, 0 if int 1354;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) 1355;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not 1356;%6+: operands 1357%macro RUN_AVX_INSTR 6-9+ 1358 %ifnum sizeof%7 1359 %assign __sizeofreg sizeof%7 1360 %elifnum sizeof%6 1361 %assign __sizeofreg sizeof%6 1362 %else 1363 %assign __sizeofreg mmsize 1364 %endif 1365 %assign __emulate_avx 0 1366 %if avx_enabled && __sizeofreg >= 16 1367 %xdefine __instr v%1 1368 %else 1369 %xdefine __instr %1 1370 %if %0 >= 8+%4 1371 %assign __emulate_avx 1 1372 %endif 1373 %endif 1374 %ifnidn %2, fnord 1375 %ifdef cpuname 1376 %if notcpuflag(%2) 1377 %error use of ``%1'' %2 instruction in cpuname function: current_function 1378 %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) 1379 %error use of ``%1'' sse2 instruction in cpuname function: current_function 1380 %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) 1381 %error use of ``%1'' avx2 instruction in cpuname function: current_function 1382 %elif __sizeofreg == 16 && notcpuflag(sse) 1383 %error use of ``%1'' sse instruction in cpuname function: current_function 1384 %elif __sizeofreg == 32 && notcpuflag(avx) 1385 %error use of ``%1'' avx instruction in cpuname function: current_function 1386 %elif __sizeofreg == 64 && notcpuflag(avx512) 1387 %error use of ``%1'' avx512 instruction in cpuname function: current_function 1388 %elifidn %1, pextrw ; special case because the base instruction is mmx2, 1389 %ifnid %6 ; but sse4 is required for memory operands 1390 %if notcpuflag(sse4) 1391 %error use of ``%1'' sse4 instruction in cpuname function: current_function 1392 %endif 1393 %endif 1394 %endif 1395 %endif 1396 %endif 1397 1398 %if __emulate_avx 1399 %xdefine __src1 %7 1400 %xdefine __src2 %8 1401 %if %5 && %4 == 0 1402 %ifnidn %6, %7 1403 %ifidn %6, %8 1404 %xdefine __src1 %8 1405 %xdefine __src2 %7 1406 %elifnnum sizeof%8 1407 ; 3-operand AVX instructions with a memory arg can only have it in src2, 1408 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). 1409 ; So, if the instruction is commutative with a memory arg, swap them. 1410 %xdefine __src1 %8 1411 %xdefine __src2 %7 1412 %endif 1413 %endif 1414 %endif 1415 %ifnidn %6, __src1 1416 %if %0 >= 9 1417 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 1418 %else 1419 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 1420 %endif 1421 %if __sizeofreg == 8 1422 MOVQ %6, __src1 1423 %elif %3 1424 MOVAPS %6, __src1 1425 %else 1426 MOVDQA %6, __src1 1427 %endif 1428 %endif 1429 %if %0 >= 9 1430 %1 %6, __src2, %9 1431 %else 1432 %1 %6, __src2 1433 %endif 1434 %elif %0 >= 9 1435 %if avx_enabled && __sizeofreg >= 16 && %4 == 1 1436 %ifnnum regnumof%7 1437 %if %3 1438 vmovaps %6, %7 1439 %else 1440 vmovdqa %6, %7 1441 %endif 1442 __instr %6, %6, %8, %9 1443 %else 1444 __instr %6, %7, %8, %9 1445 %endif 1446 %else 1447 __instr %6, %7, %8, %9 1448 %endif 1449 %elif %0 == 8 1450 %if avx_enabled && __sizeofreg >= 16 && %4 == 0 1451 %xdefine __src1 %7 1452 %xdefine __src2 %8 1453 %if %5 1454 %ifnum regnumof%7 1455 %ifnum regnumof%8 1456 %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 1457 ; Most VEX-encoded instructions require an additional byte to encode when 1458 ; src2 is a high register (e.g. m8..15). If the instruction is commutative 1459 ; we can swap src1 and src2 when doing so reduces the instruction length. 
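                            ; For example, a commutative "paddw m1, m2, m9" is emitted
                            ; here as "vpaddw m1, m9, m2", allowing the 2-byte VEX prefix
                            ; (the high register moves from the ModRM.rm slot to vvvv).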
1460 %xdefine __src1 %8 1461 %xdefine __src2 %7 1462 %endif 1463 %endif 1464 %elifnum regnumof%8 ; put memory operands in src2 when possible 1465 %xdefine __src1 %8 1466 %xdefine __src2 %7 1467 %else 1468 %assign __emulate_avx 1 1469 %endif 1470 %elifnnum regnumof%7 1471 ; EVEX allows imm8 shift instructions to be used with memory operands, 1472 ; but VEX does not. This handles those special cases. 1473 %ifnnum %8 1474 %assign __emulate_avx 1 1475 %elif notcpuflag(avx512) 1476 %assign __emulate_avx 1 1477 %endif 1478 %endif 1479 %if __emulate_avx ; a separate load is required 1480 %if %3 1481 vmovaps %6, %7 1482 %else 1483 vmovdqa %6, %7 1484 %endif 1485 __instr %6, %6, %8 1486 %else 1487 __instr %6, __src1, __src2 1488 %endif 1489 %else 1490 __instr %6, %7, %8 1491 %endif 1492 %elif %0 == 7 1493 %if avx_enabled && __sizeofreg >= 16 && %5 1494 %xdefine __src1 %6 1495 %xdefine __src2 %7 1496 %ifnum regnumof%6 1497 %ifnum regnumof%7 1498 %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 1499 %xdefine __src1 %7 1500 %xdefine __src2 %6 1501 %endif 1502 %endif 1503 %endif 1504 __instr %6, __src1, __src2 1505 %else 1506 __instr %6, %7 1507 %endif 1508 %else 1509 __instr %6 1510 %endif 1511%endmacro 1512 1513;%1 == instruction 1514;%2 == minimal instruction set 1515;%3 == 1 if float, 0 if int 1516;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) 1517;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not 1518%macro AVX_INSTR 1-5 fnord, 0, 255, 0 1519 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 1520 %ifidn %2, fnord 1521 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 1522 %elifidn %3, fnord 1523 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 1524 %elifidn %4, fnord 1525 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 1526 %elifidn %5, fnord 1527 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 1528 %else 1529 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 1530 %endif 1531 %endmacro 1532%endmacro 1533 1534; Instructions with both VEX/EVEX and legacy encodings 1535; Non-destructive instructions are written without parameters 1536AVX_INSTR addpd, sse2, 1, 0, 1 1537AVX_INSTR addps, sse, 1, 0, 1 1538AVX_INSTR addsd, sse2, 1, 0, 0 1539AVX_INSTR addss, sse, 1, 0, 0 1540AVX_INSTR addsubpd, sse3, 1, 0, 0 1541AVX_INSTR addsubps, sse3, 1, 0, 0 1542AVX_INSTR aesdec, aesni, 0, 0, 0 1543AVX_INSTR aesdeclast, aesni, 0, 0, 0 1544AVX_INSTR aesenc, aesni, 0, 0, 0 1545AVX_INSTR aesenclast, aesni, 0, 0, 0 1546AVX_INSTR aesimc, aesni 1547AVX_INSTR aeskeygenassist, aesni 1548AVX_INSTR andnpd, sse2, 1, 0, 0 1549AVX_INSTR andnps, sse, 1, 0, 0 1550AVX_INSTR andpd, sse2, 1, 0, 1 1551AVX_INSTR andps, sse, 1, 0, 1 1552AVX_INSTR blendpd, sse4, 1, 1, 0 1553AVX_INSTR blendps, sse4, 1, 1, 0 1554AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding 1555AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding 1556AVX_INSTR cmpeqpd, sse2, 1, 0, 1 1557AVX_INSTR cmpeqps, sse, 1, 0, 1 1558AVX_INSTR cmpeqsd, sse2, 1, 0, 0 1559AVX_INSTR cmpeqss, sse, 1, 0, 0 1560AVX_INSTR cmplepd, sse2, 1, 0, 0 1561AVX_INSTR cmpleps, sse, 1, 0, 0 1562AVX_INSTR cmplesd, sse2, 1, 0, 0 1563AVX_INSTR cmpless, sse, 1, 0, 0 1564AVX_INSTR cmpltpd, sse2, 1, 0, 0 1565AVX_INSTR cmpltps, sse, 1, 0, 0 1566AVX_INSTR cmpltsd, sse2, 1, 0, 0 1567AVX_INSTR cmpltss, sse, 1, 0, 0 1568AVX_INSTR cmpneqpd, sse2, 1, 0, 1 1569AVX_INSTR cmpneqps, sse, 1, 0, 1 1570AVX_INSTR cmpneqsd, sse2, 1, 0, 0 1571AVX_INSTR cmpneqss, sse, 1, 
0, 0 1572AVX_INSTR cmpnlepd, sse2, 1, 0, 0 1573AVX_INSTR cmpnleps, sse, 1, 0, 0 1574AVX_INSTR cmpnlesd, sse2, 1, 0, 0 1575AVX_INSTR cmpnless, sse, 1, 0, 0 1576AVX_INSTR cmpnltpd, sse2, 1, 0, 0 1577AVX_INSTR cmpnltps, sse, 1, 0, 0 1578AVX_INSTR cmpnltsd, sse2, 1, 0, 0 1579AVX_INSTR cmpnltss, sse, 1, 0, 0 1580AVX_INSTR cmpordpd, sse2 1, 0, 1 1581AVX_INSTR cmpordps, sse 1, 0, 1 1582AVX_INSTR cmpordsd, sse2 1, 0, 0 1583AVX_INSTR cmpordss, sse 1, 0, 0 1584AVX_INSTR cmppd, sse2, 1, 1, 0 1585AVX_INSTR cmpps, sse, 1, 1, 0 1586AVX_INSTR cmpsd, sse2, 1, 1, 0 1587AVX_INSTR cmpss, sse, 1, 1, 0 1588AVX_INSTR cmpunordpd, sse2, 1, 0, 1 1589AVX_INSTR cmpunordps, sse, 1, 0, 1 1590AVX_INSTR cmpunordsd, sse2, 1, 0, 0 1591AVX_INSTR cmpunordss, sse, 1, 0, 0 1592AVX_INSTR comisd, sse2, 1 1593AVX_INSTR comiss, sse, 1 1594AVX_INSTR cvtdq2pd, sse2, 1 1595AVX_INSTR cvtdq2ps, sse2, 1 1596AVX_INSTR cvtpd2dq, sse2, 1 1597AVX_INSTR cvtpd2ps, sse2, 1 1598AVX_INSTR cvtps2dq, sse2, 1 1599AVX_INSTR cvtps2pd, sse2, 1 1600AVX_INSTR cvtsd2si, sse2, 1 1601AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 1602AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 1603AVX_INSTR cvtsi2ss, sse, 1, 0, 0 1604AVX_INSTR cvtss2sd, sse2, 1, 0, 0 1605AVX_INSTR cvtss2si, sse, 1 1606AVX_INSTR cvttpd2dq, sse2, 1 1607AVX_INSTR cvttps2dq, sse2, 1 1608AVX_INSTR cvttsd2si, sse2, 1 1609AVX_INSTR cvttss2si, sse, 1 1610AVX_INSTR divpd, sse2, 1, 0, 0 1611AVX_INSTR divps, sse, 1, 0, 0 1612AVX_INSTR divsd, sse2, 1, 0, 0 1613AVX_INSTR divss, sse, 1, 0, 0 1614AVX_INSTR dppd, sse4, 1, 1, 0 1615AVX_INSTR dpps, sse4, 1, 1, 0 1616AVX_INSTR extractps, sse4, 1 1617AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 1618AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 1619AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 1620AVX_INSTR haddpd, sse3, 1, 0, 0 1621AVX_INSTR haddps, sse3, 1, 0, 0 1622AVX_INSTR hsubpd, sse3, 1, 0, 0 1623AVX_INSTR hsubps, sse3, 1, 0, 0 1624AVX_INSTR insertps, sse4, 1, 1, 0 1625AVX_INSTR lddqu, sse3 1626AVX_INSTR ldmxcsr, sse, 1 1627AVX_INSTR maskmovdqu, sse2 1628AVX_INSTR maxpd, sse2, 1, 0, 1 1629AVX_INSTR maxps, sse, 1, 0, 1 1630AVX_INSTR maxsd, sse2, 1, 0, 0 1631AVX_INSTR maxss, sse, 1, 0, 0 1632AVX_INSTR minpd, sse2, 1, 0, 1 1633AVX_INSTR minps, sse, 1, 0, 1 1634AVX_INSTR minsd, sse2, 1, 0, 0 1635AVX_INSTR minss, sse, 1, 0, 0 1636AVX_INSTR movapd, sse2, 1 1637AVX_INSTR movaps, sse, 1 1638AVX_INSTR movd, mmx 1639AVX_INSTR movddup, sse3, 1 1640AVX_INSTR movdqa, sse2 1641AVX_INSTR movdqu, sse2 1642AVX_INSTR movhlps, sse, 1, 0, 0 1643AVX_INSTR movhpd, sse2, 1, 0, 0 1644AVX_INSTR movhps, sse, 1, 0, 0 1645AVX_INSTR movlhps, sse, 1, 0, 0 1646AVX_INSTR movlpd, sse2, 1, 0, 0 1647AVX_INSTR movlps, sse, 1, 0, 0 1648AVX_INSTR movmskpd, sse2, 1 1649AVX_INSTR movmskps, sse, 1 1650AVX_INSTR movntdq, sse2 1651AVX_INSTR movntdqa, sse4 1652AVX_INSTR movntpd, sse2, 1 1653AVX_INSTR movntps, sse, 1 1654AVX_INSTR movq, mmx 1655AVX_INSTR movsd, sse2, 1, 0, 0 1656AVX_INSTR movshdup, sse3, 1 1657AVX_INSTR movsldup, sse3, 1 1658AVX_INSTR movss, sse, 1, 0, 0 1659AVX_INSTR movupd, sse2, 1 1660AVX_INSTR movups, sse, 1 1661AVX_INSTR mpsadbw, sse4, 0, 1, 0 1662AVX_INSTR mulpd, sse2, 1, 0, 1 1663AVX_INSTR mulps, sse, 1, 0, 1 1664AVX_INSTR mulsd, sse2, 1, 0, 0 1665AVX_INSTR mulss, sse, 1, 0, 0 1666AVX_INSTR orpd, sse2, 1, 0, 1 1667AVX_INSTR orps, sse, 1, 0, 1 1668AVX_INSTR pabsb, ssse3 1669AVX_INSTR pabsd, ssse3 1670AVX_INSTR pabsw, ssse3 1671AVX_INSTR packssdw, mmx, 0, 0, 0 1672AVX_INSTR packsswb, mmx, 0, 0, 0 1673AVX_INSTR packusdw, sse4, 0, 0, 0 1674AVX_INSTR packuswb, mmx, 0, 0, 0 1675AVX_INSTR paddb, mmx, 0, 0, 1 1676AVX_INSTR 
paddd, mmx, 0, 0, 1 1677AVX_INSTR paddq, sse2, 0, 0, 1 1678AVX_INSTR paddsb, mmx, 0, 0, 1 1679AVX_INSTR paddsw, mmx, 0, 0, 1 1680AVX_INSTR paddusb, mmx, 0, 0, 1 1681AVX_INSTR paddusw, mmx, 0, 0, 1 1682AVX_INSTR paddw, mmx, 0, 0, 1 1683AVX_INSTR palignr, ssse3, 0, 1, 0 1684AVX_INSTR pand, mmx, 0, 0, 1 1685AVX_INSTR pandn, mmx, 0, 0, 0 1686AVX_INSTR pavgb, mmx2, 0, 0, 1 1687AVX_INSTR pavgw, mmx2, 0, 0, 1 1688AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding 1689AVX_INSTR pblendw, sse4, 0, 1, 0 1690AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0 1691AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0 1692AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0 1693AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0 1694AVX_INSTR pclmulqdq, clmul, 0, 1, 0 1695AVX_INSTR pcmpeqb, mmx, 0, 0, 1 1696AVX_INSTR pcmpeqd, mmx, 0, 0, 1 1697AVX_INSTR pcmpeqq, sse4, 0, 0, 1 1698AVX_INSTR pcmpeqw, mmx, 0, 0, 1 1699AVX_INSTR pcmpestri, sse42 1700AVX_INSTR pcmpestrm, sse42 1701AVX_INSTR pcmpgtb, mmx, 0, 0, 0 1702AVX_INSTR pcmpgtd, mmx, 0, 0, 0 1703AVX_INSTR pcmpgtq, sse42, 0, 0, 0 1704AVX_INSTR pcmpgtw, mmx, 0, 0, 0 1705AVX_INSTR pcmpistri, sse42 1706AVX_INSTR pcmpistrm, sse42 1707AVX_INSTR pextrb, sse4 1708AVX_INSTR pextrd, sse4 1709AVX_INSTR pextrq, sse4 1710AVX_INSTR pextrw, mmx2 1711AVX_INSTR phaddd, ssse3, 0, 0, 0 1712AVX_INSTR phaddsw, ssse3, 0, 0, 0 1713AVX_INSTR phaddw, ssse3, 0, 0, 0 1714AVX_INSTR phminposuw, sse4 1715AVX_INSTR phsubd, ssse3, 0, 0, 0 1716AVX_INSTR phsubsw, ssse3, 0, 0, 0 1717AVX_INSTR phsubw, ssse3, 0, 0, 0 1718AVX_INSTR pinsrb, sse4, 0, 1, 0 1719AVX_INSTR pinsrd, sse4, 0, 1, 0 1720AVX_INSTR pinsrq, sse4, 0, 1, 0 1721AVX_INSTR pinsrw, mmx2, 0, 1, 0 1722AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 1723AVX_INSTR pmaddwd, mmx, 0, 0, 1 1724AVX_INSTR pmaxsb, sse4, 0, 0, 1 1725AVX_INSTR pmaxsd, sse4, 0, 0, 1 1726AVX_INSTR pmaxsw, mmx2, 0, 0, 1 1727AVX_INSTR pmaxub, mmx2, 0, 0, 1 1728AVX_INSTR pmaxud, sse4, 0, 0, 1 1729AVX_INSTR pmaxuw, sse4, 0, 0, 1 1730AVX_INSTR pminsb, sse4, 0, 0, 1 1731AVX_INSTR pminsd, sse4, 0, 0, 1 1732AVX_INSTR pminsw, mmx2, 0, 0, 1 1733AVX_INSTR pminub, mmx2, 0, 0, 1 1734AVX_INSTR pminud, sse4, 0, 0, 1 1735AVX_INSTR pminuw, sse4, 0, 0, 1 1736AVX_INSTR pmovmskb, mmx2 1737AVX_INSTR pmovsxbd, sse4 1738AVX_INSTR pmovsxbq, sse4 1739AVX_INSTR pmovsxbw, sse4 1740AVX_INSTR pmovsxdq, sse4 1741AVX_INSTR pmovsxwd, sse4 1742AVX_INSTR pmovsxwq, sse4 1743AVX_INSTR pmovzxbd, sse4 1744AVX_INSTR pmovzxbq, sse4 1745AVX_INSTR pmovzxbw, sse4 1746AVX_INSTR pmovzxdq, sse4 1747AVX_INSTR pmovzxwd, sse4 1748AVX_INSTR pmovzxwq, sse4 1749AVX_INSTR pmuldq, sse4, 0, 0, 1 1750AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 1751AVX_INSTR pmulhuw, mmx2, 0, 0, 1 1752AVX_INSTR pmulhw, mmx, 0, 0, 1 1753AVX_INSTR pmulld, sse4, 0, 0, 1 1754AVX_INSTR pmullw, mmx, 0, 0, 1 1755AVX_INSTR pmuludq, sse2, 0, 0, 1 1756AVX_INSTR por, mmx, 0, 0, 1 1757AVX_INSTR psadbw, mmx2, 0, 0, 1 1758AVX_INSTR pshufb, ssse3, 0, 0, 0 1759AVX_INSTR pshufd, sse2 1760AVX_INSTR pshufhw, sse2 1761AVX_INSTR pshuflw, sse2 1762AVX_INSTR psignb, ssse3, 0, 0, 0 1763AVX_INSTR psignd, ssse3, 0, 0, 0 1764AVX_INSTR psignw, ssse3, 0, 0, 0 1765AVX_INSTR pslld, mmx, 0, 0, 0 1766AVX_INSTR pslldq, sse2, 0, 0, 0 1767AVX_INSTR psllq, mmx, 0, 0, 0 1768AVX_INSTR psllw, mmx, 0, 0, 0 1769AVX_INSTR psrad, mmx, 0, 0, 0 1770AVX_INSTR psraw, mmx, 0, 0, 0 1771AVX_INSTR psrld, mmx, 0, 0, 0 1772AVX_INSTR psrldq, sse2, 0, 0, 0 1773AVX_INSTR psrlq, mmx, 0, 0, 0 1774AVX_INSTR psrlw, mmx, 0, 0, 0 1775AVX_INSTR psubb, mmx, 0, 0, 0 1776AVX_INSTR psubd, mmx, 0, 0, 0 1777AVX_INSTR psubq, sse2, 0, 0, 0 
1778AVX_INSTR psubsb, mmx, 0, 0, 0 1779AVX_INSTR psubsw, mmx, 0, 0, 0 1780AVX_INSTR psubusb, mmx, 0, 0, 0 1781AVX_INSTR psubusw, mmx, 0, 0, 0 1782AVX_INSTR psubw, mmx, 0, 0, 0 1783AVX_INSTR ptest, sse4 1784AVX_INSTR punpckhbw, mmx, 0, 0, 0 1785AVX_INSTR punpckhdq, mmx, 0, 0, 0 1786AVX_INSTR punpckhqdq, sse2, 0, 0, 0 1787AVX_INSTR punpckhwd, mmx, 0, 0, 0 1788AVX_INSTR punpcklbw, mmx, 0, 0, 0 1789AVX_INSTR punpckldq, mmx, 0, 0, 0 1790AVX_INSTR punpcklqdq, sse2, 0, 0, 0 1791AVX_INSTR punpcklwd, mmx, 0, 0, 0 1792AVX_INSTR pxor, mmx, 0, 0, 1 1793AVX_INSTR rcpps, sse, 1 1794AVX_INSTR rcpss, sse, 1, 0, 0 1795AVX_INSTR roundpd, sse4, 1 1796AVX_INSTR roundps, sse4, 1 1797AVX_INSTR roundsd, sse4, 1, 1, 0 1798AVX_INSTR roundss, sse4, 1, 1, 0 1799AVX_INSTR rsqrtps, sse, 1 1800AVX_INSTR rsqrtss, sse, 1, 0, 0 1801AVX_INSTR shufpd, sse2, 1, 1, 0 1802AVX_INSTR shufps, sse, 1, 1, 0 1803AVX_INSTR sqrtpd, sse2, 1 1804AVX_INSTR sqrtps, sse, 1 1805AVX_INSTR sqrtsd, sse2, 1, 0, 0 1806AVX_INSTR sqrtss, sse, 1, 0, 0 1807AVX_INSTR stmxcsr, sse, 1 1808AVX_INSTR subpd, sse2, 1, 0, 0 1809AVX_INSTR subps, sse, 1, 0, 0 1810AVX_INSTR subsd, sse2, 1, 0, 0 1811AVX_INSTR subss, sse, 1, 0, 0 1812AVX_INSTR ucomisd, sse2, 1 1813AVX_INSTR ucomiss, sse, 1 1814AVX_INSTR unpckhpd, sse2, 1, 0, 0 1815AVX_INSTR unpckhps, sse, 1, 0, 0 1816AVX_INSTR unpcklpd, sse2, 1, 0, 0 1817AVX_INSTR unpcklps, sse, 1, 0, 0 1818AVX_INSTR xorpd, sse2, 1, 0, 1 1819AVX_INSTR xorps, sse, 1, 0, 1 1820 1821; 3DNow instructions, for sharing code between AVX, SSE and 3DN 1822AVX_INSTR pfadd, 3dnow, 1, 0, 1 1823AVX_INSTR pfmul, 3dnow, 1, 0, 1 1824AVX_INSTR pfsub, 3dnow, 1, 0, 0 1825 1826;%1 == instruction 1827;%2 == minimal instruction set 1828%macro GPR_INSTR 2 1829 %macro %1 2-5 fnord, %1, %2 1830 %ifdef cpuname 1831 %if notcpuflag(%5) 1832 %error use of ``%4'' %5 instruction in cpuname function: current_function 1833 %endif 1834 %endif 1835 %ifidn %3, fnord 1836 %4 %1, %2 1837 %else 1838 %4 %1, %2, %3 1839 %endif 1840 %endmacro 1841%endmacro 1842 1843GPR_INSTR andn, bmi1 1844GPR_INSTR bextr, bmi1 1845GPR_INSTR blsi, bmi1 1846GPR_INSTR blsmsk, bmi1 1847GPR_INSTR blsr, bmi1 1848GPR_INSTR bzhi, bmi2 1849GPR_INSTR crc32, sse42 1850GPR_INSTR mulx, bmi2 1851GPR_INSTR pdep, bmi2 1852GPR_INSTR pext, bmi2 1853GPR_INSTR popcnt, sse42 1854GPR_INSTR rorx, bmi2 1855GPR_INSTR sarx, bmi2 1856GPR_INSTR shlx, bmi2 1857GPR_INSTR shrx, bmi2 1858 1859; base-4 constants for shuffles 1860%assign i 0 1861%rep 256 1862 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) 1863 %if j < 10 1864 CAT_XDEFINE q000, j, i 1865 %elif j < 100 1866 CAT_XDEFINE q00, j, i 1867 %elif j < 1000 1868 CAT_XDEFINE q0, j, i 1869 %else 1870 CAT_XDEFINE q, j, i 1871 %endif 1872 %assign i i+1 1873%endrep 1874%undef i 1875%undef j 1876 1877%macro FMA_INSTR 3 1878 %macro %1 4-7 %1, %2, %3 1879 %if cpuflag(xop) 1880 v%5 %1, %2, %3, %4 1881 %elifnidn %1, %4 1882 %6 %1, %2, %3 1883 %7 %1, %4 1884 %else 1885 %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported 1886 %endif 1887 %endmacro 1888%endmacro 1889 1890FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation 1891FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation 1892FMA_INSTR pmacsww, pmullw, paddw 1893FMA_INSTR pmadcswd, pmaddwd, paddd 1894 1895; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. 1896; FMA3 is only possible if dst is the same as one of the src registers. 1897; Either src2 or src3 can be a memory operand. 
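; For example (illustrative): in an fma3 function, "fmaddps m0, m1, m2, m0"
; (i.e. m0 = m1*m2 + m0) is emitted as "vfmadd231ps m0, m1, m2" since dst
; matches src3, while in an fma4 function the same line simply emits
; "vfmaddps m0, m1, m2, m0".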
1898%macro FMA4_INSTR 2-* 1899 %push fma4_instr 1900 %xdefine %$prefix %1 1901 %rep %0 - 1 1902 %macro %$prefix%2 4-6 %$prefix, %2 1903 %if notcpuflag(fma3) && notcpuflag(fma4) 1904 %error use of ``%5%6'' fma instruction in cpuname function: current_function 1905 %elif cpuflag(fma4) 1906 v%5%6 %1, %2, %3, %4 1907 %elifidn %1, %2 1908 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 1909 %ifnum sizeof%3 1910 v%{5}213%6 %2, %3, %4 1911 %else 1912 v%{5}132%6 %2, %4, %3 1913 %endif 1914 %elifidn %1, %3 1915 v%{5}213%6 %3, %2, %4 1916 %elifidn %1, %4 1917 v%{5}231%6 %4, %2, %3 1918 %else 1919 %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported 1920 %endif 1921 %endmacro 1922 %rotate 1 1923 %endrep 1924 %pop 1925%endmacro 1926 1927FMA4_INSTR fmadd, pd, ps, sd, ss 1928FMA4_INSTR fmaddsub, pd, ps 1929FMA4_INSTR fmsub, pd, ps, sd, ss 1930FMA4_INSTR fmsubadd, pd, ps 1931FMA4_INSTR fnmadd, pd, ps, sd, ss 1932FMA4_INSTR fnmsub, pd, ps, sd, ss 1933 1934; Macros for converting VEX instructions to equivalent EVEX ones. 1935%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex 1936 %macro %1 2-7 fnord, fnord, %1, %2, %3 1937 %ifidn %3, fnord 1938 %define %%args %1, %2 1939 %elifidn %4, fnord 1940 %define %%args %1, %2, %3 1941 %else 1942 %define %%args %1, %2, %3, %4 1943 %endif 1944 %assign %%evex_required cpuflag(avx512) & %7 1945 %ifnum regnumof%1 1946 %if regnumof%1 >= 16 || sizeof%1 > 32 1947 %assign %%evex_required 1 1948 %endif 1949 %endif 1950 %ifnum regnumof%2 1951 %if regnumof%2 >= 16 || sizeof%2 > 32 1952 %assign %%evex_required 1 1953 %endif 1954 %endif 1955 %ifnum regnumof%3 1956 %if regnumof%3 >= 16 || sizeof%3 > 32 1957 %assign %%evex_required 1 1958 %endif 1959 %endif 1960 %if %%evex_required 1961 %6 %%args 1962 %else 1963 %5 %%args ; Prefer VEX over EVEX due to shorter instruction length 1964 %endif 1965 %endmacro 1966%endmacro 1967 1968EVEX_INSTR vbroadcastf128, vbroadcastf32x4 1969EVEX_INSTR vbroadcasti128, vbroadcasti32x4 1970EVEX_INSTR vextractf128, vextractf32x4 1971EVEX_INSTR vextracti128, vextracti32x4 1972EVEX_INSTR vinsertf128, vinsertf32x4 1973EVEX_INSTR vinserti128, vinserti32x4 1974EVEX_INSTR vmovdqa, vmovdqa32 1975EVEX_INSTR vmovdqu, vmovdqu32 1976EVEX_INSTR vpand, vpandd 1977EVEX_INSTR vpandn, vpandnd 1978EVEX_INSTR vpor, vpord 1979EVEX_INSTR vpxor, vpxord 1980EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision 1981EVEX_INSTR vrcpss, vrcp14ss, 1 1982EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 1983EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 1984
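; For example (illustrative): with the definitions above, "vmovdqa m20, m21"
; inside an avx512 function is emitted as vmovdqa32 since registers 16-31
; require EVEX, "vmovdqa m0, m1" keeps the shorter VEX encoding, and vrcpps
; always becomes vrcp14ps when avx512 is enabled because it is marked as
; prefer_evex.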