;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2024 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x86inc.asm abstraction layer, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used.

%ifndef private_prefix
    %error private_prefix not defined
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
    %define FORCE_VEX_ENCODING 0
%endif

%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
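; Usage sketch (the constant name below is hypothetical): begins a read-only
; data section with the requested alignment, in the format-appropriate section:
;     SECTION_RODATA 32         ; .rodata (or .rdata on Windows), align=32
;     pw_one: times 16 dw 1     ; 32 bytes of constants, naturally aligned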

%if ARCH_X86_64
    %define PIC 1 ; always use PIC on x86-64
    default rel
%elifidn __OUTPUT_FORMAT__,win32
    %define PIC 0 ; PIC isn't used on 32-bit Windows
%elifndef PIC
    %define PIC 0
%endif

%define HAVE_PRIVATE_EXTERN 1
%ifdef __NASM_VERSION_ID__
    %use smartalign
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most use cases.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.
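; A minimal end-to-end sketch (hypothetical function body), assuming
; INIT_XMM sse2 is in effect:
;     cglobal foo, 2,3,7,0x40, dst, src, tmp
;         mov  tmpd, srcd      ; dst/src/tmp name the registers set up above
;         mova m0, [srcq]
;         mova [dstq], m0
;         RET                  ; restores whatever PROLOGUE pushed/allocated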

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro LEA 2
%if ARCH_X86_64
    lea %1, [%2]
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPUs
    pop %1
    add %1, -$+1+%2
%else
    mov %1, %2
%endif
%endmacro
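; Usage sketch (hypothetical symbol name): loads the address of a symbol in a
; way that works for all of the ARCH/PIC combinations above, e.g.
;     LEA r5, pw_one ; r5 = &pw_one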

; Repeats an instruction/operation for multiple arguments.
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
%macro REPX 2-* ; operation, args
    %xdefine %%f(x) %1
    %rep %0 - 1
        %rotate 1
        %%f(%1)
    %endrep
%endmacro
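; The example above expands to one copy of the operation per argument:
;     psrlw m0, 8
;     psrlw m1, 8
;     psrlw m2, 8
;     psrlw m3, 8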

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%if ARCH_X86_64 == 0
    %define movsxd movifnidn
%endif

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
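; Usage sketch (hypothetical names): re-purpose the argument registers
; mid-function once the original arguments are no longer needed:
;     DEFINE_ARGS dst, stride, cnt ; now dstq==r0, strideq==r1, cntd==r2d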

%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
    %assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
    %assign STACK_PROBE_SIZE 4096
%else
    %assign STACK_PROBE_SIZE 0
%endif

%macro PROBE_STACK 1 ; stack_size
    %if STACK_PROBE_SIZE
        %assign %%i STACK_PROBE_SIZE
        %rep %1 / STACK_PROBE_SIZE
            mov eax, [rsp-%%i]
            %assign %%i %%i+STACK_PROBE_SIZE
        %endrep
    %endif
%endmacro

%macro RESET_STACK_STATE 0
    %ifidn rstk, rsp
        %assign stack_offset stack_offset - stack_size_padded
    %else
        %xdefine rstk rsp
    %endif
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs
    RESET_STACK_STATE
    %ifnum %2
        %if mmsize != 8
            %assign xmm_regs_used %2
        %endif
    %endif
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if xmm_regs_used > 8
                    %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                PROBE_STACK stack_size_padded
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                PROBE_STACK stack_size_padded
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 0-1 0
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

; Push XMM registers to the stack. If no argument is specified, all used registers
; will be pushed, otherwise only push previously unpushed registers.
%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
    %if mmsize != 8
        %if %0 == 2
            %assign %%pushed %2
            %assign xmm_regs_used %1
        %elif %0 == 1
            %assign %%pushed xmm_regs_used
            %assign xmm_regs_used %1
        %else
            %assign %%pushed 0
        %endif
        ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
        %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
            movaps [rstk + stack_offset +  8], xmm6
        %endif
        %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
            movaps [rstk + stack_offset + 24], xmm7
        %endif
        %assign %%pushed %%pushed - high_mm_regs - 8
        %if %%pushed < 0
            %assign %%pushed 0
        %endif
        %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
        %if %%regs_to_push > 0
            ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
            %assign %%i %%pushed + 8
            %rep %%regs_to_push
                movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
                %assign %%i %%i+1
            %endrep
        %endif
    %endif
%endmacro

; Allocate stack space for XMM registers and push all, or a subset, of those
%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
    RESET_STACK_STATE
    %if mmsize != 8
        %assign xmm_regs_used %1
        ASSERT xmm_regs_used <= 16 + high_mm_regs
        %if %0 == 2
            ASSERT %2 >= %1
            %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
        %else
            %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
        %endif
        %if %%xmm_regs_on_stack > 0
            ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
            %assign %%pad %%xmm_regs_on_stack*16 + 32
            %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
            SUB rsp, stack_size_padded
        %endif
        WIN64_PUSH_XMM
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    RESET_STACK_STATE
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4, %3
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1-2
        RESET_STACK_STATE
        %if mmsize != 8
            %assign xmm_regs_used %1
        %endif
    %endmacro
    %macro WIN64_RESTORE_XMM 0
        RESET_STACK_STATE
    %endmacro
    %macro WIN64_PUSH_XMM 0-2
        %if mmsize != 8 && %0 >= 1
            %assign xmm_regs_used %1
        %endif
    %endmacro
%endif

; On AMD CPUs <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your CPU doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function hidden
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 2
    extern %1
%endmacro

; Like cextern, but without the prefix. This should be used for symbols from external libraries.
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 3
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
    %1: %2
%endmacro

%if FORMAT_ELF
    ; The GNU linker assumes the stack is executable by default.
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]

    %ifdef __NASM_VERSION_ID__
        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
            %if ARCH_X86_64
                ; Control-flow Enforcement Technology (CET) properties.
                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
                dd 0x00000004  ; n_namesz
                dd gprsize + 8 ; n_descsz
                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
                db "GNU",0     ; n_name
                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
                dd 0x00000004  ; pr_datasz
                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
                dd 0x00000000  ; pr_padding
            %endif
        %endif
    %endif
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx       (1<<0)
%assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
%assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
%assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
%assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
%assign cpuflags_sse2      (1<<5)  | cpuflags_sse
%assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
%assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
%assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
%assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
%assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42     (1<<11) | cpuflags_sse4
%assign cpuflags_aesni     (1<<12) | cpuflags_sse42
%assign cpuflags_clmul     (1<<13) | cpuflags_sse42
%assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
%assign cpuflags_avx       (1<<15) | cpuflags_sse42
%assign cpuflags_xop       (1<<16) | cpuflags_avx
%assign cpuflags_fma4      (1<<17) | cpuflags_avx
%assign cpuflags_fma3      (1<<18) | cpuflags_avx
%assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
%assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ

%assign cpuflags_cache32   (1<<24)
%assign cpuflags_cache64   (1<<25)
%assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
%assign cpuflags_atom      (1<<27)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
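; For example, in an sse2 build cpuflag(sse) is 1 because cpuflags_sse2
; includes all of cpuflags_sse: the masked bits equal the flag, the xor yields
; 0, and (0-1)>>31 & 1 evaluates to 1. For a flag with any bit missing from
; cpuflags, the xor is a nonzero positive value and the expression yields 0.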

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
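; e.g. after INIT_YMM avx2, m0 refers to ymm0 while xm0 refers to xmm0; after
; INIT_XMM sse2, m0, xm0, ym0 and zm0 all refer to xmm0.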

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %if %%prev_mmregs > num_mmregs
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i
            %assign %%i %%i+1
        %endrep
    %endif
    %xdefine mmtype %1
%endmacro

; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
    %if ARCH_X86_64 && cpuflag(avx512)
        %assign %%i %1
        %rep 16-%1
            %assign %%i_high %%i+16
            SWAP %%i, %%i_high
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS mm
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled FORCE_VEX_ENCODING
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS xmm
    %if WIN64
        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
    %endif
    %xdefine bcstw 1to8
    %xdefine bcstd 1to4
    %xdefine bcstq 1to2
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS ymm
    AVX512_MM_PERMUTATION
    %xdefine bcstw 1to16
    %xdefine bcstd 1to8
    %xdefine bcstq 1to4
%endmacro

%macro INIT_ZMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_ZMM %1
    %define mmsize 64
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS zmm
    AVX512_MM_PERMUTATION
    %xdefine bcstw 1to32
    %xdefine bcstd 1to16
    %xdefine bcstq 1to8
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define  mmzmm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define xmmzmm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymmzmm%1 ymm%1
    %define zmmmm%1   mm%1
    %define zmmxmm%1 xmm%1
    %define zmmymm%1 ymm%1
    %define zmmzmm%1 zmm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
    %define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
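; e.g. after SWAP 0, 1 the name m0 refers to the register previously named m1
; and vice versa; all subsequent code using m0/m1 picks up the renamed
; registers at zero runtime cost.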

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
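; A usage sketch (hypothetical helper name): a function that leaves its result
; in a permuted register, with the permutation reloaded at each call site:
;     cglobal helper_internal
;         ...
;         SWAP 0, 2              ; result now lives in (the renamed) m0
;         SAVE_MM_PERMUTATION
;         ret
;     ; elsewhere: "call helper_internal" restores the saved permutation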
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        %xdefine %%tmp m %+ %%i
        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %xdefine %%tmp %%f %+ 0
    %ifnum %%tmp
        DEFINE_MMREGS mmtype
        %assign %%i 0
        %rep num_mmregs
            %xdefine %%tmp %%f %+ %%i
            CAT_XDEFINE %%m, %%i, m %+ %%tmp
            %assign %%i %%i+1
        %endrep
        %rep num_mmregs
            %assign %%i %%i-1
            CAT_XDEFINE m, %%i, %%m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %define %%j %%i
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %elif FORMAT_ELF
        %if ARCH_X86_64
            %if cglobaled_%2 >= 2
                ; Always emit PLT relocations when calling external functions,
                ; the linker will eliminate unnecessary PLT indirections anyway.
                %define %%j %%i wrt ..plt
            %endif
        %elif PIC && cglobaled_%2 == 3
            ; Go through the GOT for functions declared using cextern_naked with
            ; PIC, as such functions presumably exist in external libraries.
            extern _GLOBAL_OFFSET_TABLE_
            LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc
            %define %%j [eax+%%i wrt ..got]
        %endif
    %endif
    call %%j
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
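; The payoff: x86 sign-extends 8-bit immediates, so -128 fits in one byte
; while +128 needs a full 4-byte immediate. e.g. "add eax, 128" encodes in
; 5 bytes, whereas the equivalent "sub eax, -128" encodes in 3.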

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 32
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
        CAT_XDEFINE regnumofmm, i, i
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    CAT_XDEFINE sizeofzmm, i, 64
    CAT_XDEFINE regnumofxmm, i, i
    CAT_XDEFINE regnumofymm, i, i
    CAT_XDEFINE regnumofzmm, i, i
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
                %error use of ``%1'' avx2 instruction in cpuname function: current_function
            %elif __sizeofreg == 16 && notcpuflag(sse)
                %error use of ``%1'' sse instruction in cpuname function: current_function
            %elif __sizeofreg == 32 && notcpuflag(avx)
                %error use of ``%1'' avx instruction in cpuname function: current_function
            %elif __sizeofreg == 64 && notcpuflag(avx512)
                %error use of ``%1'' avx512 instruction in cpuname function: current_function
            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
                %ifnid %6       ; but sse4 is required for memory operands
                    %if notcpuflag(sse4)
                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
                    %endif
                %endif
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %if %5 && %4 == 0
            %ifnidn %6, %7
                %ifidn %6, %8
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %elifnnum sizeof%8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
        %endif
        %ifnidn %6, __src1
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
            %endif
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
            %ifnnum regnumof%7
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8, %9
            %else
                __instr %6, %7, %8, %9
            %endif
        %else
            __instr %6, %7, %8, %9
        %endif
    %elif %0 == 8
        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
            %xdefine __src1 %7
            %xdefine __src2 %8
            %if %5
                %ifnum regnumof%7
                    %ifnum regnumof%8
                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
                            ; Most VEX-encoded instructions require an additional byte to encode when
                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
                            ; we can swap src1 and src2 when doing so reduces the instruction length.
                            %xdefine __src1 %8
                            %xdefine __src2 %7
                        %endif
                    %endif
                %elifnum regnumof%8 ; put memory operands in src2 when possible
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %else
                    %assign __emulate_avx 1
                %endif
            %elifnnum regnumof%7
                ; EVEX allows imm8 shift instructions to be used with memory operands,
                ; but VEX does not. This handles those special cases.
                %ifnnum %8
                    %assign __emulate_avx 1
                %elif notcpuflag(avx512)
                    %assign __emulate_avx 1
                %endif
            %endif
            %if __emulate_avx ; a separate load is required
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8
            %else
                __instr %6, __src1, __src2
            %endif
        %else
            __instr %6, %7, %8
        %endif
    %elif %0 == 7
        %if avx_enabled && __sizeofreg >= 16 && %5
            %xdefine __src1 %6
            %xdefine __src2 %7
            %ifnum regnumof%6
                %ifnum regnumof%7
                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
                        %xdefine __src1 %7
                        %xdefine __src2 %6
                    %endif
                %endif
            %endif
            __instr %6, __src1, __src2
        %else
            __instr %6, %7
        %endif
    %else
        __instr %6
    %endif
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
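; e.g. with INIT_XMM avx, "paddw m0, m1, m2" assembles as the VEX form
; "vpaddw xmm0, xmm1, xmm2"; with INIT_XMM sse2 the same line is emulated as
; "movdqa xmm0, xmm1" followed by "paddw xmm0, xmm2".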
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2, 1
AVX_INSTR comiss, sse, 1
AVX_INSTR cvtdq2pd, sse2, 1
AVX_INSTR cvtdq2ps, sse2, 1
AVX_INSTR cvtpd2dq, sse2, 1
AVX_INSTR cvtpd2ps, sse2, 1
AVX_INSTR cvtps2dq, sse2, 1
AVX_INSTR cvtps2pd, sse2, 1
AVX_INSTR cvtsd2si, sse2, 1
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse, 1
AVX_INSTR cvttpd2dq, sse2, 1
AVX_INSTR cvttps2dq, sse2, 1
AVX_INSTR cvttsd2si, sse2, 1
AVX_INSTR cvttss2si, sse, 1
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4, 1
AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse, 1
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2, 1
AVX_INSTR movaps, sse, 1
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3, 1
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2, 1
AVX_INSTR movmskps, sse, 1
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2, 1
AVX_INSTR movntps, sse, 1
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3, 1
AVX_INSTR movsldup, sse3, 1
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2, 1
AVX_INSTR movups, sse, 1
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
1792AVX_INSTR pxor, mmx, 0, 0, 1
1793AVX_INSTR rcpps, sse, 1
1794AVX_INSTR rcpss, sse, 1, 0, 0
1795AVX_INSTR roundpd, sse4, 1
1796AVX_INSTR roundps, sse4, 1
1797AVX_INSTR roundsd, sse4, 1, 1, 0
1798AVX_INSTR roundss, sse4, 1, 1, 0
1799AVX_INSTR rsqrtps, sse, 1
1800AVX_INSTR rsqrtss, sse, 1, 0, 0
1801AVX_INSTR shufpd, sse2, 1, 1, 0
1802AVX_INSTR shufps, sse, 1, 1, 0
1803AVX_INSTR sqrtpd, sse2, 1
1804AVX_INSTR sqrtps, sse, 1
1805AVX_INSTR sqrtsd, sse2, 1, 0, 0
1806AVX_INSTR sqrtss, sse, 1, 0, 0
1807AVX_INSTR stmxcsr, sse, 1
1808AVX_INSTR subpd, sse2, 1, 0, 0
1809AVX_INSTR subps, sse, 1, 0, 0
1810AVX_INSTR subsd, sse2, 1, 0, 0
1811AVX_INSTR subss, sse, 1, 0, 0
1812AVX_INSTR ucomisd, sse2, 1
1813AVX_INSTR ucomiss, sse, 1
1814AVX_INSTR unpckhpd, sse2, 1, 0, 0
1815AVX_INSTR unpckhps, sse, 1, 0, 0
1816AVX_INSTR unpcklpd, sse2, 1, 0, 0
1817AVX_INSTR unpcklps, sse, 1, 0, 0
1818AVX_INSTR xorpd, sse2, 1, 0, 1
1819AVX_INSTR xorps, sse, 1, 0, 1
1820
; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfmul, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0

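; Illustrative example (not part of the macro set; register names are
; hypothetical): the AVX_INSTR wrappers above accept 3-operand AVX-style
; syntax in pre-AVX functions and insert a register move when required.
;
;   INIT_XMM sse2
;   paddw m0, m1, m2 ; sse2: assembles as "mova m0, m1" + "paddw m0, m2"
;                    ; avx:  assembles directly as "vpaddw xm0, xm1, xm2"
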
;%1 == instruction
;%2 == minimal instruction set
%macro GPR_INSTR 2
    %macro %1 2-5 fnord, %1, %2
        %ifdef cpuname
            %if notcpuflag(%5)
                %error use of ``%4'' %5 instruction in cpuname function: current_function
            %endif
        %endif
        %ifidn %3, fnord
            %4 %1, %2
        %else
            %4 %1, %2, %3
        %endif
    %endmacro
%endmacro

GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2

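; Illustrative example (hypothetical function; register names come from
; cglobal): inside a function declared for a given instruction set, these
; wrappers fail at assembly time if a newer instruction sneaks in.
;
;   INIT_XMM sse42
;   cglobal crc_example, 2, 2, 0, buf, crc
;       crc32 crcd, byte [bufq] ; ok: crc32 only requires sse42
;       ;shlx crcq, crcq, bufq  ; would %error: bmi2 instruction in an sse42 function
;       RET
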
; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

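; Illustrative example (m0/m1 are hypothetical, previously initialized
; registers): the q#### symbols spell a shuffle immediate in base 4, with the
; selector for the highest destination lane written first.
;
;   pshufd m0, m1, q3120 ; dest dwords 3,2,1,0 take src dwords 3,1,2,0 (= 0xd8)
;   pshufd m0, m1, q1032 ; swap the two qword halves (= 0x4e)
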
;%1 == instruction
;%2 == multiply instruction used for non-xop emulation
;%3 == add instruction used for non-xop emulation
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd,  pmulld,  paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq,  paddq ; sse4 emulation
FMA_INSTR pmacsww,  pmullw,  paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
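
; Illustrative example (hypothetical registers): without xop these expand to
; the multiply/add pair, so dst must differ from the addend operand.
;
;   pmacsww m0, m1, m2, m3 ; xop:     vpmacsww m0, m1, m2, m3
;                          ; non-xop: pmullw m0, m1, m2 + paddw m0, m3
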
; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd,    pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub,    pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd,   pd, ps, sd, ss
FMA4_INSTR fnmsub,   pd, ps, sd, ss

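; Illustrative example (hypothetical registers): the same 4-operand source
; assembles as FMA4 directly, or as the matching FMA3 form when dst aliases
; one of the sources.
;
;   fmaddps m0, m0, m1, m2 ; fma4: vfmaddps m0, m0, m1, m2
;                          ; fma3: vfmadd213ps m0, m1, m2 (m0 = m0*m1 + m2)
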
; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%3
            %if regnumof%3 >= 16 || sizeof%3 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro

EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128,   vextractf32x4
EVEX_INSTR vextracti128,   vextracti32x4
EVEX_INSTR vinsertf128,    vinsertf32x4
EVEX_INSTR vinserti128,    vinserti32x4
EVEX_INSTR vmovdqa,        vmovdqa32
EVEX_INSTR vmovdqu,        vmovdqu32
EVEX_INSTR vpand,          vpandd
EVEX_INSTR vpandn,         vpandnd
EVEX_INSTR vpor,           vpord
EVEX_INSTR vpxor,          vpxord
EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss,         vrcp14ss,   1
EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1

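; Illustrative example (hypothetical registers, inside an AVX-512 function):
; the EVEX form is chosen automatically when the VEX encoding cannot express
; the operands (register 16+ or a 512-bit operand); otherwise the shorter
; VEX encoding is kept.
;
;   vmovdqa  m0, [r0] ; register 0-15:  stays VEX vmovdqa
;   vmovdqa m16, [r0] ; register 16-31: becomes EVEX vmovdqa32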