1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5/*
6Package arm64 implements an ARM64 assembler. Go assembly syntax is different from GNU ARM64
7syntax, but we can still follow the general rules to map between them.
8
9# Instruction mnemonics mapping rules
10
111. Most instructions use width suffixes of instruction names to indicate operand width rather than
12using different register names.
13
14Examples:
15
16	ADC R24, R14, R12          <=>     adc x12, x14, x24
17	ADDW R26->24, R21, R15     <=>     add w15, w21, w26, asr #24
18	FCMPS F2, F3               <=>     fcmp s3, s2
19	FCMPD F2, F3               <=>     fcmp d3, d2
20	FCVTDH F2, F3              <=>     fcvt h3, d2
21
222. Go uses .P and .W suffixes to indicate post-increment and pre-increment.
23
24Examples:
25
26	MOVD.P -8(R10), R8         <=>      ldr x8, [x10],#-8
27	MOVB.W 16(R16), R10        <=>      ldrsb x10, [x16,#16]!
28	MOVBU.W 16(R16), R10       <=>      ldrb w10, [x16,#16]!
29
303. Go uses a series of MOV instructions as load and store.
31
3264-bit variant ldr, str, stur => MOVD;
3332-bit variant str, stur, ldrsw => MOVW;
3432-bit variant ldr => MOVWU;
35ldrb => MOVBU; ldrh => MOVHU;
36ldrsb, sturb, strb => MOVB;
37ldrsh, sturh, strh =>  MOVH.
38
394. Go moves conditions into opcode suffix, like BLT.
40
415. Go adds a V prefix for most floating-point and SIMD instructions, except cryptographic extension
42instructions and floating-point(scalar) instructions.
43
44Examples:
45
46	VADD V5.H8, V18.H8, V9.H8         <=>      add v9.8h, v18.8h, v5.8h
47	VLD1.P (R6)(R11), [V31.D1]        <=>      ld1 {v31.1d}, [x6], x11
48	VFMLA V29.S2, V20.S2, V14.S2      <=>      fmla v14.2s, v20.2s, v29.2s
49	AESD V22.B16, V19.B16             <=>      aesd v19.16b, v22.16b
50	SCVTFWS R3, F16                   <=>      scvtf s16, w3
51
526. Align directive
53
54Go asm supports the PCALIGN directive, which indicates that the next instruction should be aligned
55to a specified boundary by padding with NOOP instructions. The alignment value supported on arm64
56must be a power of 2 and in the range of [8, 2048].
57
58Examples:
59
60	PCALIGN $16
61	MOVD $2, R0          // This instruction is aligned with 16 bytes.
62	PCALIGN $1024
63	MOVD $3, R1          // This instruction is aligned with 1024 bytes.
64
65PCALIGN also changes the function alignment. If a function has one or more PCALIGN directives,
66its address will be aligned to the same or coarser boundary, which is the maximum of all the
67alignment values.
68
69In the following example, the function Add is aligned with 128 bytes.
70
71Examples:
72
73	TEXT ·Add(SB),$40-16
74	MOVD $2, R0
75	PCALIGN $32
76	MOVD $4, R1
77	PCALIGN $128
78	MOVD $8, R2
79	RET
80
81On arm64, functions in Go are aligned to 16 bytes by default; we can also use PCALIGN to set the
82function alignment. Functions that need to be aligned should preferably use NOFRAME and NOSPLIT
83to avoid the impact of the prologues inserted by the assembler, so that the function address will
84have the same alignment as the first hand-written instruction.
85
86In the following example, PCALIGN at the entry of the function Add will align its address to 2048 bytes.
87
88Examples:
89
90	TEXT ·Add(SB),NOSPLIT|NOFRAME,$0
91	  PCALIGN $2048
92	  MOVD $1, R0
93	  MOVD $1, R1
94	  RET
95
967. Move large constants to vector registers.
97
98Go asm uses VMOVQ/VMOVD/VMOVS to move 128-bit, 64-bit and 32-bit constants into vector registers, respectively.
99And for a 128-bit integer, it takes two 64-bit operands, for the low and high parts separately.
100
101Examples:
102
103	VMOVS $0x11223344, V0
104	VMOVD $0x1122334455667788, V1
105	VMOVQ $0x1122334455667788, $0x99aabbccddeeff00, V2   // V2=0x99aabbccddeeff001122334455667788
106
1078. Move an optionally-shifted 16-bit immediate value to a register.
108
109The instructions are MOVK(W), MOVZ(W) and MOVN(W), the assembly syntax is "op $(uimm16<<shift), <Rd>". The <uimm16>
110is the 16-bit unsigned immediate, in the range 0 to 65535. For the 32-bit variant, the <shift> is 0 or 16; for the
11164-bit variant, the <shift> is 0, 16, 32 or 48.
112
113The current Go assembler does not accept zero shifts, such as "op $0, Rd" and "op $(0<<(16|32|48)), Rd" instructions.
114
115Examples:
116
117	MOVK $(10<<32), R20     <=>      movk x20, #10, lsl #32
118	MOVZW $(20<<16), R8     <=>      movz w8, #20, lsl #16
119	MOVK $(0<<16), R10 will be reported as an error by the assembler.
120
121Special Cases.
122
123(1) umov is written as VMOV.
124
125(2) br is renamed JMP, blr is renamed CALL.
126
127(3) No need to add "W" suffix: LDARB, LDARH, LDAXRB, LDAXRH, LDTRH, LDXRB, LDXRH.
128
129(4) In Go assembly syntax, NOP is a zero-width pseudo-instruction that serves a generic purpose and is
130unrelated to any real ARM64 instruction. NOOP stands for the hardware nop instruction. NOOP is an alias of
131HINT $0.
132
133Examples:
134
135	VMOV V13.B[1], R20      <=>      umov w20, v13.b[1]
136	VMOV V13.H[1], R20      <=>      umov w20, v13.h[1]
137	JMP (R3)                <=>      br x3
138	CALL (R17)              <=>      blr x17
139	LDAXRB (R19), R16       <=>      ldaxrb w16, [x19]
140	NOOP                    <=>      nop
141
142# Register mapping rules
143
1441. All basic register names are written as Rn.
145
1462. Go uses ZR as the zero register and RSP as the stack pointer.
147
1483. Bn, Hn, Dn, Sn and Qn registers are written as Fn in floating-point instructions and as Vn
149in SIMD instructions.
150
151# Argument mapping rules
152
1531. The operands appear in left-to-right assignment order.
154
155Go reverses the arguments of most instructions.
156
157Examples:
158
159	ADD R11.SXTB<<1, RSP, R25      <=>      add x25, sp, w11, sxtb #1
160	VADD V16, V19, V14             <=>      add d14, d19, d16
161
162Special Cases.
163
164(1) Argument order is the same as in the GNU ARM64 syntax: cbz, cbnz and some store instructions,
165such as str, stur, strb, sturb, strh, sturh, stlr, stlrb, stlrh, st1.
166
167Examples:
168
169	MOVD R29, 384(R19)    <=>    str x29, [x19,#384]
170	MOVB.P R30, 30(R4)    <=>    strb w30, [x4],#30
171	STLRH R21, (R19)      <=>    stlrh w21, [x19]
172
173(2) MADD, MADDW, MSUB, MSUBW, SMADDL, SMSUBL, UMADDL, UMSUBL <Rm>, <Ra>, <Rn>, <Rd>
174
175Examples:
176
177	MADD R2, R30, R22, R6       <=>    madd x6, x22, x2, x30
178	SMSUBL R10, R3, R17, R27    <=>    smsubl x27, w17, w10, x3
179
180(3) FMADDD, FMADDS, FMSUBD, FMSUBS, FNMADDD, FNMADDS, FNMSUBD, FNMSUBS <Fm>, <Fa>, <Fn>, <Fd>
181
182Examples:
183
184	FMADDD F30, F20, F3, F29    <=>    fmadd d29, d3, d30, d20
185	FNMSUBS F7, F25, F7, F22    <=>    fnmsub s22, s7, s7, s25
186
187(4) BFI, BFXIL, SBFIZ, SBFX, UBFIZ, UBFX $<lsb>, <Rn>, $<width>, <Rd>
188
189Examples:
190
191	BFIW $16, R20, $6, R0      <=>    bfi w0, w20, #16, #6
192	UBFIZ $34, R26, $5, R20    <=>    ubfiz x20, x26, #34, #5
193
194(5) FCCMPD, FCCMPS, FCCMPED, FCCMPES <cond>, <Fm>, <Fn>, $<nzcv>
195
196Examples:
197
198	FCCMPD AL, F8, F26, $0     <=>    fccmp d26, d8, #0x0, al
199	FCCMPS VS, F29, F4, $4     <=>    fccmp s4, s29, #0x4, vs
200	FCCMPED LE, F20, F5, $13   <=>    fccmpe d5, d20, #0xd, le
201	FCCMPES NE, F26, F10, $0   <=>    fccmpe s10, s26, #0x0, ne
202
203(6) CCMN, CCMNW, CCMP, CCMPW <cond>, <Rn>, $<imm>, $<nzcv>
204
205Examples:
206
207	CCMP MI, R22, $12, $13     <=>    ccmp x22, #0xc, #0xd, mi
208	CCMNW AL, R1, $11, $8      <=>    ccmn w1, #0xb, #0x8, al
209
210(7) CCMN, CCMNW, CCMP, CCMPW <cond>, <Rn>, <Rm>, $<nzcv>
211
212Examples:
213
214	CCMN VS, R13, R22, $10     <=>    ccmn x13, x22, #0xa, vs
215	CCMPW HS, R19, R14, $11    <=>    ccmp w19, w14, #0xb, cs
216
217(9) CSEL, CSELW, CSNEG, CSNEGW, CSINC, CSINCW <cond>, <Rn>, <Rm>, <Rd> ;
218FCSELD, FCSELS <cond>, <Fn>, <Fm>, <Fd>
219
220Examples:
221
222	CSEL GT, R0, R19, R1        <=>    csel x1, x0, x19, gt
223	CSNEGW GT, R7, R17, R8      <=>    csneg w8, w7, w17, gt
224	FCSELD EQ, F15, F18, F16    <=>    fcsel d16, d15, d18, eq
225
226(10) TBNZ, TBZ $<imm>, <Rt>, <label>
227
228(11) STLXR, STLXRW, STXR, STXRW, STLXRB, STLXRH, STXRB, STXRH  <Rf>, (<Rn|RSP>), <Rs>
229
230Examples:
231
232	STLXR ZR, (R15), R16    <=>    stlxr w16, xzr, [x15]
233	STXRB R9, (R21), R19    <=>    stxrb w19, w9, [x21]
234
235(12) STLXP, STLXPW, STXP, STXPW (<Rf1>, <Rf2>), (<Rn|RSP>), <Rs>
236
237Examples:
238
239	STLXP (R17, R19), (R4), R5      <=>    stlxp w5, x17, x19, [x4]
240	STXPW (R30, R25), (R22), R13    <=>    stxp w13, w30, w25, [x22]
241
2422. Expressions for special arguments.
243
244#<immediate> is written as $<immediate>.
245
246Optionally-shifted immediate.
247
248Examples:
249
250	ADD $(3151<<12), R14, R20     <=>    add x20, x14, #0xc4f, lsl #12
251	ADDW $1864, R25, R6           <=>    add w6, w25, #0x748
252
253Optionally-shifted registers are written as <Rm>{<shift><amount>}.
254The <shift> can be <<(lsl), >>(lsr), ->(asr), @>(ror).
255
256Examples:
257
258	ADD R19>>30, R10, R24     <=>    add x24, x10, x19, lsr #30
259	ADDW R26->24, R21, R15    <=>    add w15, w21, w26, asr #24
260
261Extended registers are written as <Rm>{.<extend>{<<<amount>}}.
262<extend> can be UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW or SXTX.
263
264Examples:
265
266	ADDS R19.UXTB<<4, R9, R26     <=>    adds x26, x9, w19, uxtb #4
267	ADDSW R14.SXTX, R14, R6       <=>    adds w6, w14, w14, sxtx
268
269Memory references: [<Xn|SP>{,#0}] is written as (Rn|RSP); a base register and an immediate
270offset is written as imm(Rn|RSP); a base register and an offset register is written as (Rn|RSP)(Rm).
271
272Examples:
273
274	LDAR (R22), R9                  <=>    ldar x9, [x22]
275	LDP 28(R17), (R15, R23)         <=>    ldp x15, x23, [x17,#28]
276	MOVWU (R4)(R12<<2), R8          <=>    ldr w8, [x4, x12, lsl #2]
277	MOVD (R7)(R11.UXTW<<3), R25     <=>    ldr x25, [x7,w11,uxtw #3]
278	MOVBU (R27)(R23), R14           <=>    ldrb w14, [x27,x23]
279
280Register pairs are written as (Rt1, Rt2).
281
282Examples:
283
284	LDP.P -240(R11), (R12, R26)    <=>    ldp x12, x26, [x11],#-240
285
286Register with arrangement and register with arrangement and index.
287
288Examples:
289
290	VADD V5.H8, V18.H8, V9.H8                     <=>    add v9.8h, v18.8h, v5.8h
291	VLD1 (R2), [V21.B16]                          <=>    ld1 {v21.16b}, [x2]
292	VST1.P V9.S[1], (R16)(R21)                    <=>    st1 {v9.s}[1], [x16], x21
293	VST1.P [V13.H8, V14.H8, V15.H8], (R3)(R14)    <=>    st1 {v13.8h-v15.8h}, [x3], x14
294	VST1.P [V14.D1, V15.D1], (R7)(R23)            <=>    st1 {v14.1d, v15.1d}, [x7], x23
295*/
296package arm64
297