// Inferno utils/6l/span.c
// https://bitbucket.org/inferno-os/inferno-os/src/master/utils/6l/span.c
//
//	Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
//	Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
//	Portions Copyright © 1997-1999 Vita Nuova Limited
//	Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
//	Portions Copyright © 2004,2006 Bruce Ellis
//	Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
//	Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
//	Portions Copyright © 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
30
31package x86
32
33import (
34	"cmd/internal/obj"
35	"cmd/internal/objabi"
36	"cmd/internal/sys"
37	"encoding/binary"
38	"fmt"
39	"internal/buildcfg"
40	"log"
41	"strings"
42)
43
44var (
45	plan9privates *obj.LSym
46)
47
// Instruction layout.

// Loop alignment constants:
// want to align loop entry to loopAlign-byte boundary,
// and willing to insert at most maxLoopPad bytes of NOP to do so.
// We define a loop entry as the target of a backward jump.
//
// gcc uses maxLoopPad = 10 for its 'generic x86-64' config,
// and it aligns all jump targets, not just backward jump targets.
//
// As of 6/1/2012, the effect of setting maxLoopPad = 10 here
// is very slight but negative, so the alignment is disabled by
// setting maxLoopPad = 0. The code is here for reference and
// for future experiments.
const (
	loopAlign  = 16 // alignment boundary, in bytes, for loop entries
	maxLoopPad = 0  // maximum NOP bytes inserted per loop entry; 0 disables alignment
)
66
// Bit flags that are used to express jump target properties.
// Each flag occupies its own bit (1, 2, 4), so they can be OR-ed together.
const (
	// branchBackwards marks targets that are located behind.
	// Used to express jumps to loop headers.
	branchBackwards = (1 << iota)
	// branchShort marks branches whose target is close,
	// with an offset in the -128..127 range.
	branchShort
	// branchLoopHead marks loop entry.
	// Used to insert padding for misaligned loops.
	branchLoopHead
)
79
// opBytes holds optab encoding bytes.
// Each ytab line reserves a fixed number of bytes in this array.
//
// The size should be the minimal number of bytes that
// is enough to hold the biggest optab op line.
type opBytes [31]uint8
86
87type Optab struct {
88	as     obj.As
89	ytab   []ytab
90	prefix uint8
91	op     opBytes
92}
93
94type movtab struct {
95	as   obj.As
96	ft   uint8
97	f3t  uint8
98	tt   uint8
99	code uint8
100	op   [4]uint8
101}
102
// Operand classes ("Ytypes"). Every instruction operand is classified into
// one of these values, and each ytab line lists the classes it accepts.
//
// The values are assigned by iota, so inserting or reordering an entry
// renumbers everything after it. Ymax is the number of classes and sizes
// the ycover table (Ymax*Ymax entries).
const (
	Yxxx = iota
	Ynone
	Yi0 // $0
	Yi1 // $1
	Yu2 // $x, x fits in uint2
	Yi8 // $x, x fits in int8
	Yu8 // $x, x fits in uint8
	Yu7 // $x, x in 0..127 (fits in both int8 and uint8)
	Ys32
	Yi32
	Yi64
	Yiauto
	Yal
	Ycl
	Yax
	Ycx
	Yrb
	Yrl
	Yrl32 // Yrl on 32-bit system
	Yrf
	Yf0
	Yrx
	Ymb
	Yml
	Ym
	Ybr
	Ycs
	Yss
	Yds
	Yes
	Yfs
	Ygs
	Ygdtr
	Yidtr
	Yldtr
	Ymsw
	Ytask
	Ycr0
	Ycr1
	Ycr2
	Ycr3
	Ycr4
	Ycr5
	Ycr6
	Ycr7
	Ycr8
	Ydr0
	Ydr1
	Ydr2
	Ydr3
	Ydr4
	Ydr5
	Ydr6
	Ydr7
	Ytr0
	Ytr1
	Ytr2
	Ytr3
	Ytr4
	Ytr5
	Ytr6
	Ytr7
	Ymr
	Ymm
	Yxr0          // X0 only. "<XMM0>" notation in Intel manual.
	YxrEvexMulti4 // [ X<n> - X<n+3> ]; multisource YxrEvex
	Yxr           // X0..X15
	YxrEvex       // X0..X31
	Yxm
	YxmEvex       // YxrEvex+Ym
	Yxvm          // VSIB vector array; vm32x/vm64x
	YxvmEvex      // Yxvm which permits High-16 X register as index.
	YyrEvexMulti4 // [ Y<n> - Y<n+3> ]; multisource YyrEvex
	Yyr           // Y0..Y15
	YyrEvex       // Y0..Y31
	Yym
	YymEvex   // YyrEvex+Ym
	Yyvm      // VSIB vector array; vm32y/vm64y
	YyvmEvex  // Yyvm which permits High-16 Y register as index.
	YzrMulti4 // [ Z<n> - Z<n+3> ]; multisource Yzr
	Yzr       // Z0..Z31
	Yzm       // Yzr+Ym
	Yzvm      // VSIB vector array; vm32z/vm64z
	Yk0       // K0
	Yknot0    // K1..K7; write mask
	Yk        // K0..K7; used for KOP
	Ykm       // Yk+Ym; used for KOP
	Ytls
	Ytextsize
	Yindir
	Ymax
)
196
// Encoding forms ("Ztypes"). The first field of every ytab line is one of
// these values; the switch on yt.zcase in doasm implements each form, i.e.
// how the line's opBytes and the instruction's operands are laid down.
//
// The values are assigned by iota, so the declaration order is significant.
// NOTE(review): Zevex_first and Zevex_last look like range markers that
// bracket the EVEX forms rather than forms of their own, and Zmax like the
// total count — confirm against doasm.
const (
	Zxxx = iota
	Zlit
	Zlitm_r
	Zlitr_m
	Zlit_m_r
	Z_rp
	Zbr
	Zcall
	Zcallcon
	Zcallduff
	Zcallind
	Zcallindreg
	Zib_
	Zib_rp
	Zibo_m
	Zibo_m_xm
	Zil_
	Zil_rp
	Ziq_rp
	Zilo_m
	Zjmp
	Zjmpcon
	Zloop
	Zo_iw
	Zm_o
	Zm_r
	Z_m_r
	Zm2_r
	Zm_r_xm
	Zm_r_i_xm
	Zm_r_xm_nr
	Zr_m_xm_nr
	Zibm_r // mmx1,mmx2/mem64,imm8
	Zibr_m
	Zmb_r
	Zaut_r
	Zo_m
	Zo_m64
	Zpseudo
	Zr_m
	Zr_m_xm
	Zrp_
	Z_ib
	Z_il
	Zm_ibo
	Zm_ilo
	Zib_rr
	Zil_rr
	Zbyte

	Zvex_rm_v_r
	Zvex_rm_v_ro
	Zvex_r_v_rm
	Zvex_i_rm_vo
	Zvex_v_rm_r
	Zvex_i_rm_r
	Zvex_i_r_v
	Zvex_i_rm_v_r
	Zvex
	Zvex_rm_r_vo
	Zvex_i_r_rm
	Zvex_hr_rm_v_r

	Zevex_first
	Zevex_i_r_k_rm
	Zevex_i_r_rm
	Zevex_i_rm_k_r
	Zevex_i_rm_k_vo
	Zevex_i_rm_r
	Zevex_i_rm_v_k_r
	Zevex_i_rm_v_r
	Zevex_i_rm_vo
	Zevex_k_rmo
	Zevex_r_k_rm
	Zevex_r_v_k_rm
	Zevex_r_v_rm
	Zevex_rm_k_r
	Zevex_rm_v_k_r
	Zevex_rm_v_r
	Zevex_last

	Zmax
)
281
// P* are the values stored in the prefix field of an optab row; they tell
// the assembler which prefix/escape bytes to emit for that instruction.
// Some of them (for example Pe, Pf2, Pf3, Pm) also appear inside opBytes
// for rows whose operand patterns need different prefixes.
//
// Rx* are the individual extension bits described next to each constant.
const (
	Px   = 0
	Px1  = 1    // symbolic; exact value doesn't matter
	P32  = 0x32 // 32-bit only
	Pe   = 0x66 // operand escape
	Pm   = 0x0f // 2byte opcode escape
	Pq   = 0xff // both escapes: 66 0f
	Pb   = 0xfe // byte operands
	Pf2  = 0xf2 // xmm escape 1: f2 0f
	Pf3  = 0xf3 // xmm escape 2: f3 0f
	Pef3 = 0xf5 // xmm escape 2 with 16-bit prefix: 66 f3 0f
	Pq3  = 0x67 // xmm escape 3: 66 48 0f
	Pq4  = 0x68 // xmm escape 4: 66 0F 38
	Pq4w = 0x69 // Pq4 with Rex.w 66 0F 38
	Pq5  = 0x6a // xmm escape 5: F3 0F 38
	Pq5w = 0x6b // Pq5 with Rex.w F3 0F 38
	Pfw  = 0xf4 // Pf3 with Rex.w: f3 48 0f
	Pw   = 0x48 // Rex.w
	Pw8  = 0x90 // symbolic; exact value doesn't matter
	Py   = 0x80 // defaults to 64-bit mode
	Py1  = 0x81 // symbolic; exact value doesn't matter
	Py3  = 0x83 // symbolic; exact value doesn't matter
	Pavx = 0x84 // symbolic; exact value doesn't matter

	RxrEvex = 1 << 4 // AVX512 extension to REX.R/VEX.R
	Rxw     = 1 << 3 // =1, 64-bit operand size
	Rxr     = 1 << 2 // extend modrm reg
	Rxx     = 1 << 1 // extend sib index
	Rxb     = 1 << 0 // extend modrm r/m, sib base, or opcode reg
)
312
// Encoding for VEX prefix in tables.
// The P, L, and W fields are chosen to match
// their eventual locations in the VEX prefix bytes.
//
// The fields occupy disjoint bit ranges (P: bits 0-1, L: bit 2, M: bits 3-4,
// avxEscape: bit 6, W: bit 7), so one value from each group can be OR-ed
// into a single byte.
const (
	// Using spare bit to make leading [E]VEX encoding byte different from
	// 0x0f even if all other VEX fields are 0.
	avxEscape = 1 << 6

	// P field - 2 bits
	vex66 = 1 << 0
	vexF3 = 2 << 0
	vexF2 = 3 << 0
	// L field - 1 bit
	vexLZ  = 0 << 2
	vexLIG = 0 << 2
	vex128 = 0 << 2
	vex256 = 1 << 2
	// W field - 1 bit
	vexWIG = 0 << 7
	vexW0  = 0 << 7
	vexW1  = 1 << 7
	// M field - 5 bits, but mostly reserved; we can store up to 3
	vex0F   = 1 << 3
	vex0F38 = 2 << 3
	vex0F3A = 3 << 3
)
344
345var ycover [Ymax * Ymax]uint8
346
347var reg [MAXREG]int
348
349var regrex [MAXREG + 1]int
350
351var ynone = []ytab{
352	{Zlit, 1, argList{}},
353}
354
355var ytext = []ytab{
356	{Zpseudo, 0, argList{Ymb, Ytextsize}},
357	{Zpseudo, 1, argList{Ymb, Yi32, Ytextsize}},
358}
359
360var ynop = []ytab{
361	{Zpseudo, 0, argList{}},
362	{Zpseudo, 0, argList{Yiauto}},
363	{Zpseudo, 0, argList{Yml}},
364	{Zpseudo, 0, argList{Yrf}},
365	{Zpseudo, 0, argList{Yxr}},
366	{Zpseudo, 0, argList{Yiauto}},
367	{Zpseudo, 0, argList{Yml}},
368	{Zpseudo, 0, argList{Yrf}},
369	{Zpseudo, 1, argList{Yxr}},
370}
371
372var yfuncdata = []ytab{
373	{Zpseudo, 0, argList{Yi32, Ym}},
374}
375
376var ypcdata = []ytab{
377	{Zpseudo, 0, argList{Yi32, Yi32}},
378}
379
380var yxorb = []ytab{
381	{Zib_, 1, argList{Yi32, Yal}},
382	{Zibo_m, 2, argList{Yi32, Ymb}},
383	{Zr_m, 1, argList{Yrb, Ymb}},
384	{Zm_r, 1, argList{Ymb, Yrb}},
385}
386
387var yaddl = []ytab{
388	{Zibo_m, 2, argList{Yi8, Yml}},
389	{Zil_, 1, argList{Yi32, Yax}},
390	{Zilo_m, 2, argList{Yi32, Yml}},
391	{Zr_m, 1, argList{Yrl, Yml}},
392	{Zm_r, 1, argList{Yml, Yrl}},
393}
394
395var yincl = []ytab{
396	{Z_rp, 1, argList{Yrl}},
397	{Zo_m, 2, argList{Yml}},
398}
399
400var yincq = []ytab{
401	{Zo_m, 2, argList{Yml}},
402}
403
404var ycmpb = []ytab{
405	{Z_ib, 1, argList{Yal, Yi32}},
406	{Zm_ibo, 2, argList{Ymb, Yi32}},
407	{Zm_r, 1, argList{Ymb, Yrb}},
408	{Zr_m, 1, argList{Yrb, Ymb}},
409}
410
411var ycmpl = []ytab{
412	{Zm_ibo, 2, argList{Yml, Yi8}},
413	{Z_il, 1, argList{Yax, Yi32}},
414	{Zm_ilo, 2, argList{Yml, Yi32}},
415	{Zm_r, 1, argList{Yml, Yrl}},
416	{Zr_m, 1, argList{Yrl, Yml}},
417}
418
419var yshb = []ytab{
420	{Zo_m, 2, argList{Yi1, Ymb}},
421	{Zibo_m, 2, argList{Yu8, Ymb}},
422	{Zo_m, 2, argList{Ycx, Ymb}},
423}
424
425var yshl = []ytab{
426	{Zo_m, 2, argList{Yi1, Yml}},
427	{Zibo_m, 2, argList{Yu8, Yml}},
428	{Zo_m, 2, argList{Ycl, Yml}},
429	{Zo_m, 2, argList{Ycx, Yml}},
430}
431
432var ytestl = []ytab{
433	{Zil_, 1, argList{Yi32, Yax}},
434	{Zilo_m, 2, argList{Yi32, Yml}},
435	{Zr_m, 1, argList{Yrl, Yml}},
436	{Zm_r, 1, argList{Yml, Yrl}},
437}
438
439var ymovb = []ytab{
440	{Zr_m, 1, argList{Yrb, Ymb}},
441	{Zm_r, 1, argList{Ymb, Yrb}},
442	{Zib_rp, 1, argList{Yi32, Yrb}},
443	{Zibo_m, 2, argList{Yi32, Ymb}},
444}
445
446var ybtl = []ytab{
447	{Zibo_m, 2, argList{Yi8, Yml}},
448	{Zr_m, 1, argList{Yrl, Yml}},
449}
450
451var ymovw = []ytab{
452	{Zr_m, 1, argList{Yrl, Yml}},
453	{Zm_r, 1, argList{Yml, Yrl}},
454	{Zil_rp, 1, argList{Yi32, Yrl}},
455	{Zilo_m, 2, argList{Yi32, Yml}},
456	{Zaut_r, 2, argList{Yiauto, Yrl}},
457}
458
459var ymovl = []ytab{
460	{Zr_m, 1, argList{Yrl, Yml}},
461	{Zm_r, 1, argList{Yml, Yrl}},
462	{Zil_rp, 1, argList{Yi32, Yrl}},
463	{Zilo_m, 2, argList{Yi32, Yml}},
464	{Zm_r_xm, 1, argList{Yml, Ymr}}, // MMX MOVD
465	{Zr_m_xm, 1, argList{Ymr, Yml}}, // MMX MOVD
466	{Zm_r_xm, 2, argList{Yml, Yxr}}, // XMM MOVD (32 bit)
467	{Zr_m_xm, 2, argList{Yxr, Yml}}, // XMM MOVD (32 bit)
468	{Zaut_r, 2, argList{Yiauto, Yrl}},
469}
470
471var yret = []ytab{
472	{Zo_iw, 1, argList{}},
473	{Zo_iw, 1, argList{Yi32}},
474}
475
476var ymovq = []ytab{
477	// valid in 32-bit mode
478	{Zm_r_xm_nr, 1, argList{Ym, Ymr}},  // 0x6f MMX MOVQ (shorter encoding)
479	{Zr_m_xm_nr, 1, argList{Ymr, Ym}},  // 0x7f MMX MOVQ
480	{Zm_r_xm_nr, 2, argList{Yxr, Ymr}}, // Pf2, 0xd6 MOVDQ2Q
481	{Zm_r_xm_nr, 2, argList{Yxm, Yxr}}, // Pf3, 0x7e MOVQ xmm1/m64 -> xmm2
482	{Zr_m_xm_nr, 2, argList{Yxr, Yxm}}, // Pe, 0xd6 MOVQ xmm1 -> xmm2/m64
483
484	// valid only in 64-bit mode, usually with 64-bit prefix
485	{Zr_m, 1, argList{Yrl, Yml}},      // 0x89
486	{Zm_r, 1, argList{Yml, Yrl}},      // 0x8b
487	{Zilo_m, 2, argList{Ys32, Yrl}},   // 32 bit signed 0xc7,(0)
488	{Ziq_rp, 1, argList{Yi64, Yrl}},   // 0xb8 -- 32/64 bit immediate
489	{Zilo_m, 2, argList{Yi32, Yml}},   // 0xc7,(0)
490	{Zm_r_xm, 1, argList{Ymm, Ymr}},   // 0x6e MMX MOVD
491	{Zr_m_xm, 1, argList{Ymr, Ymm}},   // 0x7e MMX MOVD
492	{Zm_r_xm, 2, argList{Yml, Yxr}},   // Pe, 0x6e MOVD xmm load
493	{Zr_m_xm, 2, argList{Yxr, Yml}},   // Pe, 0x7e MOVD xmm store
494	{Zaut_r, 1, argList{Yiauto, Yrl}}, // 0 built-in LEAQ
495}
496
497var ymovbe = []ytab{
498	{Zlitm_r, 3, argList{Ym, Yrl}},
499	{Zlitr_m, 3, argList{Yrl, Ym}},
500}
501
502var ym_rl = []ytab{
503	{Zm_r, 1, argList{Ym, Yrl}},
504}
505
506var yrl_m = []ytab{
507	{Zr_m, 1, argList{Yrl, Ym}},
508}
509
510var ymb_rl = []ytab{
511	{Zmb_r, 1, argList{Ymb, Yrl}},
512}
513
514var yml_rl = []ytab{
515	{Zm_r, 1, argList{Yml, Yrl}},
516}
517
518var yrl_ml = []ytab{
519	{Zr_m, 1, argList{Yrl, Yml}},
520}
521
522var yml_mb = []ytab{
523	{Zr_m, 1, argList{Yrb, Ymb}},
524	{Zm_r, 1, argList{Ymb, Yrb}},
525}
526
527var yrb_mb = []ytab{
528	{Zr_m, 1, argList{Yrb, Ymb}},
529}
530
531var yxchg = []ytab{
532	{Z_rp, 1, argList{Yax, Yrl}},
533	{Zrp_, 1, argList{Yrl, Yax}},
534	{Zr_m, 1, argList{Yrl, Yml}},
535	{Zm_r, 1, argList{Yml, Yrl}},
536}
537
538var ydivl = []ytab{
539	{Zm_o, 2, argList{Yml}},
540}
541
542var ydivb = []ytab{
543	{Zm_o, 2, argList{Ymb}},
544}
545
546var yimul = []ytab{
547	{Zm_o, 2, argList{Yml}},
548	{Zib_rr, 1, argList{Yi8, Yrl}},
549	{Zil_rr, 1, argList{Yi32, Yrl}},
550	{Zm_r, 2, argList{Yml, Yrl}},
551}
552
553var yimul3 = []ytab{
554	{Zibm_r, 2, argList{Yi8, Yml, Yrl}},
555	{Zibm_r, 2, argList{Yi32, Yml, Yrl}},
556}
557
558var ybyte = []ytab{
559	{Zbyte, 1, argList{Yi64}},
560}
561
562var yin = []ytab{
563	{Zib_, 1, argList{Yi32}},
564	{Zlit, 1, argList{}},
565}
566
567var yint = []ytab{
568	{Zib_, 1, argList{Yi32}},
569}
570
571var ypushl = []ytab{
572	{Zrp_, 1, argList{Yrl}},
573	{Zm_o, 2, argList{Ym}},
574	{Zib_, 1, argList{Yi8}},
575	{Zil_, 1, argList{Yi32}},
576}
577
578var ypopl = []ytab{
579	{Z_rp, 1, argList{Yrl}},
580	{Zo_m, 2, argList{Ym}},
581}
582
583var ywrfsbase = []ytab{
584	{Zm_o, 2, argList{Yrl}},
585}
586
587var yrdrand = []ytab{
588	{Zo_m, 2, argList{Yrl}},
589}
590
591var yclflush = []ytab{
592	{Zo_m, 2, argList{Ym}},
593}
594
595var ybswap = []ytab{
596	{Z_rp, 2, argList{Yrl}},
597}
598
599var yscond = []ytab{
600	{Zo_m, 2, argList{Ymb}},
601}
602
603var yjcond = []ytab{
604	{Zbr, 0, argList{Ybr}},
605	{Zbr, 0, argList{Yi0, Ybr}},
606	{Zbr, 1, argList{Yi1, Ybr}},
607}
608
609var yloop = []ytab{
610	{Zloop, 1, argList{Ybr}},
611}
612
613var ycall = []ytab{
614	{Zcallindreg, 0, argList{Yml}},
615	{Zcallindreg, 2, argList{Yrx, Yrx}},
616	{Zcallind, 2, argList{Yindir}},
617	{Zcall, 0, argList{Ybr}},
618	{Zcallcon, 1, argList{Yi32}},
619}
620
621var yduff = []ytab{
622	{Zcallduff, 1, argList{Yi32}},
623}
624
625var yjmp = []ytab{
626	{Zo_m64, 2, argList{Yml}},
627	{Zjmp, 0, argList{Ybr}},
628	{Zjmpcon, 1, argList{Yi32}},
629}
630
631var yfmvd = []ytab{
632	{Zm_o, 2, argList{Ym, Yf0}},
633	{Zo_m, 2, argList{Yf0, Ym}},
634	{Zm_o, 2, argList{Yrf, Yf0}},
635	{Zo_m, 2, argList{Yf0, Yrf}},
636}
637
638var yfmvdp = []ytab{
639	{Zo_m, 2, argList{Yf0, Ym}},
640	{Zo_m, 2, argList{Yf0, Yrf}},
641}
642
643var yfmvf = []ytab{
644	{Zm_o, 2, argList{Ym, Yf0}},
645	{Zo_m, 2, argList{Yf0, Ym}},
646}
647
648var yfmvx = []ytab{
649	{Zm_o, 2, argList{Ym, Yf0}},
650}
651
652var yfmvp = []ytab{
653	{Zo_m, 2, argList{Yf0, Ym}},
654}
655
656var yfcmv = []ytab{
657	{Zm_o, 2, argList{Yrf, Yf0}},
658}
659
660var yfadd = []ytab{
661	{Zm_o, 2, argList{Ym, Yf0}},
662	{Zm_o, 2, argList{Yrf, Yf0}},
663	{Zo_m, 2, argList{Yf0, Yrf}},
664}
665
666var yfxch = []ytab{
667	{Zo_m, 2, argList{Yf0, Yrf}},
668	{Zm_o, 2, argList{Yrf, Yf0}},
669}
670
671var ycompp = []ytab{
672	{Zo_m, 2, argList{Yf0, Yrf}}, // botch is really f0,f1
673}
674
675var ystsw = []ytab{
676	{Zo_m, 2, argList{Ym}},
677	{Zlit, 1, argList{Yax}},
678}
679
680var ysvrs_mo = []ytab{
681	{Zm_o, 2, argList{Ym}},
682}
683
684// unaryDst version of "ysvrs_mo".
685var ysvrs_om = []ytab{
686	{Zo_m, 2, argList{Ym}},
687}
688
689var ymm = []ytab{
690	{Zm_r_xm, 1, argList{Ymm, Ymr}},
691	{Zm_r_xm, 2, argList{Yxm, Yxr}},
692}
693
694var yxm = []ytab{
695	{Zm_r_xm, 1, argList{Yxm, Yxr}},
696}
697
698var yxm_q4 = []ytab{
699	{Zm_r, 1, argList{Yxm, Yxr}},
700}
701
702var yxcvm1 = []ytab{
703	{Zm_r_xm, 2, argList{Yxm, Yxr}},
704	{Zm_r_xm, 2, argList{Yxm, Ymr}},
705}
706
707var yxcvm2 = []ytab{
708	{Zm_r_xm, 2, argList{Yxm, Yxr}},
709	{Zm_r_xm, 2, argList{Ymm, Yxr}},
710}
711
712var yxr = []ytab{
713	{Zm_r_xm, 1, argList{Yxr, Yxr}},
714}
715
716var yxr_ml = []ytab{
717	{Zr_m_xm, 1, argList{Yxr, Yml}},
718}
719
720var ymr = []ytab{
721	{Zm_r, 1, argList{Ymr, Ymr}},
722}
723
724var ymr_ml = []ytab{
725	{Zr_m_xm, 1, argList{Ymr, Yml}},
726}
727
728var yxcmpi = []ytab{
729	{Zm_r_i_xm, 2, argList{Yxm, Yxr, Yi8}},
730}
731
732var yxmov = []ytab{
733	{Zm_r_xm, 1, argList{Yxm, Yxr}},
734	{Zr_m_xm, 1, argList{Yxr, Yxm}},
735}
736
737var yxcvfl = []ytab{
738	{Zm_r_xm, 1, argList{Yxm, Yrl}},
739}
740
741var yxcvlf = []ytab{
742	{Zm_r_xm, 1, argList{Yml, Yxr}},
743}
744
745var yxcvfq = []ytab{
746	{Zm_r_xm, 2, argList{Yxm, Yrl}},
747}
748
749var yxcvqf = []ytab{
750	{Zm_r_xm, 2, argList{Yml, Yxr}},
751}
752
753var yps = []ytab{
754	{Zm_r_xm, 1, argList{Ymm, Ymr}},
755	{Zibo_m_xm, 2, argList{Yi8, Ymr}},
756	{Zm_r_xm, 2, argList{Yxm, Yxr}},
757	{Zibo_m_xm, 3, argList{Yi8, Yxr}},
758}
759
760var yxrrl = []ytab{
761	{Zm_r, 1, argList{Yxr, Yrl}},
762}
763
764var ymrxr = []ytab{
765	{Zm_r, 1, argList{Ymr, Yxr}},
766	{Zm_r_xm, 1, argList{Yxm, Yxr}},
767}
768
769var ymshuf = []ytab{
770	{Zibm_r, 2, argList{Yi8, Ymm, Ymr}},
771}
772
773var ymshufb = []ytab{
774	{Zm2_r, 2, argList{Yxm, Yxr}},
775}
776
777// It should never have more than 1 entry,
778// because some optab entries have opcode sequences that
779// are longer than 2 bytes (zoffset=2 here),
780// ROUNDPD and ROUNDPS and recently added BLENDPD,
781// to name a few.
782var yxshuf = []ytab{
783	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
784}
785
786var yextrw = []ytab{
787	{Zibm_r, 2, argList{Yu8, Yxr, Yrl}},
788	{Zibr_m, 2, argList{Yu8, Yxr, Yml}},
789}
790
791var yextr = []ytab{
792	{Zibr_m, 3, argList{Yu8, Yxr, Ymm}},
793}
794
795var yinsrw = []ytab{
796	{Zibm_r, 2, argList{Yu8, Yml, Yxr}},
797}
798
799var yinsr = []ytab{
800	{Zibm_r, 3, argList{Yu8, Ymm, Yxr}},
801}
802
803var ypsdq = []ytab{
804	{Zibo_m, 2, argList{Yi8, Yxr}},
805}
806
807var ymskb = []ytab{
808	{Zm_r_xm, 2, argList{Yxr, Yrl}},
809	{Zm_r_xm, 1, argList{Ymr, Yrl}},
810}
811
812var ycrc32l = []ytab{
813	{Zlitm_r, 0, argList{Yml, Yrl}},
814}
815
816var ycrc32b = []ytab{
817	{Zlitm_r, 0, argList{Ymb, Yrl}},
818}
819
820var yprefetch = []ytab{
821	{Zm_o, 2, argList{Ym}},
822}
823
824var yaes = []ytab{
825	{Zlitm_r, 2, argList{Yxm, Yxr}},
826}
827
828var yxbegin = []ytab{
829	{Zjmp, 1, argList{Ybr}},
830}
831
832var yxabort = []ytab{
833	{Zib_, 1, argList{Yu8}},
834}
835
836var ylddqu = []ytab{
837	{Zm_r, 1, argList{Ym, Yxr}},
838}
839
840var ypalignr = []ytab{
841	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
842}
843
844var ysha256rnds2 = []ytab{
845	{Zlit_m_r, 0, argList{Yxr0, Yxm, Yxr}},
846}
847
848var yblendvpd = []ytab{
849	{Z_m_r, 1, argList{Yxr0, Yxm, Yxr}},
850}
851
852var ymmxmm0f38 = []ytab{
853	{Zlitm_r, 3, argList{Ymm, Ymr}},
854	{Zlitm_r, 5, argList{Yxm, Yxr}},
855}
856
857var yextractps = []ytab{
858	{Zibr_m, 2, argList{Yu2, Yxr, Yml}},
859}
860
861var ysha1rnds4 = []ytab{
862	{Zibm_r, 2, argList{Yu2, Yxm, Yxr}},
863}
864
// You are doasm, holding in your hand a *obj.Prog with p.As set to, say,
// ACRC32, and p.From and p.To as operands (obj.Addr).  The linker scans optab
// to find the entry with the given p.As and then looks through the ytable for
// that instruction (the second field in the optab struct) for a line whose
// argList matches the Ytypes of the p.From and p.To operands.  The
// function oclass computes the specific Ytype of an operand and then the set
// of more general Ytypes that it satisfies is implied by the ycover table, set
// up in instinit.  For example, oclass distinguishes the constants 0 and 1
// from the more general 8-bit constants, but instinit says
//
//	ycover[Yi0*Ymax+Ys32] = 1
//	ycover[Yi1*Ymax+Ys32] = 1
//	ycover[Yi8*Ymax+Ys32] = 1
//
// which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32)
// if that's what an instruction can handle.
//
// In parallel with the scan through the ytable for the appropriate line, there
// is a z pointer that starts out pointing at the strange magic byte list in
// the Optab struct.  With each step past a non-matching ytable line, z
// advances by the second entry in the line (its byte count).  When a matching
// line is found, that z pointer has the extra data to use in laying down the
// instruction bytes.  The actual bytes laid down are a function of the first
// entry in the line (that is, the Ztype) and the z bytes.
//
// For example, let's look at AADDL.  The optab line says:
//
//	{AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
//
// and yaddl says
//
//	var yaddl = []ytab{
//	        {Zibo_m, 2, argList{Yi8, Yml}},
//	        {Zil_, 1, argList{Yi32, Yax}},
//	        {Zilo_m, 2, argList{Yi32, Yml}},
//	        {Zr_m, 1, argList{Yrl, Yml}},
//	        {Zm_r, 1, argList{Yml, Yrl}},
//	}
//
// so there are 5 possible types of ADDL instruction that can be laid down, and
// possible states used to lay them down (Ztype and z pointer, assuming z
// points at opBytes{0x83, 00, 0x05,0x81, 00, 0x01, 0x03}) are:
//
//	Yi8, Yml -> Zibo_m, z (0x83, 00)
//	Yi32, Yax -> Zil_, z+2 (0x05)
//	Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00)
//	Yrl, Yml -> Zr_m, z+2+1+2 (0x01)
//	Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03)
//
// The Pconstant in the optab line controls the prefix bytes to emit.  That's
// relatively straightforward as this program goes.
//
// The switch on yt.zcase in doasm implements the various Z cases.  Zibo_m, for
// example, is an opcode byte (z[0]) then an asmando (which is some kind of
// encoded addressing mode for the Yml arg), and then a single immediate byte.
// Zilo_m is the same but a long (32-bit) immediate.
921var optab =
// as, ytab, prefix, op
923[...]Optab{
924	{obj.AXXX, nil, 0, opBytes{}},
925	{AAAA, ynone, P32, opBytes{0x37}},
926	{AAAD, ynone, P32, opBytes{0xd5, 0x0a}},
927	{AAAM, ynone, P32, opBytes{0xd4, 0x0a}},
928	{AAAS, ynone, P32, opBytes{0x3f}},
929	{AADCB, yxorb, Pb, opBytes{0x14, 0x80, 02, 0x10, 0x12}},
930	{AADCL, yaddl, Px, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
931	{AADCQ, yaddl, Pw, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
932	{AADCW, yaddl, Pe, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
933	{AADCXL, yml_rl, Pq4, opBytes{0xf6}},
934	{AADCXQ, yml_rl, Pq4w, opBytes{0xf6}},
935	{AADDB, yxorb, Pb, opBytes{0x04, 0x80, 00, 0x00, 0x02}},
936	{AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
937	{AADDPD, yxm, Pq, opBytes{0x58}},
938	{AADDPS, yxm, Pm, opBytes{0x58}},
939	{AADDQ, yaddl, Pw, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
940	{AADDSD, yxm, Pf2, opBytes{0x58}},
941	{AADDSS, yxm, Pf3, opBytes{0x58}},
942	{AADDSUBPD, yxm, Pq, opBytes{0xd0}},
943	{AADDSUBPS, yxm, Pf2, opBytes{0xd0}},
944	{AADDW, yaddl, Pe, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
945	{AADOXL, yml_rl, Pq5, opBytes{0xf6}},
946	{AADOXQ, yml_rl, Pq5w, opBytes{0xf6}},
947	{AADJSP, nil, 0, opBytes{}},
948	{AANDB, yxorb, Pb, opBytes{0x24, 0x80, 04, 0x20, 0x22}},
949	{AANDL, yaddl, Px, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
950	{AANDNPD, yxm, Pq, opBytes{0x55}},
951	{AANDNPS, yxm, Pm, opBytes{0x55}},
952	{AANDPD, yxm, Pq, opBytes{0x54}},
953	{AANDPS, yxm, Pm, opBytes{0x54}},
954	{AANDQ, yaddl, Pw, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
955	{AANDW, yaddl, Pe, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
956	{AARPL, yrl_ml, P32, opBytes{0x63}},
957	{ABOUNDL, yrl_m, P32, opBytes{0x62}},
958	{ABOUNDW, yrl_m, Pe, opBytes{0x62}},
959	{ABSFL, yml_rl, Pm, opBytes{0xbc}},
960	{ABSFQ, yml_rl, Pw, opBytes{0x0f, 0xbc}},
961	{ABSFW, yml_rl, Pq, opBytes{0xbc}},
962	{ABSRL, yml_rl, Pm, opBytes{0xbd}},
963	{ABSRQ, yml_rl, Pw, opBytes{0x0f, 0xbd}},
964	{ABSRW, yml_rl, Pq, opBytes{0xbd}},
965	{ABSWAPL, ybswap, Px, opBytes{0x0f, 0xc8}},
966	{ABSWAPQ, ybswap, Pw, opBytes{0x0f, 0xc8}},
967	{ABTCL, ybtl, Pm, opBytes{0xba, 07, 0xbb}},
968	{ABTCQ, ybtl, Pw, opBytes{0x0f, 0xba, 07, 0x0f, 0xbb}},
969	{ABTCW, ybtl, Pq, opBytes{0xba, 07, 0xbb}},
970	{ABTL, ybtl, Pm, opBytes{0xba, 04, 0xa3}},
971	{ABTQ, ybtl, Pw, opBytes{0x0f, 0xba, 04, 0x0f, 0xa3}},
972	{ABTRL, ybtl, Pm, opBytes{0xba, 06, 0xb3}},
973	{ABTRQ, ybtl, Pw, opBytes{0x0f, 0xba, 06, 0x0f, 0xb3}},
974	{ABTRW, ybtl, Pq, opBytes{0xba, 06, 0xb3}},
975	{ABTSL, ybtl, Pm, opBytes{0xba, 05, 0xab}},
976	{ABTSQ, ybtl, Pw, opBytes{0x0f, 0xba, 05, 0x0f, 0xab}},
977	{ABTSW, ybtl, Pq, opBytes{0xba, 05, 0xab}},
978	{ABTW, ybtl, Pq, opBytes{0xba, 04, 0xa3}},
979	{ABYTE, ybyte, Px, opBytes{1}},
980	{obj.ACALL, ycall, Px, opBytes{0xff, 02, 0xff, 0x15, 0xe8}},
981	{ACBW, ynone, Pe, opBytes{0x98}},
982	{ACDQ, ynone, Px, opBytes{0x99}},
983	{ACDQE, ynone, Pw, opBytes{0x98}},
984	{ACLAC, ynone, Pm, opBytes{01, 0xca}},
985	{ACLC, ynone, Px, opBytes{0xf8}},
986	{ACLD, ynone, Px, opBytes{0xfc}},
987	{ACLDEMOTE, yclflush, Pm, opBytes{0x1c, 00}},
988	{ACLFLUSH, yclflush, Pm, opBytes{0xae, 07}},
989	{ACLFLUSHOPT, yclflush, Pq, opBytes{0xae, 07}},
990	{ACLI, ynone, Px, opBytes{0xfa}},
991	{ACLTS, ynone, Pm, opBytes{0x06}},
992	{ACLWB, yclflush, Pq, opBytes{0xae, 06}},
993	{ACMC, ynone, Px, opBytes{0xf5}},
994	{ACMOVLCC, yml_rl, Pm, opBytes{0x43}},
995	{ACMOVLCS, yml_rl, Pm, opBytes{0x42}},
996	{ACMOVLEQ, yml_rl, Pm, opBytes{0x44}},
997	{ACMOVLGE, yml_rl, Pm, opBytes{0x4d}},
998	{ACMOVLGT, yml_rl, Pm, opBytes{0x4f}},
999	{ACMOVLHI, yml_rl, Pm, opBytes{0x47}},
1000	{ACMOVLLE, yml_rl, Pm, opBytes{0x4e}},
1001	{ACMOVLLS, yml_rl, Pm, opBytes{0x46}},
1002	{ACMOVLLT, yml_rl, Pm, opBytes{0x4c}},
1003	{ACMOVLMI, yml_rl, Pm, opBytes{0x48}},
1004	{ACMOVLNE, yml_rl, Pm, opBytes{0x45}},
1005	{ACMOVLOC, yml_rl, Pm, opBytes{0x41}},
1006	{ACMOVLOS, yml_rl, Pm, opBytes{0x40}},
1007	{ACMOVLPC, yml_rl, Pm, opBytes{0x4b}},
1008	{ACMOVLPL, yml_rl, Pm, opBytes{0x49}},
1009	{ACMOVLPS, yml_rl, Pm, opBytes{0x4a}},
1010	{ACMOVQCC, yml_rl, Pw, opBytes{0x0f, 0x43}},
1011	{ACMOVQCS, yml_rl, Pw, opBytes{0x0f, 0x42}},
1012	{ACMOVQEQ, yml_rl, Pw, opBytes{0x0f, 0x44}},
1013	{ACMOVQGE, yml_rl, Pw, opBytes{0x0f, 0x4d}},
1014	{ACMOVQGT, yml_rl, Pw, opBytes{0x0f, 0x4f}},
1015	{ACMOVQHI, yml_rl, Pw, opBytes{0x0f, 0x47}},
1016	{ACMOVQLE, yml_rl, Pw, opBytes{0x0f, 0x4e}},
1017	{ACMOVQLS, yml_rl, Pw, opBytes{0x0f, 0x46}},
1018	{ACMOVQLT, yml_rl, Pw, opBytes{0x0f, 0x4c}},
1019	{ACMOVQMI, yml_rl, Pw, opBytes{0x0f, 0x48}},
1020	{ACMOVQNE, yml_rl, Pw, opBytes{0x0f, 0x45}},
1021	{ACMOVQOC, yml_rl, Pw, opBytes{0x0f, 0x41}},
1022	{ACMOVQOS, yml_rl, Pw, opBytes{0x0f, 0x40}},
1023	{ACMOVQPC, yml_rl, Pw, opBytes{0x0f, 0x4b}},
1024	{ACMOVQPL, yml_rl, Pw, opBytes{0x0f, 0x49}},
1025	{ACMOVQPS, yml_rl, Pw, opBytes{0x0f, 0x4a}},
1026	{ACMOVWCC, yml_rl, Pq, opBytes{0x43}},
1027	{ACMOVWCS, yml_rl, Pq, opBytes{0x42}},
1028	{ACMOVWEQ, yml_rl, Pq, opBytes{0x44}},
1029	{ACMOVWGE, yml_rl, Pq, opBytes{0x4d}},
1030	{ACMOVWGT, yml_rl, Pq, opBytes{0x4f}},
1031	{ACMOVWHI, yml_rl, Pq, opBytes{0x47}},
1032	{ACMOVWLE, yml_rl, Pq, opBytes{0x4e}},
1033	{ACMOVWLS, yml_rl, Pq, opBytes{0x46}},
1034	{ACMOVWLT, yml_rl, Pq, opBytes{0x4c}},
1035	{ACMOVWMI, yml_rl, Pq, opBytes{0x48}},
1036	{ACMOVWNE, yml_rl, Pq, opBytes{0x45}},
1037	{ACMOVWOC, yml_rl, Pq, opBytes{0x41}},
1038	{ACMOVWOS, yml_rl, Pq, opBytes{0x40}},
1039	{ACMOVWPC, yml_rl, Pq, opBytes{0x4b}},
1040	{ACMOVWPL, yml_rl, Pq, opBytes{0x49}},
1041	{ACMOVWPS, yml_rl, Pq, opBytes{0x4a}},
1042	{ACMPB, ycmpb, Pb, opBytes{0x3c, 0x80, 07, 0x38, 0x3a}},
1043	{ACMPL, ycmpl, Px, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1044	{ACMPPD, yxcmpi, Px, opBytes{Pe, 0xc2}},
1045	{ACMPPS, yxcmpi, Pm, opBytes{0xc2, 0}},
1046	{ACMPQ, ycmpl, Pw, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1047	{ACMPSB, ynone, Pb, opBytes{0xa6}},
1048	{ACMPSD, yxcmpi, Px, opBytes{Pf2, 0xc2}},
1049	{ACMPSL, ynone, Px, opBytes{0xa7}},
1050	{ACMPSQ, ynone, Pw, opBytes{0xa7}},
1051	{ACMPSS, yxcmpi, Px, opBytes{Pf3, 0xc2}},
1052	{ACMPSW, ynone, Pe, opBytes{0xa7}},
1053	{ACMPW, ycmpl, Pe, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1054	{ACOMISD, yxm, Pe, opBytes{0x2f}},
1055	{ACOMISS, yxm, Pm, opBytes{0x2f}},
1056	{ACPUID, ynone, Pm, opBytes{0xa2}},
1057	{ACVTPL2PD, yxcvm2, Px, opBytes{Pf3, 0xe6, Pe, 0x2a}},
1058	{ACVTPL2PS, yxcvm2, Pm, opBytes{0x5b, 0, 0x2a, 0}},
1059	{ACVTPD2PL, yxcvm1, Px, opBytes{Pf2, 0xe6, Pe, 0x2d}},
1060	{ACVTPD2PS, yxm, Pe, opBytes{0x5a}},
1061	{ACVTPS2PL, yxcvm1, Px, opBytes{Pe, 0x5b, Pm, 0x2d}},
1062	{ACVTPS2PD, yxm, Pm, opBytes{0x5a}},
1063	{ACVTSD2SL, yxcvfl, Pf2, opBytes{0x2d}},
1064	{ACVTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2d}},
1065	{ACVTSD2SS, yxm, Pf2, opBytes{0x5a}},
1066	{ACVTSL2SD, yxcvlf, Pf2, opBytes{0x2a}},
1067	{ACVTSQ2SD, yxcvqf, Pw, opBytes{Pf2, 0x2a}},
1068	{ACVTSL2SS, yxcvlf, Pf3, opBytes{0x2a}},
1069	{ACVTSQ2SS, yxcvqf, Pw, opBytes{Pf3, 0x2a}},
1070	{ACVTSS2SD, yxm, Pf3, opBytes{0x5a}},
1071	{ACVTSS2SL, yxcvfl, Pf3, opBytes{0x2d}},
1072	{ACVTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2d}},
1073	{ACVTTPD2PL, yxcvm1, Px, opBytes{Pe, 0xe6, Pe, 0x2c}},
1074	{ACVTTPS2PL, yxcvm1, Px, opBytes{Pf3, 0x5b, Pm, 0x2c}},
1075	{ACVTTSD2SL, yxcvfl, Pf2, opBytes{0x2c}},
1076	{ACVTTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2c}},
1077	{ACVTTSS2SL, yxcvfl, Pf3, opBytes{0x2c}},
1078	{ACVTTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2c}},
1079	{ACWD, ynone, Pe, opBytes{0x99}},
1080	{ACWDE, ynone, Px, opBytes{0x98}},
1081	{ACQO, ynone, Pw, opBytes{0x99}},
1082	{ADAA, ynone, P32, opBytes{0x27}},
1083	{ADAS, ynone, P32, opBytes{0x2f}},
1084	{ADECB, yscond, Pb, opBytes{0xfe, 01}},
1085	{ADECL, yincl, Px1, opBytes{0x48, 0xff, 01}},
1086	{ADECQ, yincq, Pw, opBytes{0xff, 01}},
1087	{ADECW, yincq, Pe, opBytes{0xff, 01}},
1088	{ADIVB, ydivb, Pb, opBytes{0xf6, 06}},
1089	{ADIVL, ydivl, Px, opBytes{0xf7, 06}},
1090	{ADIVPD, yxm, Pe, opBytes{0x5e}},
1091	{ADIVPS, yxm, Pm, opBytes{0x5e}},
1092	{ADIVQ, ydivl, Pw, opBytes{0xf7, 06}},
1093	{ADIVSD, yxm, Pf2, opBytes{0x5e}},
1094	{ADIVSS, yxm, Pf3, opBytes{0x5e}},
1095	{ADIVW, ydivl, Pe, opBytes{0xf7, 06}},
1096	{ADPPD, yxshuf, Pq, opBytes{0x3a, 0x41, 0}},
1097	{ADPPS, yxshuf, Pq, opBytes{0x3a, 0x40, 0}},
1098	{AEMMS, ynone, Pm, opBytes{0x77}},
1099	{AEXTRACTPS, yextractps, Pq, opBytes{0x3a, 0x17, 0}},
1100	{AENTER, nil, 0, opBytes{}}, // botch
1101	{AFXRSTOR, ysvrs_mo, Pm, opBytes{0xae, 01, 0xae, 01}},
1102	{AFXSAVE, ysvrs_om, Pm, opBytes{0xae, 00, 0xae, 00}},
1103	{AFXRSTOR64, ysvrs_mo, Pw, opBytes{0x0f, 0xae, 01, 0x0f, 0xae, 01}},
1104	{AFXSAVE64, ysvrs_om, Pw, opBytes{0x0f, 0xae, 00, 0x0f, 0xae, 00}},
1105	{AHLT, ynone, Px, opBytes{0xf4}},
1106	{AIDIVB, ydivb, Pb, opBytes{0xf6, 07}},
1107	{AIDIVL, ydivl, Px, opBytes{0xf7, 07}},
1108	{AIDIVQ, ydivl, Pw, opBytes{0xf7, 07}},
1109	{AIDIVW, ydivl, Pe, opBytes{0xf7, 07}},
1110	{AIMULB, ydivb, Pb, opBytes{0xf6, 05}},
1111	{AIMULL, yimul, Px, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1112	{AIMULQ, yimul, Pw, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1113	{AIMULW, yimul, Pe, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1114	{AIMUL3W, yimul3, Pe, opBytes{0x6b, 00, 0x69, 00}},
1115	{AIMUL3L, yimul3, Px, opBytes{0x6b, 00, 0x69, 00}},
1116	{AIMUL3Q, yimul3, Pw, opBytes{0x6b, 00, 0x69, 00}},
1117	{AINB, yin, Pb, opBytes{0xe4, 0xec}},
1118	{AINW, yin, Pe, opBytes{0xe5, 0xed}},
1119	{AINL, yin, Px, opBytes{0xe5, 0xed}},
1120	{AINCB, yscond, Pb, opBytes{0xfe, 00}},
1121	{AINCL, yincl, Px1, opBytes{0x40, 0xff, 00}},
1122	{AINCQ, yincq, Pw, opBytes{0xff, 00}},
1123	{AINCW, yincq, Pe, opBytes{0xff, 00}},
1124	{AINSB, ynone, Pb, opBytes{0x6c}},
1125	{AINSL, ynone, Px, opBytes{0x6d}},
1126	{AINSERTPS, yxshuf, Pq, opBytes{0x3a, 0x21, 0}},
1127	{AINSW, ynone, Pe, opBytes{0x6d}},
1128	{AICEBP, ynone, Px, opBytes{0xf1}},
1129	{AINT, yint, Px, opBytes{0xcd}},
1130	{AINTO, ynone, P32, opBytes{0xce}},
1131	{AIRETL, ynone, Px, opBytes{0xcf}},
1132	{AIRETQ, ynone, Pw, opBytes{0xcf}},
1133	{AIRETW, ynone, Pe, opBytes{0xcf}},
1134	{AJCC, yjcond, Px, opBytes{0x73, 0x83, 00}},
1135	{AJCS, yjcond, Px, opBytes{0x72, 0x82}},
1136	{AJCXZL, yloop, Px, opBytes{0xe3}},
1137	{AJCXZW, yloop, Px, opBytes{0xe3}},
1138	{AJCXZQ, yloop, Px, opBytes{0xe3}},
1139	{AJEQ, yjcond, Px, opBytes{0x74, 0x84}},
1140	{AJGE, yjcond, Px, opBytes{0x7d, 0x8d}},
1141	{AJGT, yjcond, Px, opBytes{0x7f, 0x8f}},
1142	{AJHI, yjcond, Px, opBytes{0x77, 0x87}},
1143	{AJLE, yjcond, Px, opBytes{0x7e, 0x8e}},
1144	{AJLS, yjcond, Px, opBytes{0x76, 0x86}},
1145	{AJLT, yjcond, Px, opBytes{0x7c, 0x8c}},
1146	{AJMI, yjcond, Px, opBytes{0x78, 0x88}},
1147	{obj.AJMP, yjmp, Px, opBytes{0xff, 04, 0xeb, 0xe9}},
1148	{AJNE, yjcond, Px, opBytes{0x75, 0x85}},
1149	{AJOC, yjcond, Px, opBytes{0x71, 0x81, 00}},
1150	{AJOS, yjcond, Px, opBytes{0x70, 0x80, 00}},
1151	{AJPC, yjcond, Px, opBytes{0x7b, 0x8b}},
1152	{AJPL, yjcond, Px, opBytes{0x79, 0x89}},
1153	{AJPS, yjcond, Px, opBytes{0x7a, 0x8a}},
1154	{AHADDPD, yxm, Pq, opBytes{0x7c}},
1155	{AHADDPS, yxm, Pf2, opBytes{0x7c}},
1156	{AHSUBPD, yxm, Pq, opBytes{0x7d}},
1157	{AHSUBPS, yxm, Pf2, opBytes{0x7d}},
1158	{ALAHF, ynone, Px, opBytes{0x9f}},
1159	{ALARL, yml_rl, Pm, opBytes{0x02}},
1160	{ALARQ, yml_rl, Pw, opBytes{0x0f, 0x02}},
1161	{ALARW, yml_rl, Pq, opBytes{0x02}},
1162	{ALDDQU, ylddqu, Pf2, opBytes{0xf0}},
1163	{ALDMXCSR, ysvrs_mo, Pm, opBytes{0xae, 02, 0xae, 02}},
1164	{ALEAL, ym_rl, Px, opBytes{0x8d}},
1165	{ALEAQ, ym_rl, Pw, opBytes{0x8d}},
1166	{ALEAVEL, ynone, P32, opBytes{0xc9}},
1167	{ALEAVEQ, ynone, Py, opBytes{0xc9}},
1168	{ALEAVEW, ynone, Pe, opBytes{0xc9}},
1169	{ALEAW, ym_rl, Pe, opBytes{0x8d}},
1170	{ALOCK, ynone, Px, opBytes{0xf0}},
1171	{ALODSB, ynone, Pb, opBytes{0xac}},
1172	{ALODSL, ynone, Px, opBytes{0xad}},
1173	{ALODSQ, ynone, Pw, opBytes{0xad}},
1174	{ALODSW, ynone, Pe, opBytes{0xad}},
1175	{ALONG, ybyte, Px, opBytes{4}},
1176	{ALOOP, yloop, Px, opBytes{0xe2}},
1177	{ALOOPEQ, yloop, Px, opBytes{0xe1}},
1178	{ALOOPNE, yloop, Px, opBytes{0xe0}},
1179	{ALTR, ydivl, Pm, opBytes{0x00, 03}},
1180	{ALZCNTL, yml_rl, Pf3, opBytes{0xbd}},
1181	{ALZCNTQ, yml_rl, Pfw, opBytes{0xbd}},
1182	{ALZCNTW, yml_rl, Pef3, opBytes{0xbd}},
1183	{ALSLL, yml_rl, Pm, opBytes{0x03}},
1184	{ALSLW, yml_rl, Pq, opBytes{0x03}},
1185	{ALSLQ, yml_rl, Pw, opBytes{0x0f, 0x03}},
1186	{AMASKMOVOU, yxr, Pe, opBytes{0xf7}},
1187	{AMASKMOVQ, ymr, Pm, opBytes{0xf7}},
1188	{AMAXPD, yxm, Pe, opBytes{0x5f}},
1189	{AMAXPS, yxm, Pm, opBytes{0x5f}},
1190	{AMAXSD, yxm, Pf2, opBytes{0x5f}},
1191	{AMAXSS, yxm, Pf3, opBytes{0x5f}},
1192	{AMINPD, yxm, Pe, opBytes{0x5d}},
1193	{AMINPS, yxm, Pm, opBytes{0x5d}},
1194	{AMINSD, yxm, Pf2, opBytes{0x5d}},
1195	{AMINSS, yxm, Pf3, opBytes{0x5d}},
1196	{AMONITOR, ynone, Px, opBytes{0x0f, 0x01, 0xc8, 0}},
1197	{AMWAIT, ynone, Px, opBytes{0x0f, 0x01, 0xc9, 0}},
1198	{AMOVAPD, yxmov, Pe, opBytes{0x28, 0x29}},
1199	{AMOVAPS, yxmov, Pm, opBytes{0x28, 0x29}},
1200	{AMOVB, ymovb, Pb, opBytes{0x88, 0x8a, 0xb0, 0xc6, 00}},
1201	{AMOVBLSX, ymb_rl, Pm, opBytes{0xbe}},
1202	{AMOVBLZX, ymb_rl, Pm, opBytes{0xb6}},
1203	{AMOVBQSX, ymb_rl, Pw, opBytes{0x0f, 0xbe}},
1204	{AMOVBQZX, ymb_rl, Pw, opBytes{0x0f, 0xb6}},
1205	{AMOVBWSX, ymb_rl, Pq, opBytes{0xbe}},
1206	{AMOVSWW, ymb_rl, Pe, opBytes{0x0f, 0xbf}},
1207	{AMOVBWZX, ymb_rl, Pq, opBytes{0xb6}},
1208	{AMOVZWW, ymb_rl, Pe, opBytes{0x0f, 0xb7}},
1209	{AMOVO, yxmov, Pe, opBytes{0x6f, 0x7f}},
1210	{AMOVOU, yxmov, Pf3, opBytes{0x6f, 0x7f}},
1211	{AMOVHLPS, yxr, Pm, opBytes{0x12}},
1212	{AMOVHPD, yxmov, Pe, opBytes{0x16, 0x17}},
1213	{AMOVHPS, yxmov, Pm, opBytes{0x16, 0x17}},
1214	{AMOVL, ymovl, Px, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
1215	{AMOVLHPS, yxr, Pm, opBytes{0x16}},
1216	{AMOVLPD, yxmov, Pe, opBytes{0x12, 0x13}},
1217	{AMOVLPS, yxmov, Pm, opBytes{0x12, 0x13}},
1218	{AMOVLQSX, yml_rl, Pw, opBytes{0x63}},
1219	{AMOVLQZX, yml_rl, Px, opBytes{0x8b}},
1220	{AMOVMSKPD, yxrrl, Pq, opBytes{0x50}},
1221	{AMOVMSKPS, yxrrl, Pm, opBytes{0x50}},
1222	{AMOVNTO, yxr_ml, Pe, opBytes{0xe7}},
1223	{AMOVNTDQA, ylddqu, Pq4, opBytes{0x2a}},
1224	{AMOVNTPD, yxr_ml, Pe, opBytes{0x2b}},
1225	{AMOVNTPS, yxr_ml, Pm, opBytes{0x2b}},
1226	{AMOVNTQ, ymr_ml, Pm, opBytes{0xe7}},
1227	{AMOVQ, ymovq, Pw8, opBytes{0x6f, 0x7f, Pf2, 0xd6, Pf3, 0x7e, Pe, 0xd6, 0x89, 0x8b, 0xc7, 00, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
1228	{AMOVQOZX, ymrxr, Pf3, opBytes{0xd6, 0x7e}},
1229	{AMOVSB, ynone, Pb, opBytes{0xa4}},
1230	{AMOVSD, yxmov, Pf2, opBytes{0x10, 0x11}},
1231	{AMOVSL, ynone, Px, opBytes{0xa5}},
1232	{AMOVSQ, ynone, Pw, opBytes{0xa5}},
1233	{AMOVSS, yxmov, Pf3, opBytes{0x10, 0x11}},
1234	{AMOVSW, ynone, Pe, opBytes{0xa5}},
1235	{AMOVUPD, yxmov, Pe, opBytes{0x10, 0x11}},
1236	{AMOVUPS, yxmov, Pm, opBytes{0x10, 0x11}},
1237	{AMOVW, ymovw, Pe, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0}},
1238	{AMOVWLSX, yml_rl, Pm, opBytes{0xbf}},
1239	{AMOVWLZX, yml_rl, Pm, opBytes{0xb7}},
1240	{AMOVWQSX, yml_rl, Pw, opBytes{0x0f, 0xbf}},
1241	{AMOVWQZX, yml_rl, Pw, opBytes{0x0f, 0xb7}},
1242	{AMPSADBW, yxshuf, Pq, opBytes{0x3a, 0x42, 0}},
1243	{AMULB, ydivb, Pb, opBytes{0xf6, 04}},
1244	{AMULL, ydivl, Px, opBytes{0xf7, 04}},
1245	{AMULPD, yxm, Pe, opBytes{0x59}},
1246	{AMULPS, yxm, Ym, opBytes{0x59}},
1247	{AMULQ, ydivl, Pw, opBytes{0xf7, 04}},
1248	{AMULSD, yxm, Pf2, opBytes{0x59}},
1249	{AMULSS, yxm, Pf3, opBytes{0x59}},
1250	{AMULW, ydivl, Pe, opBytes{0xf7, 04}},
1251	{ANEGB, yscond, Pb, opBytes{0xf6, 03}},
1252	{ANEGL, yscond, Px, opBytes{0xf7, 03}},
1253	{ANEGQ, yscond, Pw, opBytes{0xf7, 03}},
1254	{ANEGW, yscond, Pe, opBytes{0xf7, 03}},
1255	{obj.ANOP, ynop, Px, opBytes{0, 0}},
1256	{ANOTB, yscond, Pb, opBytes{0xf6, 02}},
1257	{ANOTL, yscond, Px, opBytes{0xf7, 02}}, // TODO(rsc): yscond is wrong here.
1258	{ANOTQ, yscond, Pw, opBytes{0xf7, 02}},
1259	{ANOTW, yscond, Pe, opBytes{0xf7, 02}},
1260	{AORB, yxorb, Pb, opBytes{0x0c, 0x80, 01, 0x08, 0x0a}},
1261	{AORL, yaddl, Px, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1262	{AORPD, yxm, Pq, opBytes{0x56}},
1263	{AORPS, yxm, Pm, opBytes{0x56}},
1264	{AORQ, yaddl, Pw, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1265	{AORW, yaddl, Pe, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1266	{AOUTB, yin, Pb, opBytes{0xe6, 0xee}},
1267	{AOUTL, yin, Px, opBytes{0xe7, 0xef}},
1268	{AOUTW, yin, Pe, opBytes{0xe7, 0xef}},
1269	{AOUTSB, ynone, Pb, opBytes{0x6e}},
1270	{AOUTSL, ynone, Px, opBytes{0x6f}},
1271	{AOUTSW, ynone, Pe, opBytes{0x6f}},
1272	{APABSB, yxm_q4, Pq4, opBytes{0x1c}},
1273	{APABSD, yxm_q4, Pq4, opBytes{0x1e}},
1274	{APABSW, yxm_q4, Pq4, opBytes{0x1d}},
1275	{APACKSSLW, ymm, Py1, opBytes{0x6b, Pe, 0x6b}},
1276	{APACKSSWB, ymm, Py1, opBytes{0x63, Pe, 0x63}},
1277	{APACKUSDW, yxm_q4, Pq4, opBytes{0x2b}},
1278	{APACKUSWB, ymm, Py1, opBytes{0x67, Pe, 0x67}},
1279	{APADDB, ymm, Py1, opBytes{0xfc, Pe, 0xfc}},
1280	{APADDL, ymm, Py1, opBytes{0xfe, Pe, 0xfe}},
1281	{APADDQ, yxm, Pe, opBytes{0xd4}},
1282	{APADDSB, ymm, Py1, opBytes{0xec, Pe, 0xec}},
1283	{APADDSW, ymm, Py1, opBytes{0xed, Pe, 0xed}},
1284	{APADDUSB, ymm, Py1, opBytes{0xdc, Pe, 0xdc}},
1285	{APADDUSW, ymm, Py1, opBytes{0xdd, Pe, 0xdd}},
1286	{APADDW, ymm, Py1, opBytes{0xfd, Pe, 0xfd}},
1287	{APALIGNR, ypalignr, Pq, opBytes{0x3a, 0x0f}},
1288	{APAND, ymm, Py1, opBytes{0xdb, Pe, 0xdb}},
1289	{APANDN, ymm, Py1, opBytes{0xdf, Pe, 0xdf}},
1290	{APAUSE, ynone, Px, opBytes{0xf3, 0x90}},
1291	{APAVGB, ymm, Py1, opBytes{0xe0, Pe, 0xe0}},
1292	{APAVGW, ymm, Py1, opBytes{0xe3, Pe, 0xe3}},
1293	{APBLENDW, yxshuf, Pq, opBytes{0x3a, 0x0e, 0}},
1294	{APCMPEQB, ymm, Py1, opBytes{0x74, Pe, 0x74}},
1295	{APCMPEQL, ymm, Py1, opBytes{0x76, Pe, 0x76}},
1296	{APCMPEQQ, yxm_q4, Pq4, opBytes{0x29}},
1297	{APCMPEQW, ymm, Py1, opBytes{0x75, Pe, 0x75}},
1298	{APCMPGTB, ymm, Py1, opBytes{0x64, Pe, 0x64}},
1299	{APCMPGTL, ymm, Py1, opBytes{0x66, Pe, 0x66}},
1300	{APCMPGTQ, yxm_q4, Pq4, opBytes{0x37}},
1301	{APCMPGTW, ymm, Py1, opBytes{0x65, Pe, 0x65}},
1302	{APCMPISTRI, yxshuf, Pq, opBytes{0x3a, 0x63, 0}},
1303	{APCMPISTRM, yxshuf, Pq, opBytes{0x3a, 0x62, 0}},
1304	{APEXTRW, yextrw, Pq, opBytes{0xc5, 0, 0x3a, 0x15, 0}},
1305	{APEXTRB, yextr, Pq, opBytes{0x3a, 0x14, 00}},
1306	{APEXTRD, yextr, Pq, opBytes{0x3a, 0x16, 00}},
1307	{APEXTRQ, yextr, Pq3, opBytes{0x3a, 0x16, 00}},
1308	{APHADDD, ymmxmm0f38, Px, opBytes{0x0F, 0x38, 0x02, 0, 0x66, 0x0F, 0x38, 0x02, 0}},
1309	{APHADDSW, yxm_q4, Pq4, opBytes{0x03}},
1310	{APHADDW, yxm_q4, Pq4, opBytes{0x01}},
1311	{APHMINPOSUW, yxm_q4, Pq4, opBytes{0x41}},
1312	{APHSUBD, yxm_q4, Pq4, opBytes{0x06}},
1313	{APHSUBSW, yxm_q4, Pq4, opBytes{0x07}},
1314	{APHSUBW, yxm_q4, Pq4, opBytes{0x05}},
1315	{APINSRW, yinsrw, Pq, opBytes{0xc4, 00}},
1316	{APINSRB, yinsr, Pq, opBytes{0x3a, 0x20, 00}},
1317	{APINSRD, yinsr, Pq, opBytes{0x3a, 0x22, 00}},
1318	{APINSRQ, yinsr, Pq3, opBytes{0x3a, 0x22, 00}},
1319	{APMADDUBSW, yxm_q4, Pq4, opBytes{0x04}},
1320	{APMADDWL, ymm, Py1, opBytes{0xf5, Pe, 0xf5}},
1321	{APMAXSB, yxm_q4, Pq4, opBytes{0x3c}},
1322	{APMAXSD, yxm_q4, Pq4, opBytes{0x3d}},
1323	{APMAXSW, yxm, Pe, opBytes{0xee}},
1324	{APMAXUB, yxm, Pe, opBytes{0xde}},
1325	{APMAXUD, yxm_q4, Pq4, opBytes{0x3f}},
1326	{APMAXUW, yxm_q4, Pq4, opBytes{0x3e}},
1327	{APMINSB, yxm_q4, Pq4, opBytes{0x38}},
1328	{APMINSD, yxm_q4, Pq4, opBytes{0x39}},
1329	{APMINSW, yxm, Pe, opBytes{0xea}},
1330	{APMINUB, yxm, Pe, opBytes{0xda}},
1331	{APMINUD, yxm_q4, Pq4, opBytes{0x3b}},
1332	{APMINUW, yxm_q4, Pq4, opBytes{0x3a}},
1333	{APMOVMSKB, ymskb, Px, opBytes{Pe, 0xd7, 0xd7}},
1334	{APMOVSXBD, yxm_q4, Pq4, opBytes{0x21}},
1335	{APMOVSXBQ, yxm_q4, Pq4, opBytes{0x22}},
1336	{APMOVSXBW, yxm_q4, Pq4, opBytes{0x20}},
1337	{APMOVSXDQ, yxm_q4, Pq4, opBytes{0x25}},
1338	{APMOVSXWD, yxm_q4, Pq4, opBytes{0x23}},
1339	{APMOVSXWQ, yxm_q4, Pq4, opBytes{0x24}},
1340	{APMOVZXBD, yxm_q4, Pq4, opBytes{0x31}},
1341	{APMOVZXBQ, yxm_q4, Pq4, opBytes{0x32}},
1342	{APMOVZXBW, yxm_q4, Pq4, opBytes{0x30}},
1343	{APMOVZXDQ, yxm_q4, Pq4, opBytes{0x35}},
1344	{APMOVZXWD, yxm_q4, Pq4, opBytes{0x33}},
1345	{APMOVZXWQ, yxm_q4, Pq4, opBytes{0x34}},
1346	{APMULDQ, yxm_q4, Pq4, opBytes{0x28}},
1347	{APMULHRSW, yxm_q4, Pq4, opBytes{0x0b}},
1348	{APMULHUW, ymm, Py1, opBytes{0xe4, Pe, 0xe4}},
1349	{APMULHW, ymm, Py1, opBytes{0xe5, Pe, 0xe5}},
1350	{APMULLD, yxm_q4, Pq4, opBytes{0x40}},
1351	{APMULLW, ymm, Py1, opBytes{0xd5, Pe, 0xd5}},
1352	{APMULULQ, ymm, Py1, opBytes{0xf4, Pe, 0xf4}},
1353	{APOPAL, ynone, P32, opBytes{0x61}},
1354	{APOPAW, ynone, Pe, opBytes{0x61}},
1355	{APOPCNTW, yml_rl, Pef3, opBytes{0xb8}},
1356	{APOPCNTL, yml_rl, Pf3, opBytes{0xb8}},
1357	{APOPCNTQ, yml_rl, Pfw, opBytes{0xb8}},
1358	{APOPFL, ynone, P32, opBytes{0x9d}},
1359	{APOPFQ, ynone, Py, opBytes{0x9d}},
1360	{APOPFW, ynone, Pe, opBytes{0x9d}},
1361	{APOPL, ypopl, P32, opBytes{0x58, 0x8f, 00}},
1362	{APOPQ, ypopl, Py, opBytes{0x58, 0x8f, 00}},
1363	{APOPW, ypopl, Pe, opBytes{0x58, 0x8f, 00}},
1364	{APOR, ymm, Py1, opBytes{0xeb, Pe, 0xeb}},
1365	{APSADBW, yxm, Pq, opBytes{0xf6}},
1366	{APSHUFHW, yxshuf, Pf3, opBytes{0x70, 00}},
1367	{APSHUFL, yxshuf, Pq, opBytes{0x70, 00}},
1368	{APSHUFLW, yxshuf, Pf2, opBytes{0x70, 00}},
1369	{APSHUFW, ymshuf, Pm, opBytes{0x70, 00}},
1370	{APSHUFB, ymshufb, Pq, opBytes{0x38, 0x00}},
1371	{APSIGNB, yxm_q4, Pq4, opBytes{0x08}},
1372	{APSIGND, yxm_q4, Pq4, opBytes{0x0a}},
1373	{APSIGNW, yxm_q4, Pq4, opBytes{0x09}},
1374	{APSLLO, ypsdq, Pq, opBytes{0x73, 07}},
1375	{APSLLL, yps, Py3, opBytes{0xf2, 0x72, 06, Pe, 0xf2, Pe, 0x72, 06}},
1376	{APSLLQ, yps, Py3, opBytes{0xf3, 0x73, 06, Pe, 0xf3, Pe, 0x73, 06}},
1377	{APSLLW, yps, Py3, opBytes{0xf1, 0x71, 06, Pe, 0xf1, Pe, 0x71, 06}},
1378	{APSRAL, yps, Py3, opBytes{0xe2, 0x72, 04, Pe, 0xe2, Pe, 0x72, 04}},
1379	{APSRAW, yps, Py3, opBytes{0xe1, 0x71, 04, Pe, 0xe1, Pe, 0x71, 04}},
1380	{APSRLO, ypsdq, Pq, opBytes{0x73, 03}},
1381	{APSRLL, yps, Py3, opBytes{0xd2, 0x72, 02, Pe, 0xd2, Pe, 0x72, 02}},
1382	{APSRLQ, yps, Py3, opBytes{0xd3, 0x73, 02, Pe, 0xd3, Pe, 0x73, 02}},
1383	{APSRLW, yps, Py3, opBytes{0xd1, 0x71, 02, Pe, 0xd1, Pe, 0x71, 02}},
1384	{APSUBB, yxm, Pe, opBytes{0xf8}},
1385	{APSUBL, yxm, Pe, opBytes{0xfa}},
1386	{APSUBQ, yxm, Pe, opBytes{0xfb}},
1387	{APSUBSB, yxm, Pe, opBytes{0xe8}},
1388	{APSUBSW, yxm, Pe, opBytes{0xe9}},
1389	{APSUBUSB, yxm, Pe, opBytes{0xd8}},
1390	{APSUBUSW, yxm, Pe, opBytes{0xd9}},
1391	{APSUBW, yxm, Pe, opBytes{0xf9}},
1392	{APTEST, yxm_q4, Pq4, opBytes{0x17}},
1393	{APUNPCKHBW, ymm, Py1, opBytes{0x68, Pe, 0x68}},
1394	{APUNPCKHLQ, ymm, Py1, opBytes{0x6a, Pe, 0x6a}},
1395	{APUNPCKHQDQ, yxm, Pe, opBytes{0x6d}},
1396	{APUNPCKHWL, ymm, Py1, opBytes{0x69, Pe, 0x69}},
1397	{APUNPCKLBW, ymm, Py1, opBytes{0x60, Pe, 0x60}},
1398	{APUNPCKLLQ, ymm, Py1, opBytes{0x62, Pe, 0x62}},
1399	{APUNPCKLQDQ, yxm, Pe, opBytes{0x6c}},
1400	{APUNPCKLWL, ymm, Py1, opBytes{0x61, Pe, 0x61}},
1401	{APUSHAL, ynone, P32, opBytes{0x60}},
1402	{APUSHAW, ynone, Pe, opBytes{0x60}},
1403	{APUSHFL, ynone, P32, opBytes{0x9c}},
1404	{APUSHFQ, ynone, Py, opBytes{0x9c}},
1405	{APUSHFW, ynone, Pe, opBytes{0x9c}},
1406	{APUSHL, ypushl, P32, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1407	{APUSHQ, ypushl, Py, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1408	{APUSHW, ypushl, Pe, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1409	{APXOR, ymm, Py1, opBytes{0xef, Pe, 0xef}},
1410	{AQUAD, ybyte, Px, opBytes{8}},
1411	{ARCLB, yshb, Pb, opBytes{0xd0, 02, 0xc0, 02, 0xd2, 02}},
1412	{ARCLL, yshl, Px, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1413	{ARCLQ, yshl, Pw, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1414	{ARCLW, yshl, Pe, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1415	{ARCPPS, yxm, Pm, opBytes{0x53}},
1416	{ARCPSS, yxm, Pf3, opBytes{0x53}},
1417	{ARCRB, yshb, Pb, opBytes{0xd0, 03, 0xc0, 03, 0xd2, 03}},
1418	{ARCRL, yshl, Px, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1419	{ARCRQ, yshl, Pw, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1420	{ARCRW, yshl, Pe, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1421	{AREP, ynone, Px, opBytes{0xf3}},
1422	{AREPN, ynone, Px, opBytes{0xf2}},
1423	{obj.ARET, ynone, Px, opBytes{0xc3}},
1424	{ARETFW, yret, Pe, opBytes{0xcb, 0xca}},
1425	{ARETFL, yret, Px, opBytes{0xcb, 0xca}},
1426	{ARETFQ, yret, Pw, opBytes{0xcb, 0xca}},
1427	{AROLB, yshb, Pb, opBytes{0xd0, 00, 0xc0, 00, 0xd2, 00}},
1428	{AROLL, yshl, Px, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1429	{AROLQ, yshl, Pw, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1430	{AROLW, yshl, Pe, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1431	{ARORB, yshb, Pb, opBytes{0xd0, 01, 0xc0, 01, 0xd2, 01}},
1432	{ARORL, yshl, Px, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1433	{ARORQ, yshl, Pw, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1434	{ARORW, yshl, Pe, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1435	{ARSQRTPS, yxm, Pm, opBytes{0x52}},
1436	{ARSQRTSS, yxm, Pf3, opBytes{0x52}},
1437	{ASAHF, ynone, Px, opBytes{0x9e, 00, 0x86, 0xe0, 0x50, 0x9d}}, // XCHGB AH,AL; PUSH AX; POPFL
1438	{ASALB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
1439	{ASALL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1440	{ASALQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1441	{ASALW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1442	{ASARB, yshb, Pb, opBytes{0xd0, 07, 0xc0, 07, 0xd2, 07}},
1443	{ASARL, yshl, Px, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1444	{ASARQ, yshl, Pw, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1445	{ASARW, yshl, Pe, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1446	{ASBBB, yxorb, Pb, opBytes{0x1c, 0x80, 03, 0x18, 0x1a}},
1447	{ASBBL, yaddl, Px, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1448	{ASBBQ, yaddl, Pw, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1449	{ASBBW, yaddl, Pe, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1450	{ASCASB, ynone, Pb, opBytes{0xae}},
1451	{ASCASL, ynone, Px, opBytes{0xaf}},
1452	{ASCASQ, ynone, Pw, opBytes{0xaf}},
1453	{ASCASW, ynone, Pe, opBytes{0xaf}},
1454	{ASETCC, yscond, Pb, opBytes{0x0f, 0x93, 00}},
1455	{ASETCS, yscond, Pb, opBytes{0x0f, 0x92, 00}},
1456	{ASETEQ, yscond, Pb, opBytes{0x0f, 0x94, 00}},
1457	{ASETGE, yscond, Pb, opBytes{0x0f, 0x9d, 00}},
1458	{ASETGT, yscond, Pb, opBytes{0x0f, 0x9f, 00}},
1459	{ASETHI, yscond, Pb, opBytes{0x0f, 0x97, 00}},
1460	{ASETLE, yscond, Pb, opBytes{0x0f, 0x9e, 00}},
1461	{ASETLS, yscond, Pb, opBytes{0x0f, 0x96, 00}},
1462	{ASETLT, yscond, Pb, opBytes{0x0f, 0x9c, 00}},
1463	{ASETMI, yscond, Pb, opBytes{0x0f, 0x98, 00}},
1464	{ASETNE, yscond, Pb, opBytes{0x0f, 0x95, 00}},
1465	{ASETOC, yscond, Pb, opBytes{0x0f, 0x91, 00}},
1466	{ASETOS, yscond, Pb, opBytes{0x0f, 0x90, 00}},
1467	{ASETPC, yscond, Pb, opBytes{0x0f, 0x9b, 00}},
1468	{ASETPL, yscond, Pb, opBytes{0x0f, 0x99, 00}},
1469	{ASETPS, yscond, Pb, opBytes{0x0f, 0x9a, 00}},
1470	{ASHLB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
1471	{ASHLL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1472	{ASHLQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1473	{ASHLW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1474	{ASHRB, yshb, Pb, opBytes{0xd0, 05, 0xc0, 05, 0xd2, 05}},
1475	{ASHRL, yshl, Px, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1476	{ASHRQ, yshl, Pw, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1477	{ASHRW, yshl, Pe, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1478	{ASHUFPD, yxshuf, Pq, opBytes{0xc6, 00}},
1479	{ASHUFPS, yxshuf, Pm, opBytes{0xc6, 00}},
1480	{ASQRTPD, yxm, Pe, opBytes{0x51}},
1481	{ASQRTPS, yxm, Pm, opBytes{0x51}},
1482	{ASQRTSD, yxm, Pf2, opBytes{0x51}},
1483	{ASQRTSS, yxm, Pf3, opBytes{0x51}},
1484	{ASTC, ynone, Px, opBytes{0xf9}},
1485	{ASTD, ynone, Px, opBytes{0xfd}},
1486	{ASTI, ynone, Px, opBytes{0xfb}},
1487	{ASTMXCSR, ysvrs_om, Pm, opBytes{0xae, 03, 0xae, 03}},
1488	{ASTOSB, ynone, Pb, opBytes{0xaa}},
1489	{ASTOSL, ynone, Px, opBytes{0xab}},
1490	{ASTOSQ, ynone, Pw, opBytes{0xab}},
1491	{ASTOSW, ynone, Pe, opBytes{0xab}},
1492	{ASUBB, yxorb, Pb, opBytes{0x2c, 0x80, 05, 0x28, 0x2a}},
1493	{ASUBL, yaddl, Px, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1494	{ASUBPD, yxm, Pe, opBytes{0x5c}},
1495	{ASUBPS, yxm, Pm, opBytes{0x5c}},
1496	{ASUBQ, yaddl, Pw, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1497	{ASUBSD, yxm, Pf2, opBytes{0x5c}},
1498	{ASUBSS, yxm, Pf3, opBytes{0x5c}},
1499	{ASUBW, yaddl, Pe, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1500	{ASWAPGS, ynone, Pm, opBytes{0x01, 0xf8}},
1501	{ASYSCALL, ynone, Px, opBytes{0x0f, 0x05}}, // fast syscall
1502	{ATESTB, yxorb, Pb, opBytes{0xa8, 0xf6, 00, 0x84, 0x84}},
1503	{ATESTL, ytestl, Px, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1504	{ATESTQ, ytestl, Pw, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1505	{ATESTW, ytestl, Pe, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1506	{ATPAUSE, ywrfsbase, Pq, opBytes{0xae, 06}},
1507	{obj.ATEXT, ytext, Px, opBytes{}},
1508	{AUCOMISD, yxm, Pe, opBytes{0x2e}},
1509	{AUCOMISS, yxm, Pm, opBytes{0x2e}},
1510	{AUNPCKHPD, yxm, Pe, opBytes{0x15}},
1511	{AUNPCKHPS, yxm, Pm, opBytes{0x15}},
1512	{AUNPCKLPD, yxm, Pe, opBytes{0x14}},
1513	{AUNPCKLPS, yxm, Pm, opBytes{0x14}},
1514	{AUMONITOR, ywrfsbase, Pf3, opBytes{0xae, 06}},
1515	{AVERR, ydivl, Pm, opBytes{0x00, 04}},
1516	{AVERW, ydivl, Pm, opBytes{0x00, 05}},
1517	{AWAIT, ynone, Px, opBytes{0x9b}},
1518	{AWORD, ybyte, Px, opBytes{2}},
1519	{AXCHGB, yml_mb, Pb, opBytes{0x86, 0x86}},
1520	{AXCHGL, yxchg, Px, opBytes{0x90, 0x90, 0x87, 0x87}},
1521	{AXCHGQ, yxchg, Pw, opBytes{0x90, 0x90, 0x87, 0x87}},
1522	{AXCHGW, yxchg, Pe, opBytes{0x90, 0x90, 0x87, 0x87}},
1523	{AXLAT, ynone, Px, opBytes{0xd7}},
1524	{AXORB, yxorb, Pb, opBytes{0x34, 0x80, 06, 0x30, 0x32}},
1525	{AXORL, yaddl, Px, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1526	{AXORPD, yxm, Pe, opBytes{0x57}},
1527	{AXORPS, yxm, Pm, opBytes{0x57}},
1528	{AXORQ, yaddl, Pw, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1529	{AXORW, yaddl, Pe, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1530	{AFMOVB, yfmvx, Px, opBytes{0xdf, 04}},
1531	{AFMOVBP, yfmvp, Px, opBytes{0xdf, 06}},
1532	{AFMOVD, yfmvd, Px, opBytes{0xdd, 00, 0xdd, 02, 0xd9, 00, 0xdd, 02}},
1533	{AFMOVDP, yfmvdp, Px, opBytes{0xdd, 03, 0xdd, 03}},
1534	{AFMOVF, yfmvf, Px, opBytes{0xd9, 00, 0xd9, 02}},
1535	{AFMOVFP, yfmvp, Px, opBytes{0xd9, 03}},
1536	{AFMOVL, yfmvf, Px, opBytes{0xdb, 00, 0xdb, 02}},
1537	{AFMOVLP, yfmvp, Px, opBytes{0xdb, 03}},
1538	{AFMOVV, yfmvx, Px, opBytes{0xdf, 05}},
1539	{AFMOVVP, yfmvp, Px, opBytes{0xdf, 07}},
1540	{AFMOVW, yfmvf, Px, opBytes{0xdf, 00, 0xdf, 02}},
1541	{AFMOVWP, yfmvp, Px, opBytes{0xdf, 03}},
1542	{AFMOVX, yfmvx, Px, opBytes{0xdb, 05}},
1543	{AFMOVXP, yfmvp, Px, opBytes{0xdb, 07}},
1544	{AFCMOVCC, yfcmv, Px, opBytes{0xdb, 00}},
1545	{AFCMOVCS, yfcmv, Px, opBytes{0xda, 00}},
1546	{AFCMOVEQ, yfcmv, Px, opBytes{0xda, 01}},
1547	{AFCMOVHI, yfcmv, Px, opBytes{0xdb, 02}},
1548	{AFCMOVLS, yfcmv, Px, opBytes{0xda, 02}},
1549	{AFCMOVB, yfcmv, Px, opBytes{0xda, 00}},
1550	{AFCMOVBE, yfcmv, Px, opBytes{0xda, 02}},
1551	{AFCMOVNB, yfcmv, Px, opBytes{0xdb, 00}},
1552	{AFCMOVNBE, yfcmv, Px, opBytes{0xdb, 02}},
1553	{AFCMOVE, yfcmv, Px, opBytes{0xda, 01}},
1554	{AFCMOVNE, yfcmv, Px, opBytes{0xdb, 01}},
1555	{AFCMOVNU, yfcmv, Px, opBytes{0xdb, 03}},
1556	{AFCMOVU, yfcmv, Px, opBytes{0xda, 03}},
1557	{AFCMOVUN, yfcmv, Px, opBytes{0xda, 03}},
1558	{AFCOMD, yfadd, Px, opBytes{0xdc, 02, 0xd8, 02, 0xdc, 02}},  // botch
1559	{AFCOMDP, yfadd, Px, opBytes{0xdc, 03, 0xd8, 03, 0xdc, 03}}, // botch
1560	{AFCOMDPP, ycompp, Px, opBytes{0xde, 03}},
1561	{AFCOMF, yfmvx, Px, opBytes{0xd8, 02}},
1562	{AFCOMFP, yfmvx, Px, opBytes{0xd8, 03}},
1563	{AFCOMI, yfcmv, Px, opBytes{0xdb, 06}},
1564	{AFCOMIP, yfcmv, Px, opBytes{0xdf, 06}},
1565	{AFCOML, yfmvx, Px, opBytes{0xda, 02}},
1566	{AFCOMLP, yfmvx, Px, opBytes{0xda, 03}},
1567	{AFCOMW, yfmvx, Px, opBytes{0xde, 02}},
1568	{AFCOMWP, yfmvx, Px, opBytes{0xde, 03}},
1569	{AFUCOM, ycompp, Px, opBytes{0xdd, 04}},
1570	{AFUCOMI, ycompp, Px, opBytes{0xdb, 05}},
1571	{AFUCOMIP, ycompp, Px, opBytes{0xdf, 05}},
1572	{AFUCOMP, ycompp, Px, opBytes{0xdd, 05}},
1573	{AFUCOMPP, ycompp, Px, opBytes{0xda, 13}},
1574	{AFADDDP, ycompp, Px, opBytes{0xde, 00}},
1575	{AFADDW, yfmvx, Px, opBytes{0xde, 00}},
1576	{AFADDL, yfmvx, Px, opBytes{0xda, 00}},
1577	{AFADDF, yfmvx, Px, opBytes{0xd8, 00}},
1578	{AFADDD, yfadd, Px, opBytes{0xdc, 00, 0xd8, 00, 0xdc, 00}},
1579	{AFMULDP, ycompp, Px, opBytes{0xde, 01}},
1580	{AFMULW, yfmvx, Px, opBytes{0xde, 01}},
1581	{AFMULL, yfmvx, Px, opBytes{0xda, 01}},
1582	{AFMULF, yfmvx, Px, opBytes{0xd8, 01}},
1583	{AFMULD, yfadd, Px, opBytes{0xdc, 01, 0xd8, 01, 0xdc, 01}},
1584	{AFSUBDP, ycompp, Px, opBytes{0xde, 05}},
1585	{AFSUBW, yfmvx, Px, opBytes{0xde, 04}},
1586	{AFSUBL, yfmvx, Px, opBytes{0xda, 04}},
1587	{AFSUBF, yfmvx, Px, opBytes{0xd8, 04}},
1588	{AFSUBD, yfadd, Px, opBytes{0xdc, 04, 0xd8, 04, 0xdc, 05}},
1589	{AFSUBRDP, ycompp, Px, opBytes{0xde, 04}},
1590	{AFSUBRW, yfmvx, Px, opBytes{0xde, 05}},
1591	{AFSUBRL, yfmvx, Px, opBytes{0xda, 05}},
1592	{AFSUBRF, yfmvx, Px, opBytes{0xd8, 05}},
1593	{AFSUBRD, yfadd, Px, opBytes{0xdc, 05, 0xd8, 05, 0xdc, 04}},
1594	{AFDIVDP, ycompp, Px, opBytes{0xde, 07}},
1595	{AFDIVW, yfmvx, Px, opBytes{0xde, 06}},
1596	{AFDIVL, yfmvx, Px, opBytes{0xda, 06}},
1597	{AFDIVF, yfmvx, Px, opBytes{0xd8, 06}},
1598	{AFDIVD, yfadd, Px, opBytes{0xdc, 06, 0xd8, 06, 0xdc, 07}},
1599	{AFDIVRDP, ycompp, Px, opBytes{0xde, 06}},
1600	{AFDIVRW, yfmvx, Px, opBytes{0xde, 07}},
1601	{AFDIVRL, yfmvx, Px, opBytes{0xda, 07}},
1602	{AFDIVRF, yfmvx, Px, opBytes{0xd8, 07}},
1603	{AFDIVRD, yfadd, Px, opBytes{0xdc, 07, 0xd8, 07, 0xdc, 06}},
1604	{AFXCHD, yfxch, Px, opBytes{0xd9, 01, 0xd9, 01}},
1605	{AFFREE, nil, 0, opBytes{}},
1606	{AFLDCW, ysvrs_mo, Px, opBytes{0xd9, 05, 0xd9, 05}},
1607	{AFLDENV, ysvrs_mo, Px, opBytes{0xd9, 04, 0xd9, 04}},
1608	{AFRSTOR, ysvrs_mo, Px, opBytes{0xdd, 04, 0xdd, 04}},
1609	{AFSAVE, ysvrs_om, Px, opBytes{0xdd, 06, 0xdd, 06}},
1610	{AFSTCW, ysvrs_om, Px, opBytes{0xd9, 07, 0xd9, 07}},
1611	{AFSTENV, ysvrs_om, Px, opBytes{0xd9, 06, 0xd9, 06}},
1612	{AFSTSW, ystsw, Px, opBytes{0xdd, 07, 0xdf, 0xe0}},
1613	{AF2XM1, ynone, Px, opBytes{0xd9, 0xf0}},
1614	{AFABS, ynone, Px, opBytes{0xd9, 0xe1}},
1615	{AFBLD, ysvrs_mo, Px, opBytes{0xdf, 04}},
1616	{AFBSTP, yclflush, Px, opBytes{0xdf, 06}},
1617	{AFCHS, ynone, Px, opBytes{0xd9, 0xe0}},
1618	{AFCLEX, ynone, Px, opBytes{0xdb, 0xe2}},
1619	{AFCOS, ynone, Px, opBytes{0xd9, 0xff}},
1620	{AFDECSTP, ynone, Px, opBytes{0xd9, 0xf6}},
1621	{AFINCSTP, ynone, Px, opBytes{0xd9, 0xf7}},
1622	{AFINIT, ynone, Px, opBytes{0xdb, 0xe3}},
1623	{AFLD1, ynone, Px, opBytes{0xd9, 0xe8}},
1624	{AFLDL2E, ynone, Px, opBytes{0xd9, 0xea}},
1625	{AFLDL2T, ynone, Px, opBytes{0xd9, 0xe9}},
1626	{AFLDLG2, ynone, Px, opBytes{0xd9, 0xec}},
1627	{AFLDLN2, ynone, Px, opBytes{0xd9, 0xed}},
1628	{AFLDPI, ynone, Px, opBytes{0xd9, 0xeb}},
1629	{AFLDZ, ynone, Px, opBytes{0xd9, 0xee}},
1630	{AFNOP, ynone, Px, opBytes{0xd9, 0xd0}},
1631	{AFPATAN, ynone, Px, opBytes{0xd9, 0xf3}},
1632	{AFPREM, ynone, Px, opBytes{0xd9, 0xf8}},
1633	{AFPREM1, ynone, Px, opBytes{0xd9, 0xf5}},
1634	{AFPTAN, ynone, Px, opBytes{0xd9, 0xf2}},
1635	{AFRNDINT, ynone, Px, opBytes{0xd9, 0xfc}},
1636	{AFSCALE, ynone, Px, opBytes{0xd9, 0xfd}},
1637	{AFSIN, ynone, Px, opBytes{0xd9, 0xfe}},
1638	{AFSINCOS, ynone, Px, opBytes{0xd9, 0xfb}},
1639	{AFSQRT, ynone, Px, opBytes{0xd9, 0xfa}},
1640	{AFTST, ynone, Px, opBytes{0xd9, 0xe4}},
1641	{AFXAM, ynone, Px, opBytes{0xd9, 0xe5}},
1642	{AFXTRACT, ynone, Px, opBytes{0xd9, 0xf4}},
1643	{AFYL2X, ynone, Px, opBytes{0xd9, 0xf1}},
1644	{AFYL2XP1, ynone, Px, opBytes{0xd9, 0xf9}},
1645	{ACMPXCHGB, yrb_mb, Pb, opBytes{0x0f, 0xb0}},
1646	{ACMPXCHGL, yrl_ml, Px, opBytes{0x0f, 0xb1}},
1647	{ACMPXCHGW, yrl_ml, Pe, opBytes{0x0f, 0xb1}},
1648	{ACMPXCHGQ, yrl_ml, Pw, opBytes{0x0f, 0xb1}},
1649	{ACMPXCHG8B, yscond, Pm, opBytes{0xc7, 01}},
1650	{ACMPXCHG16B, yscond, Pw, opBytes{0x0f, 0xc7, 01}},
1651	{AINVD, ynone, Pm, opBytes{0x08}},
1652	{AINVLPG, ydivb, Pm, opBytes{0x01, 07}},
1653	{AINVPCID, ycrc32l, Pe, opBytes{0x0f, 0x38, 0x82, 0}},
1654	{ALFENCE, ynone, Pm, opBytes{0xae, 0xe8}},
1655	{AMFENCE, ynone, Pm, opBytes{0xae, 0xf0}},
1656	{AMOVNTIL, yrl_ml, Pm, opBytes{0xc3}},
1657	{AMOVNTIQ, yrl_ml, Pw, opBytes{0x0f, 0xc3}},
1658	{ARDPKRU, ynone, Pm, opBytes{0x01, 0xee, 0}},
1659	{ARDMSR, ynone, Pm, opBytes{0x32}},
1660	{ARDPMC, ynone, Pm, opBytes{0x33}},
1661	{ARDTSC, ynone, Pm, opBytes{0x31}},
1662	{ARSM, ynone, Pm, opBytes{0xaa}},
1663	{ASFENCE, ynone, Pm, opBytes{0xae, 0xf8}},
1664	{ASYSRET, ynone, Pm, opBytes{0x07}},
1665	{AWBINVD, ynone, Pm, opBytes{0x09}},
1666	{AWRMSR, ynone, Pm, opBytes{0x30}},
1667	{AWRPKRU, ynone, Pm, opBytes{0x01, 0xef, 0}},
1668	{AXADDB, yrb_mb, Pb, opBytes{0x0f, 0xc0}},
1669	{AXADDL, yrl_ml, Px, opBytes{0x0f, 0xc1}},
1670	{AXADDQ, yrl_ml, Pw, opBytes{0x0f, 0xc1}},
1671	{AXADDW, yrl_ml, Pe, opBytes{0x0f, 0xc1}},
1672	{ACRC32B, ycrc32b, Px, opBytes{0xf2, 0x0f, 0x38, 0xf0, 0}},
1673	{ACRC32L, ycrc32l, Px, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1674	{ACRC32Q, ycrc32l, Pw, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1675	{ACRC32W, ycrc32l, Pe, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1676	{APREFETCHT0, yprefetch, Pm, opBytes{0x18, 01}},
1677	{APREFETCHT1, yprefetch, Pm, opBytes{0x18, 02}},
1678	{APREFETCHT2, yprefetch, Pm, opBytes{0x18, 03}},
1679	{APREFETCHNTA, yprefetch, Pm, opBytes{0x18, 00}},
1680	{AMOVQL, yrl_ml, Px, opBytes{0x89}},
1681	{obj.AUNDEF, ynone, Px, opBytes{0x0f, 0x0b}},
1682	{AAESENC, yaes, Pq, opBytes{0x38, 0xdc, 0}},
1683	{AAESENCLAST, yaes, Pq, opBytes{0x38, 0xdd, 0}},
1684	{AAESDEC, yaes, Pq, opBytes{0x38, 0xde, 0}},
1685	{AAESDECLAST, yaes, Pq, opBytes{0x38, 0xdf, 0}},
1686	{AAESIMC, yaes, Pq, opBytes{0x38, 0xdb, 0}},
1687	{AAESKEYGENASSIST, yxshuf, Pq, opBytes{0x3a, 0xdf, 0}},
1688	{AROUNDPD, yxshuf, Pq, opBytes{0x3a, 0x09, 0}},
1689	{AROUNDPS, yxshuf, Pq, opBytes{0x3a, 0x08, 0}},
1690	{AROUNDSD, yxshuf, Pq, opBytes{0x3a, 0x0b, 0}},
1691	{AROUNDSS, yxshuf, Pq, opBytes{0x3a, 0x0a, 0}},
1692	{APSHUFD, yxshuf, Pq, opBytes{0x70, 0}},
1693	{APCLMULQDQ, yxshuf, Pq, opBytes{0x3a, 0x44, 0}},
1694	{APCMPESTRI, yxshuf, Pq, opBytes{0x3a, 0x61, 0}},
1695	{APCMPESTRM, yxshuf, Pq, opBytes{0x3a, 0x60, 0}},
1696	{AMOVDDUP, yxm, Pf2, opBytes{0x12}},
1697	{AMOVSHDUP, yxm, Pf3, opBytes{0x16}},
1698	{AMOVSLDUP, yxm, Pf3, opBytes{0x12}},
1699	{ARDTSCP, ynone, Pm, opBytes{0x01, 0xf9, 0}},
1700	{ASTAC, ynone, Pm, opBytes{0x01, 0xcb, 0}},
1701	{AUD1, ynone, Pm, opBytes{0xb9, 0}},
1702	{AUD2, ynone, Pm, opBytes{0x0b, 0}},
1703	{AUMWAIT, ywrfsbase, Pf2, opBytes{0xae, 06}},
1704	{ASYSENTER, ynone, Px, opBytes{0x0f, 0x34, 0}},
1705	{ASYSENTER64, ynone, Pw, opBytes{0x0f, 0x34, 0}},
1706	{ASYSEXIT, ynone, Px, opBytes{0x0f, 0x35, 0}},
1707	{ASYSEXIT64, ynone, Pw, opBytes{0x0f, 0x35, 0}},
1708	{ALMSW, ydivl, Pm, opBytes{0x01, 06}},
1709	{ALLDT, ydivl, Pm, opBytes{0x00, 02}},
1710	{ALIDT, ysvrs_mo, Pm, opBytes{0x01, 03}},
1711	{ALGDT, ysvrs_mo, Pm, opBytes{0x01, 02}},
1712	{ATZCNTW, ycrc32l, Pe, opBytes{0xf3, 0x0f, 0xbc, 0}},
1713	{ATZCNTL, ycrc32l, Px, opBytes{0xf3, 0x0f, 0xbc, 0}},
1714	{ATZCNTQ, ycrc32l, Pw, opBytes{0xf3, 0x0f, 0xbc, 0}},
1715	{AXRSTOR, ydivl, Px, opBytes{0x0f, 0xae, 05}},
1716	{AXRSTOR64, ydivl, Pw, opBytes{0x0f, 0xae, 05}},
1717	{AXRSTORS, ydivl, Px, opBytes{0x0f, 0xc7, 03}},
1718	{AXRSTORS64, ydivl, Pw, opBytes{0x0f, 0xc7, 03}},
1719	{AXSAVE, yclflush, Px, opBytes{0x0f, 0xae, 04}},
1720	{AXSAVE64, yclflush, Pw, opBytes{0x0f, 0xae, 04}},
1721	{AXSAVEOPT, yclflush, Px, opBytes{0x0f, 0xae, 06}},
1722	{AXSAVEOPT64, yclflush, Pw, opBytes{0x0f, 0xae, 06}},
1723	{AXSAVEC, yclflush, Px, opBytes{0x0f, 0xc7, 04}},
1724	{AXSAVEC64, yclflush, Pw, opBytes{0x0f, 0xc7, 04}},
1725	{AXSAVES, yclflush, Px, opBytes{0x0f, 0xc7, 05}},
1726	{AXSAVES64, yclflush, Pw, opBytes{0x0f, 0xc7, 05}},
1727	{ASGDT, yclflush, Pm, opBytes{0x01, 00}},
1728	{ASIDT, yclflush, Pm, opBytes{0x01, 01}},
1729	{ARDRANDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 06}},
1730	{ARDRANDL, yrdrand, Px, opBytes{0x0f, 0xc7, 06}},
1731	{ARDRANDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 06}},
1732	{ARDSEEDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 07}},
1733	{ARDSEEDL, yrdrand, Px, opBytes{0x0f, 0xc7, 07}},
1734	{ARDSEEDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 07}},
1735	{ASTRW, yincq, Pe, opBytes{0x0f, 0x00, 01}},
1736	{ASTRL, yincq, Px, opBytes{0x0f, 0x00, 01}},
1737	{ASTRQ, yincq, Pw, opBytes{0x0f, 0x00, 01}},
1738	{AXSETBV, ynone, Pm, opBytes{0x01, 0xd1, 0}},
1739	{AMOVBEW, ymovbe, Pq, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
1740	{AMOVBEL, ymovbe, Pm, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
1741	{AMOVBEQ, ymovbe, Pw, opBytes{0x0f, 0x38, 0xf0, 0, 0x0f, 0x38, 0xf1, 0}},
1742	{ANOPW, ydivl, Pe, opBytes{0x0f, 0x1f, 00}},
1743	{ANOPL, ydivl, Px, opBytes{0x0f, 0x1f, 00}},
1744	{ASLDTW, yincq, Pe, opBytes{0x0f, 0x00, 00}},
1745	{ASLDTL, yincq, Px, opBytes{0x0f, 0x00, 00}},
1746	{ASLDTQ, yincq, Pw, opBytes{0x0f, 0x00, 00}},
1747	{ASMSWW, yincq, Pe, opBytes{0x0f, 0x01, 04}},
1748	{ASMSWL, yincq, Px, opBytes{0x0f, 0x01, 04}},
1749	{ASMSWQ, yincq, Pw, opBytes{0x0f, 0x01, 04}},
1750	{ABLENDVPS, yblendvpd, Pq4, opBytes{0x14}},
1751	{ABLENDVPD, yblendvpd, Pq4, opBytes{0x15}},
1752	{APBLENDVB, yblendvpd, Pq4, opBytes{0x10}},
1753	{ASHA1MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xc9, 0}},
1754	{ASHA1MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xca, 0}},
1755	{ASHA1NEXTE, yaes, Px, opBytes{0x0f, 0x38, 0xc8, 0}},
1756	{ASHA256MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xcc, 0}},
1757	{ASHA256MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xcd, 0}},
1758	{ASHA1RNDS4, ysha1rnds4, Pm, opBytes{0x3a, 0xcc, 0}},
1759	{ASHA256RNDS2, ysha256rnds2, Px, opBytes{0x0f, 0x38, 0xcb, 0}},
1760	{ARDFSBASEL, yrdrand, Pf3, opBytes{0xae, 00}},
1761	{ARDFSBASEQ, yrdrand, Pfw, opBytes{0xae, 00}},
1762	{ARDGSBASEL, yrdrand, Pf3, opBytes{0xae, 01}},
1763	{ARDGSBASEQ, yrdrand, Pfw, opBytes{0xae, 01}},
1764	{AWRFSBASEL, ywrfsbase, Pf3, opBytes{0xae, 02}},
1765	{AWRFSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 02}},
1766	{AWRGSBASEL, ywrfsbase, Pf3, opBytes{0xae, 03}},
1767	{AWRGSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 03}},
1768	{ALFSW, ym_rl, Pe, opBytes{0x0f, 0xb4}},
1769	{ALFSL, ym_rl, Px, opBytes{0x0f, 0xb4}},
1770	{ALFSQ, ym_rl, Pw, opBytes{0x0f, 0xb4}},
1771	{ALGSW, ym_rl, Pe, opBytes{0x0f, 0xb5}},
1772	{ALGSL, ym_rl, Px, opBytes{0x0f, 0xb5}},
1773	{ALGSQ, ym_rl, Pw, opBytes{0x0f, 0xb5}},
1774	{ALSSW, ym_rl, Pe, opBytes{0x0f, 0xb2}},
1775	{ALSSL, ym_rl, Px, opBytes{0x0f, 0xb2}},
1776	{ALSSQ, ym_rl, Pw, opBytes{0x0f, 0xb2}},
1777	{ARDPID, yrdrand, Pf3, opBytes{0xc7, 07}},
1778
1779	{ABLENDPD, yxshuf, Pq, opBytes{0x3a, 0x0d, 0}},
1780	{ABLENDPS, yxshuf, Pq, opBytes{0x3a, 0x0c, 0}},
1781	{AXACQUIRE, ynone, Px, opBytes{0xf2}},
1782	{AXRELEASE, ynone, Px, opBytes{0xf3}},
1783	{AXBEGIN, yxbegin, Px, opBytes{0xc7, 0xf8}},
1784	{AXABORT, yxabort, Px, opBytes{0xc6, 0xf8}},
1785	{AXEND, ynone, Px, opBytes{0x0f, 01, 0xd5}},
1786	{AXTEST, ynone, Px, opBytes{0x0f, 01, 0xd6}},
1787	{AXGETBV, ynone, Pm, opBytes{01, 0xd0}},
1788	{obj.AFUNCDATA, yfuncdata, Px, opBytes{0, 0}},
1789	{obj.APCDATA, ypcdata, Px, opBytes{0, 0}},
1790	{obj.ADUFFCOPY, yduff, Px, opBytes{0xe8}},
1791	{obj.ADUFFZERO, yduff, Px, opBytes{0xe8}},
1792
1793	{obj.AEND, nil, 0, opBytes{}},
1794	{0, nil, 0, opBytes{}},
1795}
1796
// opindex maps an opcode, masked with obj.AMask, to its entry in optab or
// avxOptab. It is populated by instinit.
var opindex [(ALAST + 1) & obj.AMask]*Optab
1798
1799// useAbs reports whether s describes a symbol that must avoid pc-relative addressing.
1800// This happens on systems like Solaris that call .so functions instead of system calls.
1801// It does not seem to be necessary for any other systems. This is probably working
1802// around a Solaris-specific bug that should be fixed differently, but we don't know
1803// what that bug is. And this does fix it.
1804func useAbs(ctxt *obj.Link, s *obj.LSym) bool {
1805	if ctxt.Headtype == objabi.Hsolaris {
1806		// All the Solaris dynamic imports from libc.so begin with "libc_".
1807		return strings.HasPrefix(s.Name, "libc_")
1808	}
1809	return ctxt.Arch.Family == sys.I386 && !ctxt.Flag_shared
1810}
1811
// nop holds single-instruction no-ops of various lengths: nop[i] is an
// (i+1)-byte NOP stored in the leading bytes of its 16-byte array.
// The encodings were constructed by hand and disassembled with gdb to verify.
// See http://www.agner.org/optimize/optimizing_assembly.pdf for discussion.
var nop = [][16]uint8{
	{0x90},
	{0x66, 0x90},
	{0x0F, 0x1F, 0x00},
	{0x0F, 0x1F, 0x40, 0x00},
	{0x0F, 0x1F, 0x44, 0x00, 0x00},
	{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
	{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	// Left out: Native Client rejects the repeated 0x66 prefix.
	// {0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}

// fillnop overwrites the first n bytes of p with NOP instructions taken from
// the nop table, always emitting the longest available encoding that still
// fits in what is left. p must be at least n bytes long; n <= 0 writes nothing.
func fillnop(p []byte, n int) {
	for remaining := n; remaining > 0; {
		size := remaining
		if size > len(nop) {
			size = len(nop)
		}
		copy(p[:size], nop[size-1][:size])
		p = p[size:]
		remaining -= size
	}
}
1842
1843func noppad(ctxt *obj.Link, s *obj.LSym, c int32, pad int32) int32 {
1844	s.Grow(int64(c) + int64(pad))
1845	fillnop(s.P[c:], int(pad))
1846	return c + pad
1847}
1848
1849func spadjop(ctxt *obj.Link, l, q obj.As) obj.As {
1850	if ctxt.Arch.Family != sys.AMD64 || ctxt.Arch.PtrSize == 4 {
1851		return l
1852	}
1853	return q
1854}
1855
1856// isJump returns whether p is a jump instruction.
1857// It is used to ensure that no standalone or macro-fused jump will straddle
1858// or end on a 32 byte boundary by inserting NOPs before the jumps.
1859func isJump(p *obj.Prog) bool {
1860	return p.To.Target() != nil || p.As == obj.AJMP || p.As == obj.ACALL ||
1861		p.As == obj.ARET || p.As == obj.ADUFFCOPY || p.As == obj.ADUFFZERO
1862}
1863
1864// lookForJCC returns the first real instruction starting from p, if that instruction is a conditional
1865// jump. Otherwise, nil is returned.
1866func lookForJCC(p *obj.Prog) *obj.Prog {
1867	// Skip any PCDATA, FUNCDATA or NOP instructions
1868	var q *obj.Prog
1869	for q = p.Link; q != nil && (q.As == obj.APCDATA || q.As == obj.AFUNCDATA || q.As == obj.ANOP); q = q.Link {
1870	}
1871
1872	if q == nil || q.To.Target() == nil || p.As == obj.AJMP || p.As == obj.ACALL {
1873		return nil
1874	}
1875
1876	switch q.As {
1877	case AJOS, AJOC, AJCS, AJCC, AJEQ, AJNE, AJLS, AJHI,
1878		AJMI, AJPL, AJPS, AJPC, AJLT, AJGE, AJLE, AJGT:
1879	default:
1880		return nil
1881	}
1882
1883	return q
1884}
1885
1886// fusedJump determines whether p can be fused with a subsequent conditional jump instruction.
1887// If it can, we return true followed by the total size of the fused jump. If it can't, we return false.
1888// Macro fusion rules are derived from the Intel Optimization Manual (April 2019) section 3.4.2.2.
1889func fusedJump(p *obj.Prog) (bool, uint8) {
1890	var fusedSize uint8
1891
1892	// The first instruction in a macro fused pair may be preceded by the LOCK prefix,
1893	// or possibly an XACQUIRE/XRELEASE prefix followed by a LOCK prefix. If it is, we
1894	// need to be careful to insert any padding before the locks rather than directly after them.
1895
1896	if p.As == AXRELEASE || p.As == AXACQUIRE {
1897		fusedSize += p.Isize
1898		for p = p.Link; p != nil && (p.As == obj.APCDATA || p.As == obj.AFUNCDATA); p = p.Link {
1899		}
1900		if p == nil {
1901			return false, 0
1902		}
1903	}
1904	if p.As == ALOCK {
1905		fusedSize += p.Isize
1906		for p = p.Link; p != nil && (p.As == obj.APCDATA || p.As == obj.AFUNCDATA); p = p.Link {
1907		}
1908		if p == nil {
1909			return false, 0
1910		}
1911	}
1912	cmp := p.As == ACMPB || p.As == ACMPL || p.As == ACMPQ || p.As == ACMPW
1913
1914	cmpAddSub := p.As == AADDB || p.As == AADDL || p.As == AADDW || p.As == AADDQ ||
1915		p.As == ASUBB || p.As == ASUBL || p.As == ASUBW || p.As == ASUBQ || cmp
1916
1917	testAnd := p.As == ATESTB || p.As == ATESTL || p.As == ATESTQ || p.As == ATESTW ||
1918		p.As == AANDB || p.As == AANDL || p.As == AANDQ || p.As == AANDW
1919
1920	incDec := p.As == AINCB || p.As == AINCL || p.As == AINCQ || p.As == AINCW ||
1921		p.As == ADECB || p.As == ADECL || p.As == ADECQ || p.As == ADECW
1922
1923	if !cmpAddSub && !testAnd && !incDec {
1924		return false, 0
1925	}
1926
1927	if !incDec {
1928		var argOne obj.AddrType
1929		var argTwo obj.AddrType
1930		if cmp {
1931			argOne = p.From.Type
1932			argTwo = p.To.Type
1933		} else {
1934			argOne = p.To.Type
1935			argTwo = p.From.Type
1936		}
1937		if argOne == obj.TYPE_REG {
1938			if argTwo != obj.TYPE_REG && argTwo != obj.TYPE_CONST && argTwo != obj.TYPE_MEM {
1939				return false, 0
1940			}
1941		} else if argOne == obj.TYPE_MEM {
1942			if argTwo != obj.TYPE_REG {
1943				return false, 0
1944			}
1945		} else {
1946			return false, 0
1947		}
1948	}
1949
1950	fusedSize += p.Isize
1951	jmp := lookForJCC(p)
1952	if jmp == nil {
1953		return false, 0
1954	}
1955
1956	fusedSize += jmp.Isize
1957
1958	if testAnd {
1959		return true, fusedSize
1960	}
1961
1962	if jmp.As == AJOC || jmp.As == AJOS || jmp.As == AJMI ||
1963		jmp.As == AJPL || jmp.As == AJPS || jmp.As == AJPC {
1964		return false, 0
1965	}
1966
1967	if cmpAddSub {
1968		return true, fusedSize
1969	}
1970
1971	if jmp.As == AJCS || jmp.As == AJCC || jmp.As == AJHI || jmp.As == AJLS {
1972		return false, 0
1973	}
1974
1975	return true, fusedSize
1976}
1977
// padJumpsCtx is the byte boundary that jumps must not cross or end on.
// makePjcCtx produces either 32 or 0; a value of 0 disables jump padding.
type padJumpsCtx int32
1979
1980func makePjcCtx(ctxt *obj.Link) padJumpsCtx {
1981	// Disable jump padding on 32 bit builds by setting
1982	// padJumps to 0.
1983	if ctxt.Arch.Family == sys.I386 {
1984		return padJumpsCtx(0)
1985	}
1986
1987	// Disable jump padding for hand written assembly code.
1988	if ctxt.IsAsm {
1989		return padJumpsCtx(0)
1990	}
1991
1992	return padJumpsCtx(32)
1993}
1994
1995// padJump detects whether the instruction being assembled is a standalone or a macro-fused
1996// jump that needs to be padded. If it is, NOPs are inserted to ensure that the jump does
1997// not cross or end on a 32 byte boundary.
1998func (pjc padJumpsCtx) padJump(ctxt *obj.Link, s *obj.LSym, p *obj.Prog, c int32) int32 {
1999	if pjc == 0 {
2000		return c
2001	}
2002
2003	var toPad int32
2004	fj, fjSize := fusedJump(p)
2005	mask := int32(pjc - 1)
2006	if fj {
2007		if (c&mask)+int32(fjSize) >= int32(pjc) {
2008			toPad = int32(pjc) - (c & mask)
2009		}
2010	} else if isJump(p) {
2011		if (c&mask)+int32(p.Isize) >= int32(pjc) {
2012			toPad = int32(pjc) - (c & mask)
2013		}
2014	}
2015	if toPad <= 0 {
2016		return c
2017	}
2018
2019	return noppad(ctxt, s, c, toPad)
2020}
2021
2022// reAssemble is called if an instruction's size changes during assembly. If
2023// it does and the instruction is a standalone or a macro-fused jump we need to
2024// reassemble.
2025func (pjc padJumpsCtx) reAssemble(p *obj.Prog) bool {
2026	if pjc == 0 {
2027		return false
2028	}
2029
2030	fj, _ := fusedJump(p)
2031	return fj || isJump(p)
2032}
2033
// nopPad records a run of NOP padding emitted by span6 so that it can be
// spliced into the Prog list as an ANOP once assembly has settled.
type nopPad struct {
	p *obj.Prog // Instruction before the pad
	n int32     // Size of the pad
}
2038
2039// requireAlignment ensures that the function alignment is at
2040// least as high as a, which should be a power of two
2041// and between 8 and 2048, inclusive.
2042//
2043// the boolean result indicates whether the alignment meets those constraints
2044func requireAlignment(a int64, ctxt *obj.Link, cursym *obj.LSym) bool {
2045	if !((a&(a-1) == 0) && 8 <= a && a <= 2048) {
2046		ctxt.Diag("alignment value of an instruction must be a power of two and in the range [8, 2048], got %d\n", a)
2047		return false
2048	}
2049	// By default function alignment is 32 bytes for amd64
2050	if cursym.Func().Align < int32(a) {
2051		cursym.Func().Align = int32(a)
2052	}
2053	return true
2054}
2055
// span6 assembles the function symbol s. It encodes every Prog reachable
// from s.Func().Text into s.P, stores each instruction's byte offset in p.Pc
// and its encoded length in p.Isize, and sets s.Size to the total length.
// If s.P is already non-nil the function returns without doing anything.
//
// The work proceeds in stages:
//   - a rewrite pass that gives target-less branches themselves as target,
//     lowers AADJSP to an ADD/SUB on SP (or a NOP), and, when ctxt.Retpoline
//     is set, redirects register-indirect CALL/JMP to a runtime.retpoline symbol;
//   - a marking pass that counts instructions, starts every branch out as
//     short, and flags backward branches and loop heads;
//   - the assembly loop, which is repeated from scratch whenever a short
//     branch turns out to need a long encoding or a (fused) jump changed size;
//   - splicing the recorded padding into the Prog list as ANOPs;
//   - marking the TLS access sequence as an unsafe point where required;
//   - writing the jump table entries, now that byte offsets are known.
//
// newprog is only passed through to obj.MarkUnsafePoints.
func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
	if ctxt.Retpoline && ctxt.Arch.Family == sys.I386 {
		ctxt.Diag("-spectre=ret not supported on 386")
		ctxt.Retpoline = false // don't keep printing
	}

	pjc := makePjcCtx(ctxt)

	// Already assembled.
	if s.P != nil {
		return
	}

	if ycover[0] == 0 {
		ctxt.Diag("x86 tables not initialized, call x86.instinit first")
	}

	// Rewrite pass: normalize instructions before any sizes are computed.
	for p := s.Func().Text; p != nil; p = p.Link {
		// A branch without a resolved target is made to target itself.
		if p.To.Type == obj.TYPE_BRANCH && p.To.Target() == nil {
			p.To.SetTarget(p)
		}
		if p.As == AADJSP {
			p.To.Type = obj.TYPE_REG
			p.To.Reg = REG_SP
			// Generate 'ADDQ $x, SP' or 'SUBQ $x, SP', with x positive.
			// One exception: It is smaller to encode $-0x80 than $0x80.
			// For that case, flip the sign and the op:
			// Instead of 'ADDQ $0x80, SP', generate 'SUBQ $-0x80, SP'.
			switch v := p.From.Offset; {
			case v == 0:
				p.As = obj.ANOP
			case v == 0x80 || (v < 0 && v != -0x80):
				p.As = spadjop(ctxt, AADDL, AADDQ)
				p.From.Offset *= -1
			default:
				p.As = spadjop(ctxt, ASUBL, ASUBQ)
			}
		}
		// With retpolines enabled, an indirect CALL/JMP through a register is
		// turned into a direct branch to runtime.retpoline<reg>. The memory
		// operand form cannot be converted and is reported instead.
		if ctxt.Retpoline && (p.As == obj.ACALL || p.As == obj.AJMP) && (p.To.Type == obj.TYPE_REG || p.To.Type == obj.TYPE_MEM) {
			if p.To.Type != obj.TYPE_REG {
				ctxt.Diag("non-retpoline-compatible: %v", p)
				continue
			}
			p.To.Type = obj.TYPE_BRANCH
			p.To.Name = obj.NAME_EXTERN
			p.To.Sym = ctxt.Lookup("runtime.retpoline" + obj.Rconv(int(p.To.Reg)))
			p.To.Reg = 0
			p.To.Offset = 0
		}
	}

	// Marking pass. A branch whose target already carries branchShort
	// (presumably because the target was visited earlier in this walk, or is
	// p itself) is flagged as backwards and its target as a loop head.
	var count int64 // rough count of number of instructions
	for p := s.Func().Text; p != nil; p = p.Link {
		count++
		p.Back = branchShort // use short branches first time through
		if q := p.To.Target(); q != nil && (q.Back&branchShort != 0) {
			p.Back |= branchBackwards
			q.Back |= branchLoopHead
		}
	}
	s.GrowCap(count * 5) // preallocate roughly 5 bytes per instruction

	var ab AsmBuf
	var n int   // number of assembly passes made so far
	var c int32 // current offset into s.P
	errors := ctxt.Errors
	var nops []nopPad // Padding for a particular assembly (reuse slice storage if multiple assemblies)
	nrelocs0 := len(s.R)
	for {
		// This loop continues while there are reasons to re-assemble
		// whole block, like the presence of long forward jumps.
		reAssemble := false
		// Discard the relocations produced by the previous pass.
		for i := range s.R[nrelocs0:] {
			s.R[nrelocs0+i] = obj.Reloc{}
		}
		s.R = s.R[:nrelocs0] // preserve marker relocations generated by the compiler
		s.P = s.P[:0]
		c = 0
		var pPrev *obj.Prog
		nops = nops[:0]
		for p := s.Func().Text; p != nil; p = p.Link {
			// c0 is the offset before any padding for p is inserted.
			c0 := c
			c = pjc.padJump(ctxt, s, p, c)

			// Alignment directives emit only NOP padding, no instruction bytes.
			if p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX {
				v := obj.AlignmentPadding(c, p, ctxt, s)
				if v > 0 {
					s.Grow(int64(c) + int64(v))
					fillnop(s.P[c:], int(v))
				}
				p.Pc = int64(c)
				c += int32(v)
				pPrev = p
				continue

			}

			// Align loop heads to loopAlign, but only if that costs at most
			// maxLoopPad bytes.
			if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 {
				// pad with NOPs
				v := -c & (loopAlign - 1)

				if v <= maxLoopPad {
					s.Grow(int64(c) + int64(v))
					fillnop(s.P[c:], int(v))
					c += v
				}
			}

			p.Pc = int64(c)

			// process forward jumps to p
			for q := p.Rel; q != nil; q = q.Forwd {
				// v is the displacement from the end of q to p.
				v := int32(p.Pc - (q.Pc + int64(q.Isize)))
				if q.Back&branchShort != 0 {
					// The displacement does not fit in a short branch: switch
					// q to the long form and schedule another pass.
					if v > 127 {
						reAssemble = true
						q.Back ^= branchShort
					}

					// Patch the 8-bit displacement; it sits one byte later in
					// the encodings of JCXZL and XBEGIN.
					if q.As == AJCXZL || q.As == AXBEGIN {
						s.P[q.Pc+2] = byte(v)
					} else {
						s.P[q.Pc+1] = byte(v)
					}
				} else {
					// Long form: the 32-bit displacement is the last 4 bytes of q.
					binary.LittleEndian.PutUint32(s.P[q.Pc+int64(q.Isize)-4:], uint32(v))
				}
			}

			p.Rel = nil

			p.Pc = int64(c)
			ab.asmins(ctxt, s, p)
			m := ab.Len()
			if int(p.Isize) != m {
				p.Isize = uint8(m)
				if pjc.reAssemble(p) {
					// We need to re-assemble here to check for jumps and fused jumps
					// that span or end on 32 byte boundaries.
					reAssemble = true
				}
			}

			s.Grow(p.Pc + int64(m))
			copy(s.P[p.Pc:], ab.Bytes())
			// If there was padding, remember it.
			if pPrev != nil && !ctxt.IsAsm && c > c0 {
				nops = append(nops, nopPad{p: pPrev, n: c - c0})
			}
			c += int32(m)
			pPrev = p
		}

		// Guard against a pass count that never converges.
		n++
		if n > 1000 {
			ctxt.Diag("span must be looping")
			log.Fatalf("loop")
		}
		if !reAssemble {
			break
		}
		// Do not keep re-assembling once errors have been reported.
		if ctxt.Errors > errors {
			return
		}
	}
	// splice padding nops into Progs
	for _, n := range nops {
		pp := n.p
		np := &obj.Prog{Link: pp.Link, Ctxt: pp.Ctxt, As: obj.ANOP, Pos: pp.Pos.WithNotStmt(), Pc: pp.Pc + int64(pp.Isize), Isize: uint8(n.n)}
		pp.Link = np
	}

	s.Size = int64(c)

	// Disabled debugging dump of the assembled bytes and relocations.
	if false { /* debug['a'] > 1 */
		fmt.Printf("span1 %s %d (%d tries)\n %.6x", s.Name, s.Size, n, 0)
		var i int
		for i = 0; i < len(s.P); i++ {
			fmt.Printf(" %.2x", s.P[i])
			if i%16 == 15 {
				fmt.Printf("\n  %.6x", uint(i+1))
			}
		}

		if i%16 != 0 {
			fmt.Printf("\n")
		}

		for i := 0; i < len(s.R); i++ {
			r := &s.R[i]
			fmt.Printf(" rel %#.4x/%d %s%+d\n", uint32(r.Off), r.Siz, r.Sym.Name, r.Add)
		}
	}

	// Mark nonpreemptible instruction sequences.
	// The 2-instruction TLS access sequence
	//	MOVQ TLS, BX
	//	MOVQ 0(BX)(TLS*1), BX
	// is not async preemptible, as if it is preempted and resumed on
	// a different thread, the TLS address may become invalid.
	if !CanUse1InsnTLS(ctxt) {
		useTLS := func(p *obj.Prog) bool {
			// Only need to mark the second instruction, which has
			// REG_TLS as Index. (It is okay to interrupt and restart
			// the first instruction.)
			return p.From.Index == REG_TLS
		}
		obj.MarkUnsafePoints(ctxt, s.Func().Text, newprog, useTLS, nil)
	}

	// Now that we know byte offsets, we can generate jump table entries.
	// Each entry is an 8-byte address written at offset i*8 of the table symbol.
	// TODO: could this live in obj instead of obj/$ARCH?
	for _, jt := range s.Func().JumpTables {
		for i, p := range jt.Targets {
			// The ith jumptable entry points to the p.Pc'th
			// byte in the function symbol s.
			jt.Sym.WriteAddr(ctxt, int64(i)*8, 8, s, p.Pc)
		}
	}
}
2275
// instinit initializes the package-level tables used by the assembler:
// opindex (opcode to Optab entry), ycover (operand class coverage), and
// reg/regrex (per-register encoding data). When the target is Plan 9 it also
// looks up the "_privates" symbol into plan9privates.
//
// It is safe to call more than once: if ycover[0] is already non-zero the
// tables are taken to be initialized and the call returns immediately.
func instinit(ctxt *obj.Link) {
	if ycover[0] != 0 {
		// Already initialized; stop now.
		// This happens in the cmd/asm tests,
		// each of which re-initializes the arch.
		return
	}

	switch ctxt.Headtype {
	case objabi.Hplan9:
		plan9privates = ctxt.Lookup("_privates")
	}

	// Index both opcode tables by opcode. An opcode that already has an entry
	// is reported as a phase error.
	for i := range avxOptab {
		c := avxOptab[i].as
		if opindex[c&obj.AMask] != nil {
			ctxt.Diag("phase error in avxOptab: %d (%v)", i, c)
		}
		opindex[c&obj.AMask] = &avxOptab[i]
	}
	// optab is scanned from index 1 up to its zero-opcode terminator entry.
	for i := 1; optab[i].as != 0; i++ {
		c := optab[i].as
		if opindex[c&obj.AMask] != nil {
			ctxt.Diag("phase error in optab: %d (%v)", i, c)
		}
		opindex[c&obj.AMask] = &optab[i]
	}

	// ycover is a flattened Ymax x Ymax matrix. Setting ycover[a*Ymax+b]
	// presumably records that an operand of class a is acceptable where
	// class b is asked for; NOTE(review): confirm against the code that reads it.
	// Every class covers itself.
	for i := 0; i < Ymax; i++ {
		ycover[i*Ymax+i] = 1
	}

	// Immediate classes.
	ycover[Yi0*Ymax+Yu2] = 1
	ycover[Yi1*Ymax+Yu2] = 1

	ycover[Yi0*Ymax+Yi8] = 1
	ycover[Yi1*Ymax+Yi8] = 1
	ycover[Yu2*Ymax+Yi8] = 1
	ycover[Yu7*Ymax+Yi8] = 1

	ycover[Yi0*Ymax+Yu7] = 1
	ycover[Yi1*Ymax+Yu7] = 1
	ycover[Yu2*Ymax+Yu7] = 1

	ycover[Yi0*Ymax+Yu8] = 1
	ycover[Yi1*Ymax+Yu8] = 1
	ycover[Yu2*Ymax+Yu8] = 1
	ycover[Yu7*Ymax+Yu8] = 1

	ycover[Yi0*Ymax+Ys32] = 1
	ycover[Yi1*Ymax+Ys32] = 1
	ycover[Yu2*Ymax+Ys32] = 1
	ycover[Yu7*Ymax+Ys32] = 1
	ycover[Yu8*Ymax+Ys32] = 1
	ycover[Yi8*Ymax+Ys32] = 1

	ycover[Yi0*Ymax+Yi32] = 1
	ycover[Yi1*Ymax+Yi32] = 1
	ycover[Yu2*Ymax+Yi32] = 1
	ycover[Yu7*Ymax+Yi32] = 1
	ycover[Yu8*Ymax+Yi32] = 1
	ycover[Yi8*Ymax+Yi32] = 1
	ycover[Ys32*Ymax+Yi32] = 1

	ycover[Yi0*Ymax+Yi64] = 1
	ycover[Yi1*Ymax+Yi64] = 1
	ycover[Yu7*Ymax+Yi64] = 1
	ycover[Yu2*Ymax+Yi64] = 1
	ycover[Yu8*Ymax+Yi64] = 1
	ycover[Yi8*Ymax+Yi64] = 1
	ycover[Ys32*Ymax+Yi64] = 1
	ycover[Yi32*Ymax+Yi64] = 1

	// General-purpose register classes.
	ycover[Yal*Ymax+Yrb] = 1
	ycover[Ycl*Ymax+Yrb] = 1
	ycover[Yax*Ymax+Yrb] = 1
	ycover[Ycx*Ymax+Yrb] = 1
	ycover[Yrx*Ymax+Yrb] = 1
	ycover[Yrl*Ymax+Yrb] = 1 // but not Yrl32

	ycover[Ycl*Ymax+Ycx] = 1

	ycover[Yax*Ymax+Yrx] = 1
	ycover[Ycx*Ymax+Yrx] = 1

	ycover[Yax*Ymax+Yrl] = 1
	ycover[Ycx*Ymax+Yrl] = 1
	ycover[Yrx*Ymax+Yrl] = 1
	ycover[Yrl32*Ymax+Yrl] = 1

	ycover[Yf0*Ymax+Yrf] = 1

	// Register-or-memory classes.
	ycover[Yal*Ymax+Ymb] = 1
	ycover[Ycl*Ymax+Ymb] = 1
	ycover[Yax*Ymax+Ymb] = 1
	ycover[Ycx*Ymax+Ymb] = 1
	ycover[Yrx*Ymax+Ymb] = 1
	ycover[Yrb*Ymax+Ymb] = 1
	ycover[Yrl*Ymax+Ymb] = 1 // but not Yrl32
	ycover[Ym*Ymax+Ymb] = 1

	ycover[Yax*Ymax+Yml] = 1
	ycover[Ycx*Ymax+Yml] = 1
	ycover[Yrx*Ymax+Yml] = 1
	ycover[Yrl*Ymax+Yml] = 1
	ycover[Yrl32*Ymax+Yml] = 1
	ycover[Ym*Ymax+Yml] = 1

	ycover[Yax*Ymax+Ymm] = 1
	ycover[Ycx*Ymax+Ymm] = 1
	ycover[Yrx*Ymax+Ymm] = 1
	ycover[Yrl*Ymax+Ymm] = 1
	ycover[Yrl32*Ymax+Ymm] = 1
	ycover[Ym*Ymax+Ymm] = 1
	ycover[Ymr*Ymax+Ymm] = 1

	// Vector register and vector register-or-memory classes.
	ycover[Yxr0*Ymax+Yxr] = 1

	ycover[Ym*Ymax+Yxm] = 1
	ycover[Yxr0*Ymax+Yxm] = 1
	ycover[Yxr*Ymax+Yxm] = 1

	ycover[Ym*Ymax+Yym] = 1
	ycover[Yyr*Ymax+Yym] = 1

	ycover[Yxr0*Ymax+YxrEvex] = 1
	ycover[Yxr*Ymax+YxrEvex] = 1

	ycover[Ym*Ymax+YxmEvex] = 1
	ycover[Yxr0*Ymax+YxmEvex] = 1
	ycover[Yxr*Ymax+YxmEvex] = 1
	ycover[YxrEvex*Ymax+YxmEvex] = 1

	ycover[Yyr*Ymax+YyrEvex] = 1

	ycover[Ym*Ymax+YymEvex] = 1
	ycover[Yyr*Ymax+YymEvex] = 1
	ycover[YyrEvex*Ymax+YymEvex] = 1

	ycover[Ym*Ymax+Yzm] = 1
	ycover[Yzr*Ymax+Yzm] = 1

	// Mask (K) register classes.
	ycover[Yk0*Ymax+Yk] = 1
	ycover[Yknot0*Ymax+Yk] = 1

	ycover[Yk0*Ymax+Ykm] = 1
	ycover[Yknot0*Ymax+Ykm] = 1
	ycover[Yk*Ymax+Ykm] = 1
	ycover[Ym*Ymax+Ykm] = 1

	// Vector-indexed memory classes.
	ycover[Yxvm*Ymax+YxvmEvex] = 1

	ycover[Yyvm*Ymax+YyvmEvex] = 1

	// Per-register data. reg[i] is -1 for anything not handled below and
	// otherwise the low three bits of the register's number within its
	// family; regrex[i] receives the extension bits assigned to the register.
	for i := 0; i < MAXREG; i++ {
		reg[i] = -1
		if i >= REG_AL && i <= REG_R15B {
			reg[i] = (i - REG_AL) & 7
			// NOTE(review): 0x40 looks like a bare REX prefix, which the
			// SPB..DIB byte registers need in order to be encoded - confirm.
			if i >= REG_SPB && i <= REG_DIB {
				regrex[i] = 0x40
			}
			if i >= REG_R8B && i <= REG_R15B {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}

		// AH..BH are numbered 4 and up.
		if i >= REG_AH && i <= REG_BH {
			reg[i] = 4 + ((i - REG_AH) & 7)
		}
		if i >= REG_AX && i <= REG_R15 {
			reg[i] = (i - REG_AX) & 7
			if i >= REG_R8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}

		if i >= REG_F0 && i <= REG_F0+7 {
			reg[i] = (i - REG_F0) & 7
		}
		if i >= REG_M0 && i <= REG_M0+7 {
			reg[i] = (i - REG_M0) & 7
		}
		if i >= REG_K0 && i <= REG_K0+7 {
			reg[i] = (i - REG_K0) & 7
		}
		// For the X, Y and Z files: registers 8-15 get Rxr|Rxx|Rxb, registers
		// 16-23 get RxrEvex, and registers 24-31 get both sets of bits.
		if i >= REG_X0 && i <= REG_X0+15 {
			reg[i] = (i - REG_X0) & 7
			if i >= REG_X0+8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_X16 && i <= REG_X16+15 {
			reg[i] = (i - REG_X16) & 7
			if i >= REG_X16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}
		if i >= REG_Y0 && i <= REG_Y0+15 {
			reg[i] = (i - REG_Y0) & 7
			if i >= REG_Y0+8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_Y16 && i <= REG_Y16+15 {
			reg[i] = (i - REG_Y16) & 7
			if i >= REG_Y16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}
		if i >= REG_Z0 && i <= REG_Z0+15 {
			reg[i] = (i - REG_Z0) & 7
			if i > REG_Z0+7 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_Z16 && i <= REG_Z16+15 {
			reg[i] = (i - REG_Z16) & 7
			if i >= REG_Z16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}

		// Only regrex is set here for REG_CR+8 .. REG_CR+15.
		if i >= REG_CR+8 && i <= REG_CR+15 {
			regrex[i] = Rxr
		}
	}
}
2509
// isAndroid records whether buildcfg.GOOS is "android". prefixof consults it
// when choosing the segment prefix for TLS references.
var isAndroid = buildcfg.GOOS == "android"
2511
2512func prefixof(ctxt *obj.Link, a *obj.Addr) int {
2513	if a.Reg < REG_CS && a.Index < REG_CS { // fast path
2514		return 0
2515	}
2516	if a.Type == obj.TYPE_MEM && a.Name == obj.NAME_NONE {
2517		switch a.Reg {
2518		case REG_CS:
2519			return 0x2e
2520
2521		case REG_DS:
2522			return 0x3e
2523
2524		case REG_ES:
2525			return 0x26
2526
2527		case REG_FS:
2528			return 0x64
2529
2530		case REG_GS:
2531			return 0x65
2532
2533		case REG_TLS:
2534			// NOTE: Systems listed here should be only systems that
2535			// support direct TLS references like 8(TLS) implemented as
2536			// direct references from FS or GS. Systems that require
2537			// the initial-exec model, where you load the TLS base into
2538			// a register and then index from that register, do not reach
2539			// this code and should not be listed.
2540			if ctxt.Arch.Family == sys.I386 {
2541				switch ctxt.Headtype {
2542				default:
2543					if isAndroid {
2544						return 0x65 // GS
2545					}
2546					log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)
2547
2548				case objabi.Hdarwin,
2549					objabi.Hdragonfly,
2550					objabi.Hfreebsd,
2551					objabi.Hnetbsd,
2552					objabi.Hopenbsd:
2553					return 0x65 // GS
2554				}
2555			}
2556
2557			switch ctxt.Headtype {
2558			default:
2559				log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)
2560
2561			case objabi.Hlinux:
2562				if isAndroid {
2563					return 0x64 // FS
2564				}
2565
2566				if ctxt.Flag_shared {
2567					log.Fatalf("unknown TLS base register for linux with -shared")
2568				} else {
2569					return 0x64 // FS
2570				}
2571
2572			case objabi.Hdragonfly,
2573				objabi.Hfreebsd,
2574				objabi.Hnetbsd,
2575				objabi.Hopenbsd,
2576				objabi.Hsolaris:
2577				return 0x64 // FS
2578
2579			case objabi.Hdarwin:
2580				return 0x65 // GS
2581			}
2582		}
2583	}
2584
2585	switch a.Index {
2586	case REG_CS:
2587		return 0x2e
2588
2589	case REG_DS:
2590		return 0x3e
2591
2592	case REG_ES:
2593		return 0x26
2594
2595	case REG_TLS:
2596		if ctxt.Flag_shared && ctxt.Headtype != objabi.Hwindows {
2597			// When building for inclusion into a shared library, an instruction of the form
2598			//     MOV off(CX)(TLS*1), AX
2599			// becomes
2600			//     mov %gs:off(%ecx), %eax // on i386
2601			//     mov %fs:off(%rcx), %rax // on amd64
2602			// which assumes that the correct TLS offset has been loaded into CX (today
2603			// there is only one TLS variable -- g -- so this is OK). When not building for
2604			// a shared library the instruction it becomes
2605			//     mov 0x0(%ecx), %eax // on i386
2606			//     mov 0x0(%rcx), %rax // on amd64
2607			// and a R_TLS_LE relocation, and so does not require a prefix.
2608			if ctxt.Arch.Family == sys.I386 {
2609				return 0x65 // GS
2610			}
2611			return 0x64 // FS
2612		}
2613
2614	case REG_FS:
2615		return 0x64
2616
2617	case REG_GS:
2618		return 0x65
2619	}
2620
2621	return 0
2622}
2623
2624// oclassRegList returns multisource operand class for addr.
2625func oclassRegList(ctxt *obj.Link, addr *obj.Addr) int {
2626	// TODO(quasilyte): when oclass register case is refactored into
2627	// lookup table, use it here to get register kind more easily.
2628	// Helper functions like regIsXmm should go away too (they will become redundant).
2629
2630	regIsXmm := func(r int) bool { return r >= REG_X0 && r <= REG_X31 }
2631	regIsYmm := func(r int) bool { return r >= REG_Y0 && r <= REG_Y31 }
2632	regIsZmm := func(r int) bool { return r >= REG_Z0 && r <= REG_Z31 }
2633
2634	reg0, reg1 := decodeRegisterRange(addr.Offset)
2635	low := regIndex(int16(reg0))
2636	high := regIndex(int16(reg1))
2637
2638	if ctxt.Arch.Family == sys.I386 {
2639		if low >= 8 || high >= 8 {
2640			return Yxxx
2641		}
2642	}
2643
2644	switch high - low {
2645	case 3:
2646		switch {
2647		case regIsXmm(reg0) && regIsXmm(reg1):
2648			return YxrEvexMulti4
2649		case regIsYmm(reg0) && regIsYmm(reg1):
2650			return YyrEvexMulti4
2651		case regIsZmm(reg0) && regIsZmm(reg1):
2652			return YzrMulti4
2653		default:
2654			return Yxxx
2655		}
2656	default:
2657		return Yxxx
2658	}
2659}
2660
2661// oclassVMem returns V-mem (vector memory with VSIB) operand class.
2662// For addr that is not V-mem returns (Yxxx, false).
2663func oclassVMem(ctxt *obj.Link, addr *obj.Addr) (int, bool) {
2664	switch addr.Index {
2665	case REG_X0 + 0,
2666		REG_X0 + 1,
2667		REG_X0 + 2,
2668		REG_X0 + 3,
2669		REG_X0 + 4,
2670		REG_X0 + 5,
2671		REG_X0 + 6,
2672		REG_X0 + 7:
2673		return Yxvm, true
2674	case REG_X8 + 0,
2675		REG_X8 + 1,
2676		REG_X8 + 2,
2677		REG_X8 + 3,
2678		REG_X8 + 4,
2679		REG_X8 + 5,
2680		REG_X8 + 6,
2681		REG_X8 + 7:
2682		if ctxt.Arch.Family == sys.I386 {
2683			return Yxxx, true
2684		}
2685		return Yxvm, true
2686	case REG_X16 + 0,
2687		REG_X16 + 1,
2688		REG_X16 + 2,
2689		REG_X16 + 3,
2690		REG_X16 + 4,
2691		REG_X16 + 5,
2692		REG_X16 + 6,
2693		REG_X16 + 7,
2694		REG_X16 + 8,
2695		REG_X16 + 9,
2696		REG_X16 + 10,
2697		REG_X16 + 11,
2698		REG_X16 + 12,
2699		REG_X16 + 13,
2700		REG_X16 + 14,
2701		REG_X16 + 15:
2702		if ctxt.Arch.Family == sys.I386 {
2703			return Yxxx, true
2704		}
2705		return YxvmEvex, true
2706
2707	case REG_Y0 + 0,
2708		REG_Y0 + 1,
2709		REG_Y0 + 2,
2710		REG_Y0 + 3,
2711		REG_Y0 + 4,
2712		REG_Y0 + 5,
2713		REG_Y0 + 6,
2714		REG_Y0 + 7:
2715		return Yyvm, true
2716	case REG_Y8 + 0,
2717		REG_Y8 + 1,
2718		REG_Y8 + 2,
2719		REG_Y8 + 3,
2720		REG_Y8 + 4,
2721		REG_Y8 + 5,
2722		REG_Y8 + 6,
2723		REG_Y8 + 7:
2724		if ctxt.Arch.Family == sys.I386 {
2725			return Yxxx, true
2726		}
2727		return Yyvm, true
2728	case REG_Y16 + 0,
2729		REG_Y16 + 1,
2730		REG_Y16 + 2,
2731		REG_Y16 + 3,
2732		REG_Y16 + 4,
2733		REG_Y16 + 5,
2734		REG_Y16 + 6,
2735		REG_Y16 + 7,
2736		REG_Y16 + 8,
2737		REG_Y16 + 9,
2738		REG_Y16 + 10,
2739		REG_Y16 + 11,
2740		REG_Y16 + 12,
2741		REG_Y16 + 13,
2742		REG_Y16 + 14,
2743		REG_Y16 + 15:
2744		if ctxt.Arch.Family == sys.I386 {
2745			return Yxxx, true
2746		}
2747		return YyvmEvex, true
2748
2749	case REG_Z0 + 0,
2750		REG_Z0 + 1,
2751		REG_Z0 + 2,
2752		REG_Z0 + 3,
2753		REG_Z0 + 4,
2754		REG_Z0 + 5,
2755		REG_Z0 + 6,
2756		REG_Z0 + 7:
2757		return Yzvm, true
2758	case REG_Z8 + 0,
2759		REG_Z8 + 1,
2760		REG_Z8 + 2,
2761		REG_Z8 + 3,
2762		REG_Z8 + 4,
2763		REG_Z8 + 5,
2764		REG_Z8 + 6,
2765		REG_Z8 + 7,
2766		REG_Z8 + 8,
2767		REG_Z8 + 9,
2768		REG_Z8 + 10,
2769		REG_Z8 + 11,
2770		REG_Z8 + 12,
2771		REG_Z8 + 13,
2772		REG_Z8 + 14,
2773		REG_Z8 + 15,
2774		REG_Z8 + 16,
2775		REG_Z8 + 17,
2776		REG_Z8 + 18,
2777		REG_Z8 + 19,
2778		REG_Z8 + 20,
2779		REG_Z8 + 21,
2780		REG_Z8 + 22,
2781		REG_Z8 + 23:
2782		if ctxt.Arch.Family == sys.I386 {
2783			return Yxxx, true
2784		}
2785		return Yzvm, true
2786	}
2787
2788	return Yxxx, false
2789}
2790
2791func oclass(ctxt *obj.Link, p *obj.Prog, a *obj.Addr) int {
2792	switch a.Type {
2793	case obj.TYPE_REGLIST:
2794		return oclassRegList(ctxt, a)
2795
2796	case obj.TYPE_NONE:
2797		return Ynone
2798
2799	case obj.TYPE_BRANCH:
2800		return Ybr
2801
2802	case obj.TYPE_INDIR:
2803		if a.Name != obj.NAME_NONE && a.Reg == REG_NONE && a.Index == REG_NONE && a.Scale == 0 {
2804			return Yindir
2805		}
2806		return Yxxx
2807
2808	case obj.TYPE_MEM:
2809		// Pseudo registers have negative index, but SP is
2810		// not pseudo on x86, hence REG_SP check is not redundant.
2811		if a.Index == REG_SP || a.Index < 0 {
2812			// Can't use FP/SB/PC/SP as the index register.
2813			return Yxxx
2814		}
2815
2816		if vmem, ok := oclassVMem(ctxt, a); ok {
2817			return vmem
2818		}
2819
2820		if ctxt.Arch.Family == sys.AMD64 {
2821			switch a.Name {
2822			case obj.NAME_EXTERN, obj.NAME_STATIC, obj.NAME_GOTREF:
2823				// Global variables can't use index registers and their
2824				// base register is %rip (%rip is encoded as REG_NONE).
2825				if a.Reg != REG_NONE || a.Index != REG_NONE || a.Scale != 0 {
2826					return Yxxx
2827				}
2828			case obj.NAME_AUTO, obj.NAME_PARAM:
2829				// These names must have a base of SP.  The old compiler
2830				// uses 0 for the base register. SSA uses REG_SP.
2831				if a.Reg != REG_SP && a.Reg != 0 {
2832					return Yxxx
2833				}
2834			case obj.NAME_NONE:
2835				// everything is ok
2836			default:
2837				// unknown name
2838				return Yxxx
2839			}
2840		}
2841		return Ym
2842
2843	case obj.TYPE_ADDR:
2844		switch a.Name {
2845		case obj.NAME_GOTREF:
2846			ctxt.Diag("unexpected TYPE_ADDR with NAME_GOTREF")
2847			return Yxxx
2848
2849		case obj.NAME_EXTERN,
2850			obj.NAME_STATIC:
2851			if a.Sym != nil && useAbs(ctxt, a.Sym) {
2852				return Yi32
2853			}
2854			return Yiauto // use pc-relative addressing
2855
2856		case obj.NAME_AUTO,
2857			obj.NAME_PARAM:
2858			return Yiauto
2859		}
2860
2861		// TODO(rsc): DUFFZERO/DUFFCOPY encoding forgot to set a->index
2862		// and got Yi32 in an earlier version of this code.
2863		// Keep doing that until we fix yduff etc.
2864		if a.Sym != nil && strings.HasPrefix(a.Sym.Name, "runtime.duff") {
2865			return Yi32
2866		}
2867
2868		if a.Sym != nil || a.Name != obj.NAME_NONE {
2869			ctxt.Diag("unexpected addr: %v", obj.Dconv(p, a))
2870		}
2871		fallthrough
2872
2873	case obj.TYPE_CONST:
2874		if a.Sym != nil {
2875			ctxt.Diag("TYPE_CONST with symbol: %v", obj.Dconv(p, a))
2876		}
2877
2878		v := a.Offset
2879		if ctxt.Arch.Family == sys.I386 {
2880			v = int64(int32(v))
2881		}
2882		switch {
2883		case v == 0:
2884			return Yi0
2885		case v == 1:
2886			return Yi1
2887		case v >= 0 && v <= 3:
2888			return Yu2
2889		case v >= 0 && v <= 127:
2890			return Yu7
2891		case v >= 0 && v <= 255:
2892			return Yu8
2893		case v >= -128 && v <= 127:
2894			return Yi8
2895		}
2896		if ctxt.Arch.Family == sys.I386 {
2897			return Yi32
2898		}
2899		l := int32(v)
2900		if int64(l) == v {
2901			return Ys32 // can sign extend
2902		}
2903		if v>>32 == 0 {
2904			return Yi32 // unsigned
2905		}
2906		return Yi64
2907
2908	case obj.TYPE_TEXTSIZE:
2909		return Ytextsize
2910	}
2911
2912	if a.Type != obj.TYPE_REG {
2913		ctxt.Diag("unexpected addr1: type=%d %v", a.Type, obj.Dconv(p, a))
2914		return Yxxx
2915	}
2916
2917	switch a.Reg {
2918	case REG_AL:
2919		return Yal
2920
2921	case REG_AX:
2922		return Yax
2923
2924		/*
2925			case REG_SPB:
2926		*/
2927	case REG_BPB,
2928		REG_SIB,
2929		REG_DIB,
2930		REG_R8B,
2931		REG_R9B,
2932		REG_R10B,
2933		REG_R11B,
2934		REG_R12B,
2935		REG_R13B,
2936		REG_R14B,
2937		REG_R15B:
2938		if ctxt.Arch.Family == sys.I386 {
2939			return Yxxx
2940		}
2941		fallthrough
2942
2943	case REG_DL,
2944		REG_BL,
2945		REG_AH,
2946		REG_CH,
2947		REG_DH,
2948		REG_BH:
2949		return Yrb
2950
2951	case REG_CL:
2952		return Ycl
2953
2954	case REG_CX:
2955		return Ycx
2956
2957	case REG_DX, REG_BX:
2958		return Yrx
2959
2960	case REG_R8, // not really Yrl
2961		REG_R9,
2962		REG_R10,
2963		REG_R11,
2964		REG_R12,
2965		REG_R13,
2966		REG_R14,
2967		REG_R15:
2968		if ctxt.Arch.Family == sys.I386 {
2969			return Yxxx
2970		}
2971		fallthrough
2972
2973	case REG_SP, REG_BP, REG_SI, REG_DI:
2974		if ctxt.Arch.Family == sys.I386 {
2975			return Yrl32
2976		}
2977		return Yrl
2978
2979	case REG_F0 + 0:
2980		return Yf0
2981
2982	case REG_F0 + 1,
2983		REG_F0 + 2,
2984		REG_F0 + 3,
2985		REG_F0 + 4,
2986		REG_F0 + 5,
2987		REG_F0 + 6,
2988		REG_F0 + 7:
2989		return Yrf
2990
2991	case REG_M0 + 0,
2992		REG_M0 + 1,
2993		REG_M0 + 2,
2994		REG_M0 + 3,
2995		REG_M0 + 4,
2996		REG_M0 + 5,
2997		REG_M0 + 6,
2998		REG_M0 + 7:
2999		return Ymr
3000
3001	case REG_X0:
3002		return Yxr0
3003
3004	case REG_X0 + 1,
3005		REG_X0 + 2,
3006		REG_X0 + 3,
3007		REG_X0 + 4,
3008		REG_X0 + 5,
3009		REG_X0 + 6,
3010		REG_X0 + 7,
3011		REG_X0 + 8,
3012		REG_X0 + 9,
3013		REG_X0 + 10,
3014		REG_X0 + 11,
3015		REG_X0 + 12,
3016		REG_X0 + 13,
3017		REG_X0 + 14,
3018		REG_X0 + 15:
3019		return Yxr
3020
3021	case REG_X0 + 16,
3022		REG_X0 + 17,
3023		REG_X0 + 18,
3024		REG_X0 + 19,
3025		REG_X0 + 20,
3026		REG_X0 + 21,
3027		REG_X0 + 22,
3028		REG_X0 + 23,
3029		REG_X0 + 24,
3030		REG_X0 + 25,
3031		REG_X0 + 26,
3032		REG_X0 + 27,
3033		REG_X0 + 28,
3034		REG_X0 + 29,
3035		REG_X0 + 30,
3036		REG_X0 + 31:
3037		return YxrEvex
3038
3039	case REG_Y0 + 0,
3040		REG_Y0 + 1,
3041		REG_Y0 + 2,
3042		REG_Y0 + 3,
3043		REG_Y0 + 4,
3044		REG_Y0 + 5,
3045		REG_Y0 + 6,
3046		REG_Y0 + 7,
3047		REG_Y0 + 8,
3048		REG_Y0 + 9,
3049		REG_Y0 + 10,
3050		REG_Y0 + 11,
3051		REG_Y0 + 12,
3052		REG_Y0 + 13,
3053		REG_Y0 + 14,
3054		REG_Y0 + 15:
3055		return Yyr
3056
3057	case REG_Y0 + 16,
3058		REG_Y0 + 17,
3059		REG_Y0 + 18,
3060		REG_Y0 + 19,
3061		REG_Y0 + 20,
3062		REG_Y0 + 21,
3063		REG_Y0 + 22,
3064		REG_Y0 + 23,
3065		REG_Y0 + 24,
3066		REG_Y0 + 25,
3067		REG_Y0 + 26,
3068		REG_Y0 + 27,
3069		REG_Y0 + 28,
3070		REG_Y0 + 29,
3071		REG_Y0 + 30,
3072		REG_Y0 + 31:
3073		return YyrEvex
3074
3075	case REG_Z0 + 0,
3076		REG_Z0 + 1,
3077		REG_Z0 + 2,
3078		REG_Z0 + 3,
3079		REG_Z0 + 4,
3080		REG_Z0 + 5,
3081		REG_Z0 + 6,
3082		REG_Z0 + 7:
3083		return Yzr
3084
3085	case REG_Z0 + 8,
3086		REG_Z0 + 9,
3087		REG_Z0 + 10,
3088		REG_Z0 + 11,
3089		REG_Z0 + 12,
3090		REG_Z0 + 13,
3091		REG_Z0 + 14,
3092		REG_Z0 + 15,
3093		REG_Z0 + 16,
3094		REG_Z0 + 17,
3095		REG_Z0 + 18,
3096		REG_Z0 + 19,
3097		REG_Z0 + 20,
3098		REG_Z0 + 21,
3099		REG_Z0 + 22,
3100		REG_Z0 + 23,
3101		REG_Z0 + 24,
3102		REG_Z0 + 25,
3103		REG_Z0 + 26,
3104		REG_Z0 + 27,
3105		REG_Z0 + 28,
3106		REG_Z0 + 29,
3107		REG_Z0 + 30,
3108		REG_Z0 + 31:
3109		if ctxt.Arch.Family == sys.I386 {
3110			return Yxxx
3111		}
3112		return Yzr
3113
3114	case REG_K0:
3115		return Yk0
3116
3117	case REG_K0 + 1,
3118		REG_K0 + 2,
3119		REG_K0 + 3,
3120		REG_K0 + 4,
3121		REG_K0 + 5,
3122		REG_K0 + 6,
3123		REG_K0 + 7:
3124		return Yknot0
3125
3126	case REG_CS:
3127		return Ycs
3128	case REG_SS:
3129		return Yss
3130	case REG_DS:
3131		return Yds
3132	case REG_ES:
3133		return Yes
3134	case REG_FS:
3135		return Yfs
3136	case REG_GS:
3137		return Ygs
3138	case REG_TLS:
3139		return Ytls
3140
3141	case REG_GDTR:
3142		return Ygdtr
3143	case REG_IDTR:
3144		return Yidtr
3145	case REG_LDTR:
3146		return Yldtr
3147	case REG_MSW:
3148		return Ymsw
3149	case REG_TASK:
3150		return Ytask
3151
3152	case REG_CR + 0:
3153		return Ycr0
3154	case REG_CR + 1:
3155		return Ycr1
3156	case REG_CR + 2:
3157		return Ycr2
3158	case REG_CR + 3:
3159		return Ycr3
3160	case REG_CR + 4:
3161		return Ycr4
3162	case REG_CR + 5:
3163		return Ycr5
3164	case REG_CR + 6:
3165		return Ycr6
3166	case REG_CR + 7:
3167		return Ycr7
3168	case REG_CR + 8:
3169		return Ycr8
3170
3171	case REG_DR + 0:
3172		return Ydr0
3173	case REG_DR + 1:
3174		return Ydr1
3175	case REG_DR + 2:
3176		return Ydr2
3177	case REG_DR + 3:
3178		return Ydr3
3179	case REG_DR + 4:
3180		return Ydr4
3181	case REG_DR + 5:
3182		return Ydr5
3183	case REG_DR + 6:
3184		return Ydr6
3185	case REG_DR + 7:
3186		return Ydr7
3187
3188	case REG_TR + 0:
3189		return Ytr0
3190	case REG_TR + 1:
3191		return Ytr1
3192	case REG_TR + 2:
3193		return Ytr2
3194	case REG_TR + 3:
3195		return Ytr3
3196	case REG_TR + 4:
3197		return Ytr4
3198	case REG_TR + 5:
3199		return Ytr5
3200	case REG_TR + 6:
3201		return Ytr6
3202	case REG_TR + 7:
3203		return Ytr7
3204	}
3205
3206	return Yxxx
3207}
3208
// AsmBuf is a simple buffer to assemble variable-length x86 instructions into
// and hold assembly state.
//
// Bytes are stored in buf starting at index 0; off is both the index of the
// next byte to be written and the number of bytes written so far (see Len
// and Bytes). The backing array has a fixed size and the Put methods never
// grow it.
type AsmBuf struct {
	// buf holds the encoded bytes; only buf[:off] is meaningful.
	buf      [100]byte
	// off is the next write index into buf.
	off      int
	// rexflag accumulates REX-related bits while operands are encoded
	// (asmandsz ORs register and caller-supplied bits into it).
	rexflag  int
	vexflag  bool // Per inst: true for VEX-encoded
	evexflag bool // Per inst: true for EVEX-encoded
	// NOTE(review): rep, repn and lock presumably record pending
	// REP/REPN/LOCK prefixes; they are not read or written in this part
	// of the file, so confirm against their users.
	rep      bool
	repn     bool
	lock     bool

	evex evexBits // Initialized when evexflag is true
}
3223
3224// Put1 appends one byte to the end of the buffer.
3225func (ab *AsmBuf) Put1(x byte) {
3226	ab.buf[ab.off] = x
3227	ab.off++
3228}
3229
3230// Put2 appends two bytes to the end of the buffer.
3231func (ab *AsmBuf) Put2(x, y byte) {
3232	ab.buf[ab.off+0] = x
3233	ab.buf[ab.off+1] = y
3234	ab.off += 2
3235}
3236
3237// Put3 appends three bytes to the end of the buffer.
3238func (ab *AsmBuf) Put3(x, y, z byte) {
3239	ab.buf[ab.off+0] = x
3240	ab.buf[ab.off+1] = y
3241	ab.buf[ab.off+2] = z
3242	ab.off += 3
3243}
3244
3245// Put4 appends four bytes to the end of the buffer.
3246func (ab *AsmBuf) Put4(x, y, z, w byte) {
3247	ab.buf[ab.off+0] = x
3248	ab.buf[ab.off+1] = y
3249	ab.buf[ab.off+2] = z
3250	ab.buf[ab.off+3] = w
3251	ab.off += 4
3252}
3253
3254// PutInt16 writes v into the buffer using little-endian encoding.
3255func (ab *AsmBuf) PutInt16(v int16) {
3256	ab.buf[ab.off+0] = byte(v)
3257	ab.buf[ab.off+1] = byte(v >> 8)
3258	ab.off += 2
3259}
3260
3261// PutInt32 writes v into the buffer using little-endian encoding.
3262func (ab *AsmBuf) PutInt32(v int32) {
3263	ab.buf[ab.off+0] = byte(v)
3264	ab.buf[ab.off+1] = byte(v >> 8)
3265	ab.buf[ab.off+2] = byte(v >> 16)
3266	ab.buf[ab.off+3] = byte(v >> 24)
3267	ab.off += 4
3268}
3269
3270// PutInt64 writes v into the buffer using little-endian encoding.
3271func (ab *AsmBuf) PutInt64(v int64) {
3272	ab.buf[ab.off+0] = byte(v)
3273	ab.buf[ab.off+1] = byte(v >> 8)
3274	ab.buf[ab.off+2] = byte(v >> 16)
3275	ab.buf[ab.off+3] = byte(v >> 24)
3276	ab.buf[ab.off+4] = byte(v >> 32)
3277	ab.buf[ab.off+5] = byte(v >> 40)
3278	ab.buf[ab.off+6] = byte(v >> 48)
3279	ab.buf[ab.off+7] = byte(v >> 56)
3280	ab.off += 8
3281}
3282
3283// Put copies b into the buffer.
3284func (ab *AsmBuf) Put(b []byte) {
3285	copy(ab.buf[ab.off:], b)
3286	ab.off += len(b)
3287}
3288
3289// PutOpBytesLit writes zero terminated sequence of bytes from op,
3290// starting at specified offset (e.g. z counter value).
3291// Trailing 0 is not written.
3292//
3293// Intended to be used for literal Z cases.
3294// Literal Z cases usually have "Zlit" in their name (Zlit, Zlitr_m, Zlitm_r).
3295func (ab *AsmBuf) PutOpBytesLit(offset int, op *opBytes) {
3296	for int(op[offset]) != 0 {
3297		ab.Put1(byte(op[offset]))
3298		offset++
3299	}
3300}
3301
3302// Insert inserts b at offset i.
3303func (ab *AsmBuf) Insert(i int, b byte) {
3304	ab.off++
3305	copy(ab.buf[i+1:ab.off], ab.buf[i:ab.off-1])
3306	ab.buf[i] = b
3307}
3308
3309// Last returns the byte at the end of the buffer.
3310func (ab *AsmBuf) Last() byte { return ab.buf[ab.off-1] }
3311
3312// Len returns the length of the buffer.
3313func (ab *AsmBuf) Len() int { return ab.off }
3314
3315// Bytes returns the contents of the buffer.
3316func (ab *AsmBuf) Bytes() []byte { return ab.buf[:ab.off] }
3317
3318// Reset empties the buffer.
3319func (ab *AsmBuf) Reset() { ab.off = 0 }
3320
3321// At returns the byte at offset i.
3322func (ab *AsmBuf) At(i int) byte { return ab.buf[i] }
3323
// asmidx emits SIB byte.
//
// The byte is built in i as (scale code)<<6 | (index code)<<3 | (base code),
// where the index and base codes come from the reg table and the scale code
// is 0, 1, 2 or 3 for scale 1, 2, 4 or 8.
//
// When index is REG_NONE the index field is set to 4 and scale is not
// examined. When base is REG_NONE the base field is set to 5.
//
// An unsupported index, scale or base (including R8-R15 and the upper
// vector registers on 386) is reported through ctxt.Diag and a zero byte is
// emitted in place of the SIB byte.
func (ab *AsmBuf) asmidx(ctxt *obj.Link, scale int, index int, base int) {
	var i int

	// X/Y index register is used in VSIB.
	switch index {
	default:
		goto bad

	case REG_NONE:
		// No index register: index field 4, and skip scale validation.
		i = 4 << 3
		goto bas

	case REG_R8,
		REG_R9,
		REG_R10,
		REG_R11,
		REG_R12,
		REG_R13,
		REG_R14,
		REG_R15,
		REG_X8,
		REG_X9,
		REG_X10,
		REG_X11,
		REG_X12,
		REG_X13,
		REG_X14,
		REG_X15,
		REG_X16,
		REG_X17,
		REG_X18,
		REG_X19,
		REG_X20,
		REG_X21,
		REG_X22,
		REG_X23,
		REG_X24,
		REG_X25,
		REG_X26,
		REG_X27,
		REG_X28,
		REG_X29,
		REG_X30,
		REG_X31,
		REG_Y8,
		REG_Y9,
		REG_Y10,
		REG_Y11,
		REG_Y12,
		REG_Y13,
		REG_Y14,
		REG_Y15,
		REG_Y16,
		REG_Y17,
		REG_Y18,
		REG_Y19,
		REG_Y20,
		REG_Y21,
		REG_Y22,
		REG_Y23,
		REG_Y24,
		REG_Y25,
		REG_Y26,
		REG_Y27,
		REG_Y28,
		REG_Y29,
		REG_Y30,
		REG_Y31,
		REG_Z8,
		REG_Z9,
		REG_Z10,
		REG_Z11,
		REG_Z12,
		REG_Z13,
		REG_Z14,
		REG_Z15,
		REG_Z16,
		REG_Z17,
		REG_Z18,
		REG_Z19,
		REG_Z20,
		REG_Z21,
		REG_Z22,
		REG_Z23,
		REG_Z24,
		REG_Z25,
		REG_Z26,
		REG_Z27,
		REG_Z28,
		REG_Z29,
		REG_Z30,
		REG_Z31:
		// These index registers are rejected when assembling for 386.
		if ctxt.Arch.Family == sys.I386 {
			goto bad
		}
		fallthrough

	case REG_AX,
		REG_CX,
		REG_DX,
		REG_BX,
		REG_BP,
		REG_SI,
		REG_DI,
		REG_X0,
		REG_X1,
		REG_X2,
		REG_X3,
		REG_X4,
		REG_X5,
		REG_X6,
		REG_X7,
		REG_Y0,
		REG_Y1,
		REG_Y2,
		REG_Y3,
		REG_Y4,
		REG_Y5,
		REG_Y6,
		REG_Y7,
		REG_Z0,
		REG_Z1,
		REG_Z2,
		REG_Z3,
		REG_Z4,
		REG_Z5,
		REG_Z6,
		REG_Z7:
		// Note that REG_SP is not listed: it falls to default and is rejected.
		i = reg[index] << 3
	}

	// Scale is only validated when an index register is present.
	switch scale {
	default:
		goto bad

	case 1:
		break

	case 2:
		i |= 1 << 6

	case 4:
		i |= 2 << 6

	case 8:
		i |= 3 << 6
	}

bas:
	switch base {
	default:
		goto bad

	case REG_NONE: // must be mod=00
		i |= 5

	case REG_R8,
		REG_R9,
		REG_R10,
		REG_R11,
		REG_R12,
		REG_R13,
		REG_R14,
		REG_R15:
		// R8-R15 are rejected as a base when assembling for 386.
		if ctxt.Arch.Family == sys.I386 {
			goto bad
		}
		fallthrough

	case REG_AX,
		REG_CX,
		REG_DX,
		REG_BX,
		REG_SP,
		REG_BP,
		REG_SI,
		REG_DI:
		i |= reg[base]
	}

	ab.Put1(byte(i))
	return

bad:
	// Report the problem but still emit one byte in place of the SIB byte.
	ctxt.Diag("asmidx: bad address %d/%d/%d", scale, index, base)
	ab.Put1(0)
}
3512
3513func (ab *AsmBuf) relput4(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr) {
3514	var rel obj.Reloc
3515
3516	v := vaddr(ctxt, p, a, &rel)
3517	if rel.Siz != 0 {
3518		if rel.Siz != 4 {
3519			ctxt.Diag("bad reloc")
3520		}
3521		r := obj.Addrel(cursym)
3522		*r = rel
3523		r.Off = int32(p.Pc + int64(ab.Len()))
3524	}
3525
3526	ab.PutInt32(int32(v))
3527}
3528
3529func vaddr(ctxt *obj.Link, p *obj.Prog, a *obj.Addr, r *obj.Reloc) int64 {
3530	if r != nil {
3531		*r = obj.Reloc{}
3532	}
3533
3534	switch a.Name {
3535	case obj.NAME_STATIC,
3536		obj.NAME_GOTREF,
3537		obj.NAME_EXTERN:
3538		s := a.Sym
3539		if r == nil {
3540			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
3541			log.Fatalf("reloc")
3542		}
3543
3544		if a.Name == obj.NAME_GOTREF {
3545			r.Siz = 4
3546			r.Type = objabi.R_GOTPCREL
3547		} else if useAbs(ctxt, s) {
3548			r.Siz = 4
3549			r.Type = objabi.R_ADDR
3550		} else {
3551			r.Siz = 4
3552			r.Type = objabi.R_PCREL
3553		}
3554
3555		r.Off = -1 // caller must fill in
3556		r.Sym = s
3557		r.Add = a.Offset
3558
3559		return 0
3560	}
3561
3562	if (a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Reg == REG_TLS {
3563		if r == nil {
3564			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
3565			log.Fatalf("reloc")
3566		}
3567
3568		if !ctxt.Flag_shared || isAndroid || ctxt.Headtype == objabi.Hdarwin {
3569			r.Type = objabi.R_TLS_LE
3570			r.Siz = 4
3571			r.Off = -1 // caller must fill in
3572			r.Add = a.Offset
3573		}
3574		return 0
3575	}
3576
3577	return a.Offset
3578}
3579
// asmandsz emits the ModR/M byte for operand a, followed where needed by a
// SIB byte (via asmidx) and an 8-bit or 32-bit displacement. When the
// displacement refers to a symbol or to TLS, a relocation is added to
// cursym at the position of the 32-bit displacement.
//
// r is placed in bits 5-3 of the ModR/M byte. rex supplies extra REX bits;
// only 0x40 and Rxr are kept, and they are ORed into ab.rexflag together
// with the bits needed by the operand's own base and index registers.
// m64 is not referenced by this function.
//
// Operands that cannot be encoded are reported through ctxt.Diag; in most
// of those cases nothing is written to the buffer.
func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, r int, rex int, m64 int) {
	var base int
	var rel obj.Reloc

	rex &= 0x40 | Rxr
	if a.Offset != int64(int32(a.Offset)) {
		// The rules are slightly different for 386 and AMD64,
		// mostly for historical reasons. We may unify them later,
		// but it must be discussed beforehand.
		//
		// For 64bit mode only LEAL is allowed to overflow.
		// It's how https://golang.org/cl/59630 made it.
		// crypto/sha1/sha1block_amd64.s depends on this feature.
		//
		// For 32bit mode rules are more permissive.
		// If offset fits uint32, it's permitted.
		// This is allowed for assembly that wants to use 32-bit hex
		// constants, e.g. LEAL 0x99999999(AX), AX.
		overflowOK := (ctxt.Arch.Family == sys.AMD64 && p.As == ALEAL) ||
			(ctxt.Arch.Family != sys.AMD64 &&
				int64(uint32(a.Offset)) == a.Offset &&
				ab.rexflag&Rxw == 0)
		if !overflowOK {
			ctxt.Diag("offset too large in %s", p)
		}
	}
	// v is the displacement candidate, truncated to 32 bits.
	v := int32(a.Offset)
	rel.Siz = 0

	switch a.Type {
	case obj.TYPE_ADDR:
		// Address operands are never encodable here; the checks only
		// add more specific diagnostics before the generic one.
		if a.Name == obj.NAME_NONE {
			ctxt.Diag("unexpected TYPE_ADDR with NAME_NONE")
		}
		if a.Index == REG_TLS {
			ctxt.Diag("unexpected TYPE_ADDR with index==REG_TLS")
		}
		goto bad

	case obj.TYPE_REG:
		const regFirst = REG_AL
		const regLast = REG_Z31
		if a.Reg < regFirst || regLast < a.Reg {
			goto bad
		}
		if v != 0 {
			goto bad
		}
		// mod=3: register-direct operand, no SIB or displacement.
		ab.Put1(byte(3<<6 | reg[a.Reg]<<0 | r<<3))
		ab.rexflag |= regrex[a.Reg]&(0x40|Rxb) | rex
		return
	}

	if a.Type != obj.TYPE_MEM {
		goto bad
	}

	// Memory operand with a real index register (not TLS, not a segment
	// register): always encoded with rm=4 plus a SIB byte.
	if a.Index != REG_NONE && a.Index != REG_TLS && !(REG_CS <= a.Index && a.Index <= REG_GS) {
		// NOTE: this base shadows the function-level base declared above.
		base := int(a.Reg)
		switch a.Name {
		case obj.NAME_EXTERN,
			obj.NAME_GOTREF,
			obj.NAME_STATIC:
			if !useAbs(ctxt, a.Sym) && ctxt.Arch.Family == sys.AMD64 {
				goto bad
			}
			if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
				// The base register has already been set. It holds the PC
				// of this instruction returned by a PC-reading thunk.
				// See obj6.go:rewriteToPcrel.
			} else {
				base = REG_NONE
			}
			v = int32(vaddr(ctxt, p, a, &rel))

		case obj.NAME_AUTO,
			obj.NAME_PARAM:
			base = REG_SP
		}

		ab.rexflag |= regrex[int(a.Index)]&Rxx | regrex[base]&Rxb | rex
		if base == REG_NONE {
			// No base register: mod=0 with a 32-bit displacement.
			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			goto putrelv
		}

		// mod=0: no displacement. BP and R13 are excluded and always get
		// a displacement (presumably because their base encoding with
		// mod=0 is the "no base" form asmidx emits for REG_NONE).
		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			return
		}

		// mod=1: 8-bit displacement, only when no relocation is pending.
		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
			ab.Put1(byte(1<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			ab.Put1(disp8)
			return
		}

		// mod=2: 32-bit displacement.
		ab.Put1(byte(2<<6 | 4<<0 | r<<3))
		ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
		goto putrelv
	}

	// Memory operand without a general index register.
	base = int(a.Reg)
	switch a.Name {
	case obj.NAME_STATIC,
		obj.NAME_GOTREF,
		obj.NAME_EXTERN:
		if a.Sym == nil {
			ctxt.Diag("bad addr: %v", p)
		}
		if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
			// The base register has already been set. It holds the PC
			// of this instruction returned by a PC-reading thunk.
			// See obj6.go:rewriteToPcrel.
		} else {
			base = REG_NONE
		}
		v = int32(vaddr(ctxt, p, a, &rel))

	case obj.NAME_AUTO,
		obj.NAME_PARAM:
		base = REG_SP
	}

	if base == REG_TLS {
		v = int32(vaddr(ctxt, p, a, &rel))
	}

	ab.rexflag |= regrex[base]&Rxb | rex
	// No base, a segment register base, or a TLS base: the address is a
	// bare 32-bit displacement.
	if base == REG_NONE || (REG_CS <= base && base <= REG_GS) || base == REG_TLS {
		if (a.Sym == nil || !useAbs(ctxt, a.Sym)) && base == REG_NONE && (a.Name == obj.NAME_STATIC || a.Name == obj.NAME_EXTERN || a.Name == obj.NAME_GOTREF) || ctxt.Arch.Family != sys.AMD64 {
			if a.Name == obj.NAME_GOTREF && (a.Offset != 0 || a.Index != 0 || a.Scale != 0) {
				ctxt.Diag("%v has offset against gotref", p)
			}
			// mod=0, rm=5: 32-bit displacement with no SIB byte.
			ab.Put1(byte(0<<6 | 5<<0 | r<<3))
			goto putrelv
		}

		// temporary
		ab.Put2(
			byte(0<<6|4<<0|r<<3), // sib present
			0<<6|4<<3|5<<0,       // DS:d32
		)
		goto putrelv
	}

	// SP and R12 as base are always encoded with a SIB byte that has no
	// index register.
	if base == REG_SP || base == REG_R12 {
		if v == 0 {
			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
			return
		}

		if disp8, ok := toDisp8(v, p, ab); ok {
			ab.Put1(byte(1<<6 | reg[base]<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
			ab.Put1(disp8)
			return
		}

		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
		ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
		goto putrelv
	}

	// Any other general-purpose base register: ModR/M alone, no SIB.
	if REG_AX <= base && base <= REG_R15 {
		if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid &&
			ctxt.Headtype != objabi.Hwindows {
			// Move the offset into a TLS relocation's addend and emit a
			// zero displacement for it to be applied to.
			rel = obj.Reloc{}
			rel.Type = objabi.R_TLS_LE
			rel.Siz = 4
			rel.Sym = nil
			rel.Add = int64(v)
			v = 0
		}

		// mod=0: no displacement (not available for BP and R13).
		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
			return
		}

		// mod=1: 8-bit displacement, only when no relocation is pending.
		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
			ab.Put2(byte(1<<6|reg[base]<<0|r<<3), disp8)
			return
		}

		// mod=2: 32-bit displacement.
		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
		goto putrelv
	}

	goto bad

putrelv:
	// Emit the 32-bit displacement, first recording any pending relocation
	// at the position where the displacement is about to be written.
	if rel.Siz != 0 {
		if rel.Siz != 4 {
			ctxt.Diag("bad rel")
			goto bad
		}

		r := obj.Addrel(cursym)
		*r = rel
		r.Off = int32(p.Pc + int64(ab.Len()))
	}

	ab.PutInt32(v)
	return

bad:
	ctxt.Diag("asmand: bad address %v", obj.Dconv(p, a))
}
3793
3794func (ab *AsmBuf) asmand(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, ra *obj.Addr) {
3795	ab.asmandsz(ctxt, cursym, p, a, reg[ra.Reg], regrex[ra.Reg], 0)
3796}
3797
3798func (ab *AsmBuf) asmando(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, o int) {
3799	ab.asmandsz(ctxt, cursym, p, a, o, 0, 0)
3800}
3801
3802func bytereg(a *obj.Addr, t *uint8) {
3803	if a.Type == obj.TYPE_REG && a.Index == REG_NONE && (REG_AX <= a.Reg && a.Reg <= REG_R15) {
3804		a.Reg += REG_AL - REG_AX
3805		*t = 0
3806	}
3807}
3808
3809func unbytereg(a *obj.Addr, t *uint8) {
3810	if a.Type == obj.TYPE_REG && a.Index == REG_NONE && (REG_AL <= a.Reg && a.Reg <= REG_R15B) {
3811		a.Reg += REG_AX - REG_AL
3812		*t = 0
3813	}
3814}
3815
// Encoding kinds used in the fifth column of ymovtab entries.
const (
	// movLit: like Zlit.
	movLit = uint8(iota)
	// movRegMem and movMemReg are used by the ymovtab rows that carry a
	// single opcode byte followed by a register number.
	movRegMem
	movMemReg
	// movRegMem2op and movMemReg2op are used by the ymovtab rows that
	// carry two opcode bytes followed by a register number.
	movRegMem2op
	movMemReg2op
	// movFullPtr: load full pointer, trash heap (unsupported).
	movFullPtr
	// movDoubleShift is used by the double-shift rows of ymovtab.
	movDoubleShift
	// movTLSReg is used by the "load TLS base" rows of ymovtab.
	movTLSReg
)
3826
// ymovtab lists special-case instruction forms — segment register push/pop,
// moves to and from segment, control, debug and test registers, descriptor
// table and task register loads/stores, double shifts, and TLS base loads —
// together with how each one is encoded.
//
// Each entry gives the instruction, three operand classes (Ynone where an
// operand is absent), one of the mov* encoding kinds, and up to four bytes
// of opcode data whose interpretation depends on that kind.
//
// NOTE(review): the field names and exact semantics belong to the movtab
// type and its consumer, which are defined elsewhere — confirm there. The
// final all-zero entry presumably terminates the table; verify against the
// lookup loop.
var ymovtab = []movtab{
	// push
	{APUSHL, Ycs, Ynone, Ynone, movLit, [4]uint8{0x0e, 0}},
	{APUSHL, Yss, Ynone, Ynone, movLit, [4]uint8{0x16, 0}},
	{APUSHL, Yds, Ynone, Ynone, movLit, [4]uint8{0x1e, 0}},
	{APUSHL, Yes, Ynone, Ynone, movLit, [4]uint8{0x06, 0}},
	{APUSHL, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
	{APUSHL, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
	{APUSHQ, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
	{APUSHQ, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
	{APUSHW, Ycs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0e, 0}},
	{APUSHW, Yss, Ynone, Ynone, movLit, [4]uint8{Pe, 0x16, 0}},
	{APUSHW, Yds, Ynone, Ynone, movLit, [4]uint8{Pe, 0x1e, 0}},
	{APUSHW, Yes, Ynone, Ynone, movLit, [4]uint8{Pe, 0x06, 0}},
	{APUSHW, Yfs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa0, 0}},
	{APUSHW, Ygs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa8, 0}},

	// pop
	{APOPL, Ynone, Ynone, Yds, movLit, [4]uint8{0x1f, 0}},
	{APOPL, Ynone, Ynone, Yes, movLit, [4]uint8{0x07, 0}},
	{APOPL, Ynone, Ynone, Yss, movLit, [4]uint8{0x17, 0}},
	{APOPL, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
	{APOPL, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
	{APOPQ, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
	{APOPQ, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
	{APOPW, Ynone, Ynone, Yds, movLit, [4]uint8{Pe, 0x1f, 0}},
	{APOPW, Ynone, Ynone, Yes, movLit, [4]uint8{Pe, 0x07, 0}},
	{APOPW, Ynone, Ynone, Yss, movLit, [4]uint8{Pe, 0x17, 0}},
	{APOPW, Ynone, Ynone, Yfs, movLit, [4]uint8{Pe, 0x0f, 0xa1, 0}},
	{APOPW, Ynone, Ynone, Ygs, movLit, [4]uint8{Pe, 0x0f, 0xa9, 0}},

	// mov seg
	{AMOVW, Yes, Ynone, Yml, movRegMem, [4]uint8{0x8c, 0, 0, 0}},
	{AMOVW, Ycs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 1, 0, 0}},
	{AMOVW, Yss, Ynone, Yml, movRegMem, [4]uint8{0x8c, 2, 0, 0}},
	{AMOVW, Yds, Ynone, Yml, movRegMem, [4]uint8{0x8c, 3, 0, 0}},
	{AMOVW, Yfs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 4, 0, 0}},
	{AMOVW, Ygs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 5, 0, 0}},
	{AMOVW, Yml, Ynone, Yes, movMemReg, [4]uint8{0x8e, 0, 0, 0}},
	{AMOVW, Yml, Ynone, Ycs, movMemReg, [4]uint8{0x8e, 1, 0, 0}},
	{AMOVW, Yml, Ynone, Yss, movMemReg, [4]uint8{0x8e, 2, 0, 0}},
	{AMOVW, Yml, Ynone, Yds, movMemReg, [4]uint8{0x8e, 3, 0, 0}},
	{AMOVW, Yml, Ynone, Yfs, movMemReg, [4]uint8{0x8e, 4, 0, 0}},
	{AMOVW, Yml, Ynone, Ygs, movMemReg, [4]uint8{0x8e, 5, 0, 0}},

	// mov cr
	{AMOVL, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
	{AMOVL, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
	{AMOVL, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
	{AMOVL, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
	{AMOVL, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
	{AMOVQ, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
	{AMOVQ, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
	{AMOVQ, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
	{AMOVQ, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
	{AMOVQ, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
	{AMOVL, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
	{AMOVL, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
	{AMOVL, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
	{AMOVL, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
	{AMOVL, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},
	{AMOVQ, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
	{AMOVQ, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
	{AMOVQ, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
	{AMOVQ, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
	{AMOVQ, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},

	// mov dr
	{AMOVL, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
	{AMOVL, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
	{AMOVL, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
	{AMOVQ, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
	{AMOVQ, Ydr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 2, 0}},
	{AMOVQ, Ydr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 3, 0}},
	{AMOVQ, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
	{AMOVQ, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
	{AMOVL, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
	{AMOVL, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
	{AMOVL, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},
	{AMOVQ, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
	{AMOVQ, Yrl, Ynone, Ydr2, movMemReg2op, [4]uint8{0x0f, 0x23, 2, 0}},
	{AMOVQ, Yrl, Ynone, Ydr3, movMemReg2op, [4]uint8{0x0f, 0x23, 3, 0}},
	{AMOVQ, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
	{AMOVQ, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},

	// mov tr
	{AMOVL, Ytr6, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 6, 0}},
	{AMOVL, Ytr7, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 7, 0}},
	{AMOVL, Yml, Ynone, Ytr6, movMemReg2op, [4]uint8{0x0f, 0x26, 6, 0xff}},
	{AMOVL, Yml, Ynone, Ytr7, movMemReg2op, [4]uint8{0x0f, 0x26, 7, 0xff}},

	// lgdt, sgdt, lidt, sidt
	{AMOVL, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
	{AMOVL, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
	{AMOVL, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
	{AMOVL, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},
	{AMOVQ, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
	{AMOVQ, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
	{AMOVQ, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
	{AMOVQ, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},

	// lldt, sldt
	{AMOVW, Yml, Ynone, Yldtr, movMemReg2op, [4]uint8{0x0f, 0x00, 2, 0}},
	{AMOVW, Yldtr, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 0, 0}},

	// lmsw, smsw
	{AMOVW, Yml, Ynone, Ymsw, movMemReg2op, [4]uint8{0x0f, 0x01, 6, 0}},
	{AMOVW, Ymsw, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x01, 4, 0}},

	// ltr, str
	{AMOVW, Yml, Ynone, Ytask, movMemReg2op, [4]uint8{0x0f, 0x00, 3, 0}},
	{AMOVW, Ytask, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 1, 0}},

	/* load full pointer - unsupported
	{AMOVL, Yml, Ycol, movFullPtr, [4]uint8{0, 0, 0, 0}},
	{AMOVW, Yml, Ycol, movFullPtr, [4]uint8{Pe, 0, 0, 0}},
	*/

	// double shift
	{ASHLL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
	{ASHLL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
	{ASHLL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
	{ASHRL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
	{ASHRL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
	{ASHRL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
	{ASHLQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
	{ASHLQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
	{ASHLQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
	{ASHRQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
	{ASHRQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
	{ASHRQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
	{ASHLW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
	{ASHLW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
	{ASHLW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
	{ASHRW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
	{ASHRW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
	{ASHRW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},

	// load TLS base
	{AMOVL, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
	{AMOVQ, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
	{0, 0, 0, 0, 0, [4]uint8{}},
}
3970
3971func isax(a *obj.Addr) bool {
3972	switch a.Reg {
3973	case REG_AX, REG_AL, REG_AH:
3974		return true
3975	}
3976
3977	return a.Index == REG_AX
3978}
3979
3980func subreg(p *obj.Prog, from int, to int) {
3981	if false { /* debug['Q'] */
3982		fmt.Printf("\n%v\ts/%v/%v/\n", p, rconv(from), rconv(to))
3983	}
3984
3985	if int(p.From.Reg) == from {
3986		p.From.Reg = int16(to)
3987		p.Ft = 0
3988	}
3989
3990	if int(p.To.Reg) == from {
3991		p.To.Reg = int16(to)
3992		p.Tt = 0
3993	}
3994
3995	if int(p.From.Index) == from {
3996		p.From.Index = int16(to)
3997		p.Ft = 0
3998	}
3999
4000	if int(p.To.Index) == from {
4001		p.To.Index = int16(to)
4002		p.Tt = 0
4003	}
4004
4005	if false { /* debug['Q'] */
4006		fmt.Printf("%v\n", p)
4007	}
4008}
4009
4010func (ab *AsmBuf) mediaop(ctxt *obj.Link, o *Optab, op int, osize int, z int) int {
4011	switch op {
4012	case Pm, Pe, Pf2, Pf3:
4013		if osize != 1 {
4014			if op != Pm {
4015				ab.Put1(byte(op))
4016			}
4017			ab.Put1(Pm)
4018			z++
4019			op = int(o.op[z])
4020			break
4021		}
4022		fallthrough
4023
4024	default:
4025		if ab.Len() == 0 || ab.Last() != Pm {
4026			ab.Put1(Pm)
4027		}
4028	}
4029
4030	ab.Put1(byte(op))
4031	return z
4032}
4033
// Pre-encoded AMD64 instruction sequences; the comment next to each row
// gives the instruction those bytes encode.
var (
	bpduff1 = []byte{
		0x48, 0x89, 0x6c, 0x24, 0xf0, // MOVQ BP, -16(SP)
		0x48, 0x8d, 0x6c, 0x24, 0xf0, // LEAQ -16(SP), BP
	}

	bpduff2 = []byte{
		0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
	}
)
4042
// asmevex emits EVEX prefix and opcode byte.
// In addition to the r/m, vvvv and reg fields that asmvex takes, it also
// accepts an optional K-masking register.
//
// Expects asmbuf.evex to be properly initialized.
4048func (ab *AsmBuf) asmevex(ctxt *obj.Link, p *obj.Prog, rm, v, r, k *obj.Addr) {
4049	ab.evexflag = true
4050	evex := ab.evex
4051
4052	rexR := byte(1)
4053	evexR := byte(1)
4054	rexX := byte(1)
4055	rexB := byte(1)
4056	if r != nil {
4057		if regrex[r.Reg]&Rxr != 0 {
4058			rexR = 0 // "ModR/M.reg" selector 4th bit.
4059		}
4060		if regrex[r.Reg]&RxrEvex != 0 {
4061			evexR = 0 // "ModR/M.reg" selector 5th bit.
4062		}
4063	}
4064	if rm != nil {
4065		if rm.Index == REG_NONE && regrex[rm.Reg]&RxrEvex != 0 {
4066			rexX = 0
4067		} else if regrex[rm.Index]&Rxx != 0 {
4068			rexX = 0
4069		}
4070		if regrex[rm.Reg]&Rxb != 0 {
4071			rexB = 0
4072		}
4073	}
4074	// P0 = [R][X][B][R'][00][mm]
4075	p0 := (rexR << 7) |
4076		(rexX << 6) |
4077		(rexB << 5) |
4078		(evexR << 4) |
4079		(0 << 2) |
4080		(evex.M() << 0)
4081
4082	vexV := byte(0)
4083	if v != nil {
4084		// 4bit-wide reg index.
4085		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
4086	}
4087	vexV ^= 0x0F
4088	// P1 = [W][vvvv][1][pp]
4089	p1 := (evex.W() << 7) |
4090		(vexV << 3) |
4091		(1 << 2) |
4092		(evex.P() << 0)
4093
4094	suffix := evexSuffixMap[p.Scond]
4095	evexZ := byte(0)
4096	evexLL := evex.L()
4097	evexB := byte(0)
4098	evexV := byte(1)
4099	evexA := byte(0)
4100	if suffix.zeroing {
4101		if !evex.ZeroingEnabled() {
4102			ctxt.Diag("unsupported zeroing: %v", p)
4103		}
4104		if k == nil {
4105			// When you request zeroing you must specify a mask register.
4106			// See issue 57952.
4107			ctxt.Diag("mask register must be specified for .Z instructions: %v", p)
4108		} else if k.Reg == REG_K0 {
4109			// The mask register must not be K0. That restriction is already
4110			// handled by the Yknot0 restriction in the opcode tables, so we
4111			// won't ever reach here. But put something sensible here just in case.
4112			ctxt.Diag("mask register must not be K0 for .Z instructions: %v", p)
4113		}
4114		evexZ = 1
4115	}
4116	switch {
4117	case suffix.rounding != rcUnset:
4118		if rm != nil && rm.Type == obj.TYPE_MEM {
4119			ctxt.Diag("illegal rounding with memory argument: %v", p)
4120		} else if !evex.RoundingEnabled() {
4121			ctxt.Diag("unsupported rounding: %v", p)
4122		}
4123		evexB = 1
4124		evexLL = suffix.rounding
4125	case suffix.broadcast:
4126		if rm == nil || rm.Type != obj.TYPE_MEM {
4127			ctxt.Diag("illegal broadcast without memory argument: %v", p)
4128		} else if !evex.BroadcastEnabled() {
4129			ctxt.Diag("unsupported broadcast: %v", p)
4130		}
4131		evexB = 1
4132	case suffix.sae:
4133		if rm != nil && rm.Type == obj.TYPE_MEM {
4134			ctxt.Diag("illegal SAE with memory argument: %v", p)
4135		} else if !evex.SaeEnabled() {
4136			ctxt.Diag("unsupported SAE: %v", p)
4137		}
4138		evexB = 1
4139	}
4140	if rm != nil && regrex[rm.Index]&RxrEvex != 0 {
4141		evexV = 0
4142	} else if v != nil && regrex[v.Reg]&RxrEvex != 0 {
4143		evexV = 0 // VSR selector 5th bit.
4144	}
4145	if k != nil {
4146		evexA = byte(reg[k.Reg])
4147	}
4148	// P2 = [z][L'L][b][V'][aaa]
4149	p2 := (evexZ << 7) |
4150		(evexLL << 5) |
4151		(evexB << 4) |
4152		(evexV << 3) |
4153		(evexA << 0)
4154
4155	const evexEscapeByte = 0x62
4156	ab.Put4(evexEscapeByte, p0, p1, p2)
4157	ab.Put1(evex.opcode)
4158}
4159
4160// Emit VEX prefix and opcode byte.
4161// The three addresses are the r/m, vvvv, and reg fields.
4162// The reg and rm arguments appear in the same order as the
4163// arguments to asmand, which typically follows the call to asmvex.
4164// The final two arguments are the VEX prefix (see encoding above)
4165// and the opcode byte.
4166// For details about vex prefix see:
4167// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
4168func (ab *AsmBuf) asmvex(ctxt *obj.Link, rm, v, r *obj.Addr, vex, opcode uint8) {
4169	ab.vexflag = true
4170	rexR := 0
4171	if r != nil {
4172		rexR = regrex[r.Reg] & Rxr
4173	}
4174	rexB := 0
4175	rexX := 0
4176	if rm != nil {
4177		rexB = regrex[rm.Reg] & Rxb
4178		rexX = regrex[rm.Index] & Rxx
4179	}
4180	vexM := (vex >> 3) & 0x7
4181	vexWLP := vex & 0x87
4182	vexV := byte(0)
4183	if v != nil {
4184		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
4185	}
4186	vexV ^= 0xF
4187	if vexM == 1 && (rexX|rexB) == 0 && vex&vexW1 == 0 {
4188		// Can use 2-byte encoding.
4189		ab.Put2(0xc5, byte(rexR<<5)^0x80|vexV<<3|vexWLP)
4190	} else {
4191		// Must use 3-byte encoding.
4192		ab.Put3(0xc4,
4193			(byte(rexR|rexX|rexB)<<5)^0xE0|vexM,
4194			vexV<<3|vexWLP,
4195		)
4196	}
4197	ab.Put1(opcode)
4198}
4199
4200// regIndex returns register index that fits in 5 bits.
4201//
4202//	R         : 3 bit | legacy instructions     | N/A
4203//	[R/V]EX.R : 1 bit | REX / VEX extension bit | Rxr
4204//	EVEX.R    : 1 bit | EVEX extension bit      | RxrEvex
4205//
4206// Examples:
4207//
4208//	REG_Z30 => 30
4209//	REG_X15 => 15
4210//	REG_R9  => 9
4211//	REG_AX  => 0
4212func regIndex(r int16) int {
4213	lower3bits := reg[r]
4214	high4bit := regrex[r] & Rxr << 1
4215	high5bit := regrex[r] & RxrEvex << 0
4216	return lower3bits | high4bit | high5bit
4217}
4218
4219// avx2gatherValid reports whether p satisfies AVX2 gather constraints.
4220// Reports errors via ctxt.
4221func avx2gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
4222	// If any pair of the index, mask, or destination registers
4223	// are the same, illegal instruction trap (#UD) is triggered.
4224	index := regIndex(p.GetFrom3().Index)
4225	mask := regIndex(p.From.Reg)
4226	dest := regIndex(p.To.Reg)
4227	if dest == mask || dest == index || mask == index {
4228		ctxt.Diag("mask, index, and destination registers should be distinct: %v", p)
4229		return false
4230	}
4231
4232	return true
4233}
4234
4235// avx512gatherValid reports whether p satisfies AVX512 gather constraints.
4236// Reports errors via ctxt.
4237func avx512gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
4238	// Illegal instruction trap (#UD) is triggered if the destination vector
4239	// register is the same as index vector in VSIB.
4240	index := regIndex(p.From.Index)
4241	dest := regIndex(p.To.Reg)
4242	if dest == index {
4243		ctxt.Diag("index and destination registers should be distinct: %v", p)
4244		return false
4245	}
4246
4247	return true
4248}
4249
4250func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
4251	o := opindex[p.As&obj.AMask]
4252
4253	if o == nil {
4254		ctxt.Diag("asmins: missing op %v", p)
4255		return
4256	}
4257
4258	if pre := prefixof(ctxt, &p.From); pre != 0 {
4259		ab.Put1(byte(pre))
4260	}
4261	if pre := prefixof(ctxt, &p.To); pre != 0 {
4262		ab.Put1(byte(pre))
4263	}
4264
4265	// Checks to warn about instruction/arguments combinations that
4266	// will unconditionally trigger illegal instruction trap (#UD).
4267	switch p.As {
4268	case AVGATHERDPD,
4269		AVGATHERQPD,
4270		AVGATHERDPS,
4271		AVGATHERQPS,
4272		AVPGATHERDD,
4273		AVPGATHERQD,
4274		AVPGATHERDQ,
4275		AVPGATHERQQ:
4276		if p.GetFrom3() == nil {
4277			// gathers need a 3rd arg. See issue 58822.
4278			ctxt.Diag("need a third arg for gather instruction: %v", p)
4279			return
4280		}
4281		// AVX512 gather requires explicit K mask.
4282		if p.GetFrom3().Reg >= REG_K0 && p.GetFrom3().Reg <= REG_K7 {
4283			if !avx512gatherValid(ctxt, p) {
4284				return
4285			}
4286		} else {
4287			if !avx2gatherValid(ctxt, p) {
4288				return
4289			}
4290		}
4291	}
4292
4293	if p.Ft == 0 {
4294		p.Ft = uint8(oclass(ctxt, p, &p.From))
4295	}
4296	if p.Tt == 0 {
4297		p.Tt = uint8(oclass(ctxt, p, &p.To))
4298	}
4299
4300	ft := int(p.Ft) * Ymax
4301	var f3t int
4302	tt := int(p.Tt) * Ymax
4303
4304	xo := obj.Bool2int(o.op[0] == 0x0f)
4305	z := 0
4306	var a *obj.Addr
4307	var l int
4308	var op int
4309	var q *obj.Prog
4310	var r *obj.Reloc
4311	var rel obj.Reloc
4312	var v int64
4313
4314	args := make([]int, 0, argListMax)
4315	if ft != Ynone*Ymax {
4316		args = append(args, ft)
4317	}
4318	for i := range p.RestArgs {
4319		args = append(args, oclass(ctxt, p, &p.RestArgs[i].Addr)*Ymax)
4320	}
4321	if tt != Ynone*Ymax {
4322		args = append(args, tt)
4323	}
4324
4325	for _, yt := range o.ytab {
4326		// ytab matching is purely args-based,
4327		// but AVX512 suffixes like "Z" or "RU_SAE" will
4328		// add EVEX-only filter that will reject non-EVEX matches.
4329		//
4330		// Consider "VADDPD.BCST 2032(DX), X0, X0".
4331		// Without this rule, operands will lead to VEX-encoded form
4332		// and produce "c5b15813" encoding.
4333		if !yt.match(args) {
4334			// "xo" is always zero for VEX/EVEX encoded insts.
4335			z += int(yt.zoffset) + xo
4336		} else {
4337			if p.Scond != 0 && !evexZcase(yt.zcase) {
4338				// Do not signal error and continue to search
4339				// for matching EVEX-encoded form.
4340				z += int(yt.zoffset)
4341				continue
4342			}
4343
4344			switch o.prefix {
4345			case Px1: // first option valid only in 32-bit mode
4346				if ctxt.Arch.Family == sys.AMD64 && z == 0 {
4347					z += int(yt.zoffset) + xo
4348					continue
4349				}
4350			case Pq: // 16 bit escape and opcode escape
4351				ab.Put2(Pe, Pm)
4352
4353			case Pq3: // 16 bit escape and opcode escape + REX.W
4354				ab.rexflag |= Pw
4355				ab.Put2(Pe, Pm)
4356
4357			case Pq4: // 66 0F 38
4358				ab.Put3(0x66, 0x0F, 0x38)
4359
4360			case Pq4w: // 66 0F 38 + REX.W
4361				ab.rexflag |= Pw
4362				ab.Put3(0x66, 0x0F, 0x38)
4363
4364			case Pq5: // F3 0F 38
4365				ab.Put3(0xF3, 0x0F, 0x38)
4366
4367			case Pq5w: //  F3 0F 38 + REX.W
4368				ab.rexflag |= Pw
4369				ab.Put3(0xF3, 0x0F, 0x38)
4370
4371			case Pf2, // xmm opcode escape
4372				Pf3:
4373				ab.Put2(o.prefix, Pm)
4374
4375			case Pef3:
4376				ab.Put3(Pe, Pf3, Pm)
4377
4378			case Pfw: // xmm opcode escape + REX.W
4379				ab.rexflag |= Pw
4380				ab.Put2(Pf3, Pm)
4381
4382			case Pm: // opcode escape
4383				ab.Put1(Pm)
4384
4385			case Pe: // 16 bit escape
4386				ab.Put1(Pe)
4387
4388			case Pw: // 64-bit escape
4389				if ctxt.Arch.Family != sys.AMD64 {
4390					ctxt.Diag("asmins: illegal 64: %v", p)
4391				}
4392				ab.rexflag |= Pw
4393
4394			case Pw8: // 64-bit escape if z >= 8
4395				if z >= 8 {
4396					if ctxt.Arch.Family != sys.AMD64 {
4397						ctxt.Diag("asmins: illegal 64: %v", p)
4398					}
4399					ab.rexflag |= Pw
4400				}
4401
4402			case Pb: // botch
4403				if ctxt.Arch.Family != sys.AMD64 && (isbadbyte(&p.From) || isbadbyte(&p.To)) {
4404					goto bad
4405				}
4406				// NOTE(rsc): This is probably safe to do always,
4407				// but when enabled it chooses different encodings
4408				// than the old cmd/internal/obj/i386 code did,
4409				// which breaks our "same bits out" checks.
4410				// In particular, CMPB AX, $0 encodes as 80 f8 00
4411				// in the original obj/i386, and it would encode
4412				// (using a valid, shorter form) as 3c 00 if we enabled
4413				// the call to bytereg here.
4414				if ctxt.Arch.Family == sys.AMD64 {
4415					bytereg(&p.From, &p.Ft)
4416					bytereg(&p.To, &p.Tt)
4417				}
4418
4419			case P32: // 32 bit but illegal if 64-bit mode
4420				if ctxt.Arch.Family == sys.AMD64 {
4421					ctxt.Diag("asmins: illegal in 64-bit mode: %v", p)
4422				}
4423
4424			case Py: // 64-bit only, no prefix
4425				if ctxt.Arch.Family != sys.AMD64 {
4426					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4427				}
4428
4429			case Py1: // 64-bit only if z < 1, no prefix
4430				if z < 1 && ctxt.Arch.Family != sys.AMD64 {
4431					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4432				}
4433
4434			case Py3: // 64-bit only if z < 3, no prefix
4435				if z < 3 && ctxt.Arch.Family != sys.AMD64 {
4436					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4437				}
4438			}
4439
4440			if z >= len(o.op) {
4441				log.Fatalf("asmins bad table %v", p)
4442			}
4443			op = int(o.op[z])
4444			if op == 0x0f {
4445				ab.Put1(byte(op))
4446				z++
4447				op = int(o.op[z])
4448			}
4449
4450			switch yt.zcase {
4451			default:
4452				ctxt.Diag("asmins: unknown z %d %v", yt.zcase, p)
4453				return
4454
4455			case Zpseudo:
4456				break
4457
4458			case Zlit:
4459				ab.PutOpBytesLit(z, &o.op)
4460
4461			case Zlitr_m:
4462				ab.PutOpBytesLit(z, &o.op)
4463				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4464
4465			case Zlitm_r:
4466				ab.PutOpBytesLit(z, &o.op)
4467				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4468
4469			case Zlit_m_r:
4470				ab.PutOpBytesLit(z, &o.op)
4471				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4472
4473			case Zmb_r:
4474				bytereg(&p.From, &p.Ft)
4475				fallthrough
4476
4477			case Zm_r:
4478				ab.Put1(byte(op))
4479				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4480
4481			case Z_m_r:
4482				ab.Put1(byte(op))
4483				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4484
4485			case Zm2_r:
4486				ab.Put2(byte(op), o.op[z+1])
4487				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4488
4489			case Zm_r_xm:
4490				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4491				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4492
4493			case Zm_r_xm_nr:
4494				ab.rexflag = 0
4495				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4496				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4497
4498			case Zm_r_i_xm:
4499				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4500				ab.asmand(ctxt, cursym, p, &p.From, p.GetFrom3())
4501				ab.Put1(byte(p.To.Offset))
4502
4503			case Zibm_r, Zibr_m:
4504				ab.PutOpBytesLit(z, &o.op)
4505				if yt.zcase == Zibr_m {
4506					ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4507				} else {
4508					ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4509				}
4510				switch {
4511				default:
4512					ab.Put1(byte(p.From.Offset))
4513				case yt.args[0] == Yi32 && o.prefix == Pe:
4514					ab.PutInt16(int16(p.From.Offset))
4515				case yt.args[0] == Yi32:
4516					ab.PutInt32(int32(p.From.Offset))
4517				}
4518
4519			case Zaut_r:
4520				ab.Put1(0x8d) // leal
4521				if p.From.Type != obj.TYPE_ADDR {
4522					ctxt.Diag("asmins: Zaut sb type ADDR")
4523				}
4524				p.From.Type = obj.TYPE_MEM
4525				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4526				p.From.Type = obj.TYPE_ADDR
4527
4528			case Zm_o:
4529				ab.Put1(byte(op))
4530				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4531
4532			case Zr_m:
4533				ab.Put1(byte(op))
4534				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4535
4536			case Zvex:
4537				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4538
4539			case Zvex_rm_v_r:
4540				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4541				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4542
4543			case Zvex_rm_v_ro:
4544				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4545				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))
4546
4547			case Zvex_i_rm_vo:
4548				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
4549				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+2]))
4550				ab.Put1(byte(p.From.Offset))
4551
4552			case Zvex_i_r_v:
4553				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
4554				regnum := byte(0x7)
4555				if p.GetFrom3().Reg >= REG_X0 && p.GetFrom3().Reg <= REG_X15 {
4556					regnum &= byte(p.GetFrom3().Reg - REG_X0)
4557				} else {
4558					regnum &= byte(p.GetFrom3().Reg - REG_Y0)
4559				}
4560				ab.Put1(o.op[z+2] | regnum)
4561				ab.Put1(byte(p.From.Offset))
4562
4563			case Zvex_i_rm_v_r:
4564				imm, from, from3, to := unpackOps4(p)
4565				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
4566				ab.asmand(ctxt, cursym, p, from, to)
4567				ab.Put1(byte(imm.Offset))
4568
4569			case Zvex_i_rm_r:
4570				ab.asmvex(ctxt, p.GetFrom3(), nil, &p.To, o.op[z], o.op[z+1])
4571				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4572				ab.Put1(byte(p.From.Offset))
4573
4574			case Zvex_v_rm_r:
4575				ab.asmvex(ctxt, p.GetFrom3(), &p.From, &p.To, o.op[z], o.op[z+1])
4576				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4577
4578			case Zvex_r_v_rm:
4579				ab.asmvex(ctxt, &p.To, p.GetFrom3(), &p.From, o.op[z], o.op[z+1])
4580				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4581
4582			case Zvex_rm_r_vo:
4583				ab.asmvex(ctxt, &p.From, &p.To, p.GetFrom3(), o.op[z], o.op[z+1])
4584				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))
4585
4586			case Zvex_i_r_rm:
4587				ab.asmvex(ctxt, &p.To, nil, p.GetFrom3(), o.op[z], o.op[z+1])
4588				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4589				ab.Put1(byte(p.From.Offset))
4590
4591			case Zvex_hr_rm_v_r:
4592				hr, from, from3, to := unpackOps4(p)
4593				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
4594				ab.asmand(ctxt, cursym, p, from, to)
4595				ab.Put1(byte(regIndex(hr.Reg) << 4))
4596
4597			case Zevex_k_rmo:
4598				ab.evex = newEVEXBits(z, &o.op)
4599				ab.asmevex(ctxt, p, &p.To, nil, nil, &p.From)
4600				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+3]))
4601
4602			case Zevex_i_rm_vo:
4603				ab.evex = newEVEXBits(z, &o.op)
4604				ab.asmevex(ctxt, p, p.GetFrom3(), &p.To, nil, nil)
4605				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+3]))
4606				ab.Put1(byte(p.From.Offset))
4607
4608			case Zevex_i_rm_k_vo:
4609				imm, from, kmask, to := unpackOps4(p)
4610				ab.evex = newEVEXBits(z, &o.op)
4611				ab.asmevex(ctxt, p, from, to, nil, kmask)
4612				ab.asmando(ctxt, cursym, p, from, int(o.op[z+3]))
4613				ab.Put1(byte(imm.Offset))
4614
4615			case Zevex_i_r_rm:
4616				ab.evex = newEVEXBits(z, &o.op)
4617				ab.asmevex(ctxt, p, &p.To, nil, p.GetFrom3(), nil)
4618				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4619				ab.Put1(byte(p.From.Offset))
4620
4621			case Zevex_i_r_k_rm:
4622				imm, from, kmask, to := unpackOps4(p)
4623				ab.evex = newEVEXBits(z, &o.op)
4624				ab.asmevex(ctxt, p, to, nil, from, kmask)
4625				ab.asmand(ctxt, cursym, p, to, from)
4626				ab.Put1(byte(imm.Offset))
4627
4628			case Zevex_i_rm_r:
4629				ab.evex = newEVEXBits(z, &o.op)
4630				ab.asmevex(ctxt, p, p.GetFrom3(), nil, &p.To, nil)
4631				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4632				ab.Put1(byte(p.From.Offset))
4633
4634			case Zevex_i_rm_k_r:
4635				imm, from, kmask, to := unpackOps4(p)
4636				ab.evex = newEVEXBits(z, &o.op)
4637				ab.asmevex(ctxt, p, from, nil, to, kmask)
4638				ab.asmand(ctxt, cursym, p, from, to)
4639				ab.Put1(byte(imm.Offset))
4640
4641			case Zevex_i_rm_v_r:
4642				imm, from, from3, to := unpackOps4(p)
4643				ab.evex = newEVEXBits(z, &o.op)
4644				ab.asmevex(ctxt, p, from, from3, to, nil)
4645				ab.asmand(ctxt, cursym, p, from, to)
4646				ab.Put1(byte(imm.Offset))
4647
4648			case Zevex_i_rm_v_k_r:
4649				imm, from, from3, kmask, to := unpackOps5(p)
4650				ab.evex = newEVEXBits(z, &o.op)
4651				ab.asmevex(ctxt, p, from, from3, to, kmask)
4652				ab.asmand(ctxt, cursym, p, from, to)
4653				ab.Put1(byte(imm.Offset))
4654
4655			case Zevex_r_v_rm:
4656				ab.evex = newEVEXBits(z, &o.op)
4657				ab.asmevex(ctxt, p, &p.To, p.GetFrom3(), &p.From, nil)
4658				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4659
4660			case Zevex_rm_v_r:
4661				ab.evex = newEVEXBits(z, &o.op)
4662				ab.asmevex(ctxt, p, &p.From, p.GetFrom3(), &p.To, nil)
4663				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4664
4665			case Zevex_rm_k_r:
4666				ab.evex = newEVEXBits(z, &o.op)
4667				ab.asmevex(ctxt, p, &p.From, nil, &p.To, p.GetFrom3())
4668				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4669
4670			case Zevex_r_k_rm:
4671				ab.evex = newEVEXBits(z, &o.op)
4672				ab.asmevex(ctxt, p, &p.To, nil, &p.From, p.GetFrom3())
4673				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4674
4675			case Zevex_rm_v_k_r:
4676				from, from3, kmask, to := unpackOps4(p)
4677				ab.evex = newEVEXBits(z, &o.op)
4678				ab.asmevex(ctxt, p, from, from3, to, kmask)
4679				ab.asmand(ctxt, cursym, p, from, to)
4680
4681			case Zevex_r_v_k_rm:
4682				from, from3, kmask, to := unpackOps4(p)
4683				ab.evex = newEVEXBits(z, &o.op)
4684				ab.asmevex(ctxt, p, to, from3, from, kmask)
4685				ab.asmand(ctxt, cursym, p, to, from)
4686
4687			case Zr_m_xm:
4688				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4689				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4690
4691			case Zr_m_xm_nr:
4692				ab.rexflag = 0
4693				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4694				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4695
4696			case Zo_m:
4697				ab.Put1(byte(op))
4698				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4699
4700			case Zcallindreg:
4701				r = obj.Addrel(cursym)
4702				r.Off = int32(p.Pc)
4703				r.Type = objabi.R_CALLIND
4704				r.Siz = 0
4705				fallthrough
4706
4707			case Zo_m64:
4708				ab.Put1(byte(op))
4709				ab.asmandsz(ctxt, cursym, p, &p.To, int(o.op[z+1]), 0, 1)
4710
4711			case Zm_ibo:
4712				ab.Put1(byte(op))
4713				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4714				ab.Put1(byte(vaddr(ctxt, p, &p.To, nil)))
4715
4716			case Zibo_m:
4717				ab.Put1(byte(op))
4718				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4719				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4720
4721			case Zibo_m_xm:
4722				z = ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4723				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4724				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4725
4726			case Z_ib, Zib_:
4727				if yt.zcase == Zib_ {
4728					a = &p.From
4729				} else {
4730					a = &p.To
4731				}
4732				ab.Put1(byte(op))
4733				if p.As == AXABORT {
4734					ab.Put1(o.op[z+1])
4735				}
4736				ab.Put1(byte(vaddr(ctxt, p, a, nil)))
4737
4738			case Zib_rp:
4739				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
4740				ab.Put2(byte(op+reg[p.To.Reg]), byte(vaddr(ctxt, p, &p.From, nil)))
4741
4742			case Zil_rp:
4743				ab.rexflag |= regrex[p.To.Reg] & Rxb
4744				ab.Put1(byte(op + reg[p.To.Reg]))
4745				if o.prefix == Pe {
4746					v = vaddr(ctxt, p, &p.From, nil)
4747					ab.PutInt16(int16(v))
4748				} else {
4749					ab.relput4(ctxt, cursym, p, &p.From)
4750				}
4751
4752			case Zo_iw:
4753				ab.Put1(byte(op))
4754				if p.From.Type != obj.TYPE_NONE {
4755					v = vaddr(ctxt, p, &p.From, nil)
4756					ab.PutInt16(int16(v))
4757				}
4758
4759			case Ziq_rp:
4760				v = vaddr(ctxt, p, &p.From, &rel)
4761				l = int(v >> 32)
4762				if l == 0 && rel.Siz != 8 {
4763					ab.rexflag &^= (0x40 | Rxw)
4764
4765					ab.rexflag |= regrex[p.To.Reg] & Rxb
4766					ab.Put1(byte(0xb8 + reg[p.To.Reg]))
4767					if rel.Type != 0 {
4768						r = obj.Addrel(cursym)
4769						*r = rel
4770						r.Off = int32(p.Pc + int64(ab.Len()))
4771					}
4772
4773					ab.PutInt32(int32(v))
4774				} else if l == -1 && uint64(v)&(uint64(1)<<31) != 0 { // sign extend
4775					ab.Put1(0xc7)
4776					ab.asmando(ctxt, cursym, p, &p.To, 0)
4777
4778					ab.PutInt32(int32(v)) // need all 8
4779				} else {
4780					ab.rexflag |= regrex[p.To.Reg] & Rxb
4781					ab.Put1(byte(op + reg[p.To.Reg]))
4782					if rel.Type != 0 {
4783						r = obj.Addrel(cursym)
4784						*r = rel
4785						r.Off = int32(p.Pc + int64(ab.Len()))
4786					}
4787
4788					ab.PutInt64(v)
4789				}
4790
4791			case Zib_rr:
4792				ab.Put1(byte(op))
4793				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
4794				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4795
4796			case Z_il, Zil_:
4797				if yt.zcase == Zil_ {
4798					a = &p.From
4799				} else {
4800					a = &p.To
4801				}
4802				ab.Put1(byte(op))
4803				if o.prefix == Pe {
4804					v = vaddr(ctxt, p, a, nil)
4805					ab.PutInt16(int16(v))
4806				} else {
4807					ab.relput4(ctxt, cursym, p, a)
4808				}
4809
4810			case Zm_ilo, Zilo_m:
4811				ab.Put1(byte(op))
4812				if yt.zcase == Zilo_m {
4813					a = &p.From
4814					ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4815				} else {
4816					a = &p.To
4817					ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4818				}
4819
4820				if o.prefix == Pe {
4821					v = vaddr(ctxt, p, a, nil)
4822					ab.PutInt16(int16(v))
4823				} else {
4824					ab.relput4(ctxt, cursym, p, a)
4825				}
4826
4827			case Zil_rr:
4828				ab.Put1(byte(op))
4829				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
4830				if o.prefix == Pe {
4831					v = vaddr(ctxt, p, &p.From, nil)
4832					ab.PutInt16(int16(v))
4833				} else {
4834					ab.relput4(ctxt, cursym, p, &p.From)
4835				}
4836
4837			case Z_rp:
4838				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
4839				ab.Put1(byte(op + reg[p.To.Reg]))
4840
4841			case Zrp_:
4842				ab.rexflag |= regrex[p.From.Reg] & (Rxb | 0x40)
4843				ab.Put1(byte(op + reg[p.From.Reg]))
4844
4845			case Zcallcon, Zjmpcon:
4846				if yt.zcase == Zcallcon {
4847					ab.Put1(byte(op))
4848				} else {
4849					ab.Put1(o.op[z+1])
4850				}
4851				r = obj.Addrel(cursym)
4852				r.Off = int32(p.Pc + int64(ab.Len()))
4853				r.Type = objabi.R_PCREL
4854				r.Siz = 4
4855				r.Add = p.To.Offset
4856				ab.PutInt32(0)
4857
4858			case Zcallind:
4859				ab.Put2(byte(op), o.op[z+1])
4860				r = obj.Addrel(cursym)
4861				r.Off = int32(p.Pc + int64(ab.Len()))
4862				if ctxt.Arch.Family == sys.AMD64 {
4863					r.Type = objabi.R_PCREL
4864				} else {
4865					r.Type = objabi.R_ADDR
4866				}
4867				r.Siz = 4
4868				r.Add = p.To.Offset
4869				r.Sym = p.To.Sym
4870				ab.PutInt32(0)
4871
4872			case Zcall, Zcallduff:
4873				if p.To.Sym == nil {
4874					ctxt.Diag("call without target")
4875					ctxt.DiagFlush()
4876					log.Fatalf("bad code")
4877				}
4878
4879				if yt.zcase == Zcallduff && ctxt.Flag_dynlink {
4880					ctxt.Diag("directly calling duff when dynamically linking Go")
4881				}
4882
4883				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
4884					// Maintain BP around call, since duffcopy/duffzero can't do it
4885					// (the call jumps into the middle of the function).
4886					// This makes it possible to see call sites for duffcopy/duffzero in
4887					// BP-based profiling tools like Linux perf (which is the
4888					// whole point of maintaining frame pointers in Go).
4889					// MOVQ BP, -16(SP)
4890					// LEAQ -16(SP), BP
4891					ab.Put(bpduff1)
4892				}
4893				ab.Put1(byte(op))
4894				r = obj.Addrel(cursym)
4895				r.Off = int32(p.Pc + int64(ab.Len()))
4896				r.Sym = p.To.Sym
4897				r.Add = p.To.Offset
4898				r.Type = objabi.R_CALL
4899				r.Siz = 4
4900				ab.PutInt32(0)
4901
4902				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
4903					// Pop BP pushed above.
4904					// MOVQ 0(BP), BP
4905					ab.Put(bpduff2)
4906				}
4907
4908			// TODO: jump across functions needs reloc
4909			case Zbr, Zjmp, Zloop:
4910				if p.As == AXBEGIN {
4911					ab.Put1(byte(op))
4912				}
4913				if p.To.Sym != nil {
4914					if yt.zcase != Zjmp {
4915						ctxt.Diag("branch to ATEXT")
4916						ctxt.DiagFlush()
4917						log.Fatalf("bad code")
4918					}
4919
4920					ab.Put1(o.op[z+1])
4921					r = obj.Addrel(cursym)
4922					r.Off = int32(p.Pc + int64(ab.Len()))
4923					r.Sym = p.To.Sym
4924					// Note: R_CALL instead of R_PCREL. R_CALL is more permissive in that
4925					// it can point to a trampoline instead of the destination itself.
4926					r.Type = objabi.R_CALL
4927					r.Siz = 4
4928					ab.PutInt32(0)
4929					break
4930				}
4931
4932				// Assumes q is in this function.
4933				// TODO: Check in input, preserve in brchain.
4934
4935				// Fill in backward jump now.
4936				q = p.To.Target()
4937
4938				if q == nil {
4939					ctxt.Diag("jmp/branch/loop without target")
4940					ctxt.DiagFlush()
4941					log.Fatalf("bad code")
4942				}
4943
4944				if p.Back&branchBackwards != 0 {
4945					v = q.Pc - (p.Pc + 2)
4946					if v >= -128 && p.As != AXBEGIN {
4947						if p.As == AJCXZL {
4948							ab.Put1(0x67)
4949						}
4950						ab.Put2(byte(op), byte(v))
4951					} else if yt.zcase == Zloop {
4952						ctxt.Diag("loop too far: %v", p)
4953					} else {
4954						v -= 5 - 2
4955						if p.As == AXBEGIN {
4956							v--
4957						}
4958						if yt.zcase == Zbr {
4959							ab.Put1(0x0f)
4960							v--
4961						}
4962
4963						ab.Put1(o.op[z+1])
4964						ab.PutInt32(int32(v))
4965					}
4966
4967					break
4968				}
4969
4970				// Annotate target; will fill in later.
4971				p.Forwd = q.Rel
4972
4973				q.Rel = p
4974				if p.Back&branchShort != 0 && p.As != AXBEGIN {
4975					if p.As == AJCXZL {
4976						ab.Put1(0x67)
4977					}
4978					ab.Put2(byte(op), 0)
4979				} else if yt.zcase == Zloop {
4980					ctxt.Diag("loop too far: %v", p)
4981				} else {
4982					if yt.zcase == Zbr {
4983						ab.Put1(0x0f)
4984					}
4985					ab.Put1(o.op[z+1])
4986					ab.PutInt32(0)
4987				}
4988
4989			case Zbyte:
4990				v = vaddr(ctxt, p, &p.From, &rel)
4991				if rel.Siz != 0 {
4992					rel.Siz = uint8(op)
4993					r = obj.Addrel(cursym)
4994					*r = rel
4995					r.Off = int32(p.Pc + int64(ab.Len()))
4996				}
4997
4998				ab.Put1(byte(v))
4999				if op > 1 {
5000					ab.Put1(byte(v >> 8))
5001					if op > 2 {
5002						ab.PutInt16(int16(v >> 16))
5003						if op > 4 {
5004							ab.PutInt32(int32(v >> 32))
5005						}
5006					}
5007				}
5008			}
5009
5010			return
5011		}
5012	}
5013	f3t = Ynone * Ymax
5014	if p.GetFrom3() != nil {
5015		f3t = oclass(ctxt, p, p.GetFrom3()) * Ymax
5016	}
5017	for mo := ymovtab; mo[0].as != 0; mo = mo[1:] {
5018		var pp obj.Prog
5019		var t []byte
5020		if p.As == mo[0].as {
5021			if ycover[ft+int(mo[0].ft)] != 0 && ycover[f3t+int(mo[0].f3t)] != 0 && ycover[tt+int(mo[0].tt)] != 0 {
5022				t = mo[0].op[:]
5023				switch mo[0].code {
5024				default:
5025					ctxt.Diag("asmins: unknown mov %d %v", mo[0].code, p)
5026
5027				case movLit:
5028					for z = 0; t[z] != 0; z++ {
5029						ab.Put1(t[z])
5030					}
5031
5032				case movRegMem:
5033					ab.Put1(t[0])
5034					ab.asmando(ctxt, cursym, p, &p.To, int(t[1]))
5035
5036				case movMemReg:
5037					ab.Put1(t[0])
5038					ab.asmando(ctxt, cursym, p, &p.From, int(t[1]))
5039
5040				case movRegMem2op: // r,m - 2op
5041					ab.Put2(t[0], t[1])
5042					ab.asmando(ctxt, cursym, p, &p.To, int(t[2]))
5043					ab.rexflag |= regrex[p.From.Reg] & (Rxr | 0x40)
5044
5045				case movMemReg2op:
5046					ab.Put2(t[0], t[1])
5047					ab.asmando(ctxt, cursym, p, &p.From, int(t[2]))
5048					ab.rexflag |= regrex[p.To.Reg] & (Rxr | 0x40)
5049
5050				case movFullPtr:
5051					if t[0] != 0 {
5052						ab.Put1(t[0])
5053					}
5054					switch p.To.Index {
5055					default:
5056						goto bad
5057
5058					case REG_DS:
5059						ab.Put1(0xc5)
5060
5061					case REG_SS:
5062						ab.Put2(0x0f, 0xb2)
5063
5064					case REG_ES:
5065						ab.Put1(0xc4)
5066
5067					case REG_FS:
5068						ab.Put2(0x0f, 0xb4)
5069
5070					case REG_GS:
5071						ab.Put2(0x0f, 0xb5)
5072					}
5073
5074					ab.asmand(ctxt, cursym, p, &p.From, &p.To)
5075
5076				case movDoubleShift:
5077					if t[0] == Pw {
5078						if ctxt.Arch.Family != sys.AMD64 {
5079							ctxt.Diag("asmins: illegal 64: %v", p)
5080						}
5081						ab.rexflag |= Pw
5082						t = t[1:]
5083					} else if t[0] == Pe {
5084						ab.Put1(Pe)
5085						t = t[1:]
5086					}
5087
5088					switch p.From.Type {
5089					default:
5090						goto bad
5091
5092					case obj.TYPE_CONST:
5093						ab.Put2(0x0f, t[0])
5094						ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
5095						ab.Put1(byte(p.From.Offset))
5096
5097					case obj.TYPE_REG:
5098						switch p.From.Reg {
5099						default:
5100							goto bad
5101
5102						case REG_CL, REG_CX:
5103							ab.Put2(0x0f, t[1])
5104							ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
5105						}
5106					}
5107
5108				// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
5109				// where you load the TLS base register into a register and then index off that
5110				// register to access the actual TLS variables. Systems that allow direct TLS access
5111				// are handled in prefixof above and should not be listed here.
5112				case movTLSReg:
5113					if ctxt.Arch.Family == sys.AMD64 && p.As != AMOVQ || ctxt.Arch.Family == sys.I386 && p.As != AMOVL {
5114						ctxt.Diag("invalid load of TLS: %v", p)
5115					}
5116
5117					if ctxt.Arch.Family == sys.I386 {
5118						// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
5119						// where you load the TLS base register into a register and then index off that
5120						// register to access the actual TLS variables. Systems that allow direct TLS access
5121						// are handled in prefixof above and should not be listed here.
5122						switch ctxt.Headtype {
5123						default:
5124							log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)
5125
5126						case objabi.Hlinux, objabi.Hfreebsd:
5127							if ctxt.Flag_shared {
5128								// Note that this is not generating the same insns as the other cases.
5129								//     MOV TLS, dst
5130								// becomes
5131								//     call __x86.get_pc_thunk.dst
5132								//     movl (gotpc + g@gotntpoff)(dst), dst
5133								// which is encoded as
5134								//     call __x86.get_pc_thunk.dst
5135								//     movq 0(dst), dst
5136								// and R_CALL & R_TLS_IE relocs. This all assumes the only tls variable we access
5137								// is g, which we can't check here, but will when we assemble the second
5138								// instruction.
5139								dst := p.To.Reg
5140								ab.Put1(0xe8)
5141								r = obj.Addrel(cursym)
5142								r.Off = int32(p.Pc + int64(ab.Len()))
5143								r.Type = objabi.R_CALL
5144								r.Siz = 4
5145								r.Sym = ctxt.Lookup("__x86.get_pc_thunk." + strings.ToLower(rconv(int(dst))))
5146								ab.PutInt32(0)
5147
5148								ab.Put2(0x8B, byte(2<<6|reg[dst]|(reg[dst]<<3)))
5149								r = obj.Addrel(cursym)
5150								r.Off = int32(p.Pc + int64(ab.Len()))
5151								r.Type = objabi.R_TLS_IE
5152								r.Siz = 4
5153								r.Add = 2
5154								ab.PutInt32(0)
5155							} else {
5156								// ELF TLS base is 0(GS).
5157								pp.From = p.From
5158
5159								pp.From.Type = obj.TYPE_MEM
5160								pp.From.Reg = REG_GS
5161								pp.From.Offset = 0
5162								pp.From.Index = REG_NONE
5163								pp.From.Scale = 0
5164								ab.Put2(0x65, // GS
5165									0x8B)
5166								ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5167							}
5168						case objabi.Hplan9:
5169							pp.From = obj.Addr{}
5170							pp.From.Type = obj.TYPE_MEM
5171							pp.From.Name = obj.NAME_EXTERN
5172							pp.From.Sym = plan9privates
5173							pp.From.Offset = 0
5174							pp.From.Index = REG_NONE
5175							ab.Put1(0x8B)
5176							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5177						}
5178						break
5179					}
5180
5181					switch ctxt.Headtype {
5182					default:
5183						log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)
5184
5185					case objabi.Hlinux, objabi.Hfreebsd:
5186						if !ctxt.Flag_shared {
5187							log.Fatalf("unknown TLS base location for linux/freebsd without -shared")
5188						}
5189						// Note that this is not generating the same insn as the other cases.
5190						//     MOV TLS, R_to
5191						// becomes
5192						//     movq g@gottpoff(%rip), R_to
5193						// which is encoded as
5194						//     movq 0(%rip), R_to
5195						// and a R_TLS_IE reloc. This all assumes the only tls variable we access
5196						// is g, which we can't check here, but will when we assemble the second
5197						// instruction.
5198						ab.rexflag = Pw | (regrex[p.To.Reg] & Rxr)
5199
5200						ab.Put2(0x8B, byte(0x05|(reg[p.To.Reg]<<3)))
5201						r = obj.Addrel(cursym)
5202						r.Off = int32(p.Pc + int64(ab.Len()))
5203						r.Type = objabi.R_TLS_IE
5204						r.Siz = 4
5205						r.Add = -4
5206						ab.PutInt32(0)
5207
5208					case objabi.Hplan9:
5209						pp.From = obj.Addr{}
5210						pp.From.Type = obj.TYPE_MEM
5211						pp.From.Name = obj.NAME_EXTERN
5212						pp.From.Sym = plan9privates
5213						pp.From.Offset = 0
5214						pp.From.Index = REG_NONE
5215						ab.rexflag |= Pw
5216						ab.Put1(0x8B)
5217						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5218
5219					case objabi.Hsolaris: // TODO(rsc): Delete Hsolaris from list. Should not use this code. See progedit in obj6.c.
5220						// TLS base is 0(FS).
5221						pp.From = p.From
5222
5223						pp.From.Type = obj.TYPE_MEM
5224						pp.From.Name = obj.NAME_NONE
5225						pp.From.Reg = REG_NONE
5226						pp.From.Offset = 0
5227						pp.From.Index = REG_NONE
5228						pp.From.Scale = 0
5229						ab.rexflag |= Pw
5230						ab.Put2(0x64, // FS
5231							0x8B)
5232						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5233					}
5234				}
5235				return
5236			}
5237		}
5238	}
5239	goto bad
5240
5241bad:
5242	if ctxt.Arch.Family != sys.AMD64 {
5243		// here, the assembly has failed.
5244		// if it's a byte instruction that has
5245		// unaddressable registers, try to
5246		// exchange registers and reissue the
5247		// instruction with the operands renamed.
5248		pp := *p
5249
5250		unbytereg(&pp.From, &pp.Ft)
5251		unbytereg(&pp.To, &pp.Tt)
5252
5253		z := int(p.From.Reg)
5254		if p.From.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
5255			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
5256			// For now, different to keep bit-for-bit compatibility.
5257			if ctxt.Arch.Family == sys.I386 {
5258				breg := byteswapreg(ctxt, &p.To)
5259				if breg != REG_AX {
5260					ab.Put1(0x87) // xchg lhs,bx
5261					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
5262					subreg(&pp, z, breg)
5263					ab.doasm(ctxt, cursym, &pp)
5264					ab.Put1(0x87) // xchg lhs,bx
5265					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
5266				} else {
5267					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5268					subreg(&pp, z, REG_AX)
5269					ab.doasm(ctxt, cursym, &pp)
5270					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5271				}
5272				return
5273			}
5274
5275			if isax(&p.To) || p.To.Type == obj.TYPE_NONE {
5276				// We certainly don't want to exchange
5277				// with AX if the op is MUL or DIV.
5278				ab.Put1(0x87) // xchg lhs,bx
5279				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
5280				subreg(&pp, z, REG_BX)
5281				ab.doasm(ctxt, cursym, &pp)
5282				ab.Put1(0x87) // xchg lhs,bx
5283				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
5284			} else {
5285				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5286				subreg(&pp, z, REG_AX)
5287				ab.doasm(ctxt, cursym, &pp)
5288				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5289			}
5290			return
5291		}
5292
5293		z = int(p.To.Reg)
5294		if p.To.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
5295			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
5296			// For now, different to keep bit-for-bit compatibility.
5297			if ctxt.Arch.Family == sys.I386 {
5298				breg := byteswapreg(ctxt, &p.From)
5299				if breg != REG_AX {
5300					ab.Put1(0x87) //xchg rhs,bx
5301					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
5302					subreg(&pp, z, breg)
5303					ab.doasm(ctxt, cursym, &pp)
5304					ab.Put1(0x87) // xchg rhs,bx
5305					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
5306				} else {
5307					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5308					subreg(&pp, z, REG_AX)
5309					ab.doasm(ctxt, cursym, &pp)
5310					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5311				}
5312				return
5313			}
5314
5315			if isax(&p.From) {
5316				ab.Put1(0x87) // xchg rhs,bx
5317				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
5318				subreg(&pp, z, REG_BX)
5319				ab.doasm(ctxt, cursym, &pp)
5320				ab.Put1(0x87) // xchg rhs,bx
5321				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
5322			} else {
5323				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5324				subreg(&pp, z, REG_AX)
5325				ab.doasm(ctxt, cursym, &pp)
5326				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5327			}
5328			return
5329		}
5330	}
5331
5332	ctxt.Diag("%s: invalid instruction: %v", cursym.Name, p)
5333}
5334
5335// byteswapreg returns a byte-addressable register (AX, BX, CX, DX)
5336// which is not referenced in a.
5337// If a is empty, it returns BX to account for MULB-like instructions
5338// that might use DX and AX.
5339func byteswapreg(ctxt *obj.Link, a *obj.Addr) int {
5340	cana, canb, canc, cand := true, true, true, true
5341	if a.Type == obj.TYPE_NONE {
5342		cana, cand = false, false
5343	}
5344
5345	if a.Type == obj.TYPE_REG || ((a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Name == obj.NAME_NONE) {
5346		switch a.Reg {
5347		case REG_NONE:
5348			cana, cand = false, false
5349		case REG_AX, REG_AL, REG_AH:
5350			cana = false
5351		case REG_BX, REG_BL, REG_BH:
5352			canb = false
5353		case REG_CX, REG_CL, REG_CH:
5354			canc = false
5355		case REG_DX, REG_DL, REG_DH:
5356			cand = false
5357		}
5358	}
5359
5360	if a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR {
5361		switch a.Index {
5362		case REG_AX:
5363			cana = false
5364		case REG_BX:
5365			canb = false
5366		case REG_CX:
5367			canc = false
5368		case REG_DX:
5369			cand = false
5370		}
5371	}
5372
5373	switch {
5374	case cana:
5375		return REG_AX
5376	case canb:
5377		return REG_BX
5378	case canc:
5379		return REG_CX
5380	case cand:
5381		return REG_DX
5382	default:
5383		ctxt.Diag("impossible byte register")
5384		ctxt.DiagFlush()
5385		log.Fatalf("bad code")
5386		return 0
5387	}
5388}
5389
5390func isbadbyte(a *obj.Addr) bool {
5391	return a.Type == obj.TYPE_REG && (REG_BP <= a.Reg && a.Reg <= REG_DI || REG_BPB <= a.Reg && a.Reg <= REG_DIB)
5392}
5393
5394func (ab *AsmBuf) asmins(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
5395	ab.Reset()
5396
5397	ab.rexflag = 0
5398	ab.vexflag = false
5399	ab.evexflag = false
5400	mark := ab.Len()
5401	ab.doasm(ctxt, cursym, p)
5402	if ab.rexflag != 0 && !ab.vexflag && !ab.evexflag {
5403		// as befits the whole approach of the architecture,
5404		// the rex prefix must appear before the first opcode byte
5405		// (and thus after any 66/67/f2/f3/26/2e/3e prefix bytes, but
5406		// before the 0f opcode escape!), or it might be ignored.
5407		// note that the handbook often misleadingly shows 66/f2/f3 in `opcode'.
5408		if ctxt.Arch.Family != sys.AMD64 {
5409			ctxt.Diag("asmins: illegal in mode %d: %v (%d %d)", ctxt.Arch.RegSize*8, p, p.Ft, p.Tt)
5410		}
5411		n := ab.Len()
5412		var np int
5413		for np = mark; np < n; np++ {
5414			c := ab.At(np)
5415			if c != 0xf2 && c != 0xf3 && (c < 0x64 || c > 0x67) && c != 0x2e && c != 0x3e && c != 0x26 {
5416				break
5417			}
5418		}
5419		ab.Insert(np, byte(0x40|ab.rexflag))
5420	}
5421
5422	n := ab.Len()
5423	for i := len(cursym.R) - 1; i >= 0; i-- {
5424		r := &cursym.R[i]
5425		if int64(r.Off) < p.Pc {
5426			break
5427		}
5428		if ab.rexflag != 0 && !ab.vexflag && !ab.evexflag {
5429			r.Off++
5430		}
5431		if r.Type == objabi.R_PCREL {
5432			if ctxt.Arch.Family == sys.AMD64 || p.As == obj.AJMP || p.As == obj.ACALL {
5433				// PC-relative addressing is relative to the end of the instruction,
5434				// but the relocations applied by the linker are relative to the end
5435				// of the relocation. Because immediate instruction
5436				// arguments can follow the PC-relative memory reference in the
5437				// instruction encoding, the two may not coincide. In this case,
5438				// adjust addend so that linker can keep relocating relative to the
5439				// end of the relocation.
5440				r.Add -= p.Pc + int64(n) - (int64(r.Off) + int64(r.Siz))
5441			} else if ctxt.Arch.Family == sys.I386 {
5442				// On 386 PC-relative addressing (for non-call/jmp instructions)
5443				// assumes that the previous instruction loaded the PC of the end
5444				// of that instruction into CX, so the adjustment is relative to
5445				// that.
5446				r.Add += int64(r.Off) - p.Pc + int64(r.Siz)
5447			}
5448		}
5449		if r.Type == objabi.R_GOTPCREL && ctxt.Arch.Family == sys.I386 {
5450			// On 386, R_GOTPCREL makes the same assumptions as R_PCREL.
5451			r.Add += int64(r.Off) - p.Pc + int64(r.Siz)
5452		}
5453
5454	}
5455}
5456
5457// unpackOps4 extracts 4 operands from p.
5458func unpackOps4(p *obj.Prog) (arg0, arg1, arg2, dst *obj.Addr) {
5459	return &p.From, &p.RestArgs[0].Addr, &p.RestArgs[1].Addr, &p.To
5460}
5461
5462// unpackOps5 extracts 5 operands from p.
5463func unpackOps5(p *obj.Prog) (arg0, arg1, arg2, arg3, dst *obj.Addr) {
5464	return &p.From, &p.RestArgs[0].Addr, &p.RestArgs[1].Addr, &p.RestArgs[2].Addr, &p.To
5465}
5466