1// Copyright 2014 The Go Authors.  All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Table-driven decoding of x86 instructions.
6
7package x86asm
8
9import (
10	"encoding/binary"
11	"errors"
12	"fmt"
13	"runtime"
14)
15
16// Set trace to true to cause the decoder to print the PC sequence
17// of the executed instruction codes. This is typically only useful
18// when you are running a test of a single input case.
19const trace = false
20
21// A decodeOp is a single instruction in the decoder bytecode program.
22//
23// The decodeOps correspond to consuming and conditionally branching
24// on input bytes, consuming additional fields, and then interpreting
25// consumed data as instruction arguments. The names of the xRead and xArg
26// operations are taken from the Intel manual conventions, for example
27// Volume 2, Section 3.1.1, page 487 of
28// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
29//
30// The actual decoding program is generated by ../x86map.
31//
32// TODO(rsc): We may be able to merge various of the memory operands
33// since we don't care about, say, the distinction between m80dec and m80bcd.
34// Similarly, mm and mm1 have identical meaning, as do xmm and xmm1.
35
// decodeOp identifies one operation of the decoder bytecode program.
// decode1 converts each fetched program word to a decodeOp and
// switches on it.
type decodeOp uint16

// The operation codes. Their numeric values are assigned by iota,
// so the order of the declarations below is significant.
const (
	// Control flow: stop decoding or transfer to another pc.
	xFail  decodeOp = iota // invalid instruction (return)
	xMatch                 // completed match
	xJump                  // jump to pc

	// Conditional branches.
	xCondByte     // switch on instruction byte value
	xCondSlashR   // read and switch on instruction /r value
	xCondPrefix   // switch on presence of instruction prefix
	xCondIs64     // switch on 64-bit processor mode
	xCondDataSize // switch on operand size
	xCondAddrSize // switch on address size
	xCondIsMem    // switch on memory vs register argument

	// Output: record the instruction opcode.
	xSetOp // set instruction opcode

	// Input: consume additional instruction bytes.
	xReadSlashR // read /r
	xReadIb     // read ib
	xReadIw     // read iw
	xReadId     // read id
	xReadIo     // read io
	xReadCb     // read cb
	xReadCw     // read cw
	xReadCd     // read cd
	xReadCp     // read cp
	xReadCm     // read cm

	// Output: append an argument to the instruction.
	xArg1            // arg 1
	xArg3            // arg 3
	xArgAL           // arg AL
	xArgAX           // arg AX
	xArgCL           // arg CL
	xArgCR0dashCR7   // arg CR0-CR7
	xArgCS           // arg CS
	xArgDR0dashDR7   // arg DR0-DR7
	xArgDS           // arg DS
	xArgDX           // arg DX
	xArgEAX          // arg EAX
	xArgEDX          // arg EDX
	xArgES           // arg ES
	xArgFS           // arg FS
	xArgGS           // arg GS
	xArgImm16        // arg imm16
	xArgImm32        // arg imm32
	xArgImm64        // arg imm64
	xArgImm8         // arg imm8
	xArgImm8u        // arg imm8 but record as unsigned
	xArgImm16u       // arg imm16 but record as unsigned
	xArgM            // arg m
	xArgM128         // arg m128
	xArgM256         // arg m256
	xArgM1428byte    // arg m14/28byte
	xArgM16          // arg m16
	xArgM16and16     // arg m16&16
	xArgM16and32     // arg m16&32
	xArgM16and64     // arg m16&64
	xArgM16colon16   // arg m16:16
	xArgM16colon32   // arg m16:32
	xArgM16colon64   // arg m16:64
	xArgM16int       // arg m16int
	xArgM2byte       // arg m2byte
	xArgM32          // arg m32
	xArgM32and32     // arg m32&32
	xArgM32fp        // arg m32fp
	xArgM32int       // arg m32int
	xArgM512byte     // arg m512byte
	xArgM64          // arg m64
	xArgM64fp        // arg m64fp
	xArgM64int       // arg m64int
	xArgM8           // arg m8
	xArgM80bcd       // arg m80bcd
	xArgM80dec       // arg m80dec
	xArgM80fp        // arg m80fp
	xArgM94108byte   // arg m94/108byte
	xArgMm           // arg mm
	xArgMm1          // arg mm1
	xArgMm2          // arg mm2
	xArgMm2M64       // arg mm2/m64
	xArgMmM32        // arg mm/m32
	xArgMmM64        // arg mm/m64
	xArgMem          // arg mem
	xArgMoffs16      // arg moffs16
	xArgMoffs32      // arg moffs32
	xArgMoffs64      // arg moffs64
	xArgMoffs8       // arg moffs8
	xArgPtr16colon16 // arg ptr16:16
	xArgPtr16colon32 // arg ptr16:32
	xArgR16          // arg r16
	xArgR16op        // arg r16 with +rw in opcode
	xArgR32          // arg r32
	xArgR32M16       // arg r32/m16
	xArgR32M8        // arg r32/m8
	xArgR32op        // arg r32 with +rd in opcode
	xArgR64          // arg r64
	xArgR64M16       // arg r64/m16
	xArgR64op        // arg r64 with +rd in opcode
	xArgR8           // arg r8
	xArgR8op         // arg r8 with +rb in opcode
	xArgRAX          // arg RAX
	xArgRDX          // arg RDX
	xArgRM           // arg r/m
	xArgRM16         // arg r/m16
	xArgRM32         // arg r/m32
	xArgRM64         // arg r/m64
	xArgRM8          // arg r/m8
	xArgReg          // arg reg
	xArgRegM16       // arg reg/m16
	xArgRegM32       // arg reg/m32
	xArgRegM8        // arg reg/m8
	xArgRel16        // arg rel16
	xArgRel32        // arg rel32
	xArgRel8         // arg rel8
	xArgSS           // arg SS
	xArgST           // arg ST, aka ST(0)
	xArgSTi          // arg ST(i) with +i in opcode
	xArgSreg         // arg Sreg
	xArgTR0dashTR7   // arg TR0-TR7
	xArgXmm          // arg xmm
	xArgXMM0         // arg <XMM0>
	xArgXmm1         // arg xmm1
	xArgXmm2         // arg xmm2
	xArgXmm2M128     // arg xmm2/m128
	xArgYmm2M256     // arg ymm2/m256
	xArgXmm2M16      // arg xmm2/m16
	xArgXmm2M32      // arg xmm2/m32
	xArgXmm2M64      // arg xmm2/m64
	xArgXmmM128      // arg xmm/m128
	xArgXmmM32       // arg xmm/m32
	xArgXmmM64       // arg xmm/m64
	xArgYmm1         // arg ymm1
	xArgRmf16        // arg r/m16 but force mod=3
	xArgRmf32        // arg r/m32 but force mod=3
	xArgRmf64        // arg r/m64 but force mod=3
)
171
172// instPrefix returns an Inst describing just one prefix byte.
173// It is only used if there is a prefix followed by an unintelligible
174// or invalid instruction byte sequence.
175func instPrefix(b byte, mode int) (Inst, error) {
176	// When tracing it is useful to see what called instPrefix to report an error.
177	if trace {
178		_, file, line, _ := runtime.Caller(1)
179		fmt.Printf("%s:%d\n", file, line)
180	}
181	p := Prefix(b)
182	switch p {
183	case PrefixDataSize:
184		if mode == 16 {
185			p = PrefixData32
186		} else {
187			p = PrefixData16
188		}
189	case PrefixAddrSize:
190		if mode == 32 {
191			p = PrefixAddr16
192		} else {
193			p = PrefixAddr32
194		}
195	}
196	// Note: using composite literal with Prefix key confuses 'bundle' tool.
197	inst := Inst{Len: 1}
198	inst.Prefix = Prefixes{p}
199	return inst, nil
200}
201
202// truncated reports a truncated instruction.
203// For now we use instPrefix but perhaps later we will return
204// a specific error here.
205func truncated(src []byte, mode int) (Inst, error) {
206	if len(src) == 0 {
207		return Inst{}, ErrTruncated
208	}
209	return instPrefix(src[0], mode) // too long
210}
211
// These are the errors returned by Decode.
//
// ErrInvalidMode is returned when the mode argument is not 16, 32, or 64.
// ErrTruncated is returned when an instruction is truncated and src
// holds no bytes at all.
// ErrUnrecognized is not referenced in this part of the file; presumably
// it reports bytes that match no known instruction (TODO confirm).
var (
	ErrInvalidMode  = errors.New("invalid x86 mode in Decode")
	ErrTruncated    = errors.New("truncated instruction")
	ErrUnrecognized = errors.New("unrecognized instruction")
)
218
// decoderCover records coverage information for which parts
// of the byte code have been executed.
// It is nil by default, which disables recording. When it is non-nil,
// the decode loop sets decoderCover[pc] to true for every decoder
// program counter it executes, so it must be at least as long as the
// decoder program.
var decoderCover []bool
222
223// Decode decodes the leading bytes in src as a single instruction.
224// The mode arguments specifies the assumed processor mode:
225// 16, 32, or 64 for 16-, 32-, and 64-bit execution modes.
226func Decode(src []byte, mode int) (inst Inst, err error) {
227	return decode1(src, mode, false)
228}
229
230// decode1 is the implementation of Decode but takes an extra
231// gnuCompat flag to cause it to change its behavior to mimic
232// bugs (or at least unique features) of GNU libopcodes as used
233// by objdump. We don't believe that logic is the right thing to do
234// in general, but when testing against libopcodes it simplifies the
235// comparison if we adjust a few small pieces of logic.
236// The affected logic is in the conditional branch for "mandatory" prefixes,
237// case xCondPrefix.
238func decode1(src []byte, mode int, gnuCompat bool) (Inst, error) {
239	switch mode {
240	case 16, 32, 64:
241		// ok
242		// TODO(rsc): 64-bit mode not tested, probably not working.
243	default:
244		return Inst{}, ErrInvalidMode
245	}
246
247	// Maximum instruction size is 15 bytes.
	// If we need to read more, report a truncated instruction.
249	if len(src) > 15 {
250		src = src[:15]
251	}
252
253	var (
254		// prefix decoding information
255		pos           = 0    // position reading src
256		nprefix       = 0    // number of prefixes
257		lockIndex     = -1   // index of LOCK prefix in src and inst.Prefix
258		repIndex      = -1   // index of REP/REPN prefix in src and inst.Prefix
259		segIndex      = -1   // index of Group 2 prefix in src and inst.Prefix
260		dataSizeIndex = -1   // index of Group 3 prefix in src and inst.Prefix
261		addrSizeIndex = -1   // index of Group 4 prefix in src and inst.Prefix
262		rex           Prefix // rex byte if present (or 0)
263		rexUsed       Prefix // bits used in rex byte
264		rexIndex      = -1   // index of rex byte
265		vex           Prefix // use vex encoding
266		vexIndex      = -1   // index of vex prefix
267
268		addrMode = mode // address mode (width in bits)
269		dataMode = mode // operand mode (width in bits)
270
271		// decoded ModR/M fields
272		haveModrm bool
273		modrm     int
274		mod       int
275		regop     int
276		rm        int
277
278		// if ModR/M is memory reference, Mem form
279		mem     Mem
280		haveMem bool
281
282		// decoded SIB fields
283		haveSIB bool
284		sib     int
285		scale   int
286		index   int
287		base    int
288		displen int
289		dispoff int
290
291		// decoded immediate values
292		imm     int64
293		imm8    int8
294		immc    int64
295		immcpos int
296
297		// output
298		opshift int
299		inst    Inst
300		narg    int // number of arguments written to inst
301	)
302
303	if mode == 64 {
304		dataMode = 32
305	}
306
307	// Prefixes are certainly the most complex and underspecified part of
308	// decoding x86 instructions. Although the manuals say things like
309	// up to four prefixes, one from each group, nearly everyone seems to
310	// agree that in practice as many prefixes as possible, including multiple
311	// from a particular group or repetitions of a given prefix, can be used on
312	// an instruction, provided the total instruction length including prefixes
313	// does not exceed the agreed-upon maximum of 15 bytes.
314	// Everyone also agrees that if one of these prefixes is the LOCK prefix
315	// and the instruction is not one of the instructions that can be used with
316	// the LOCK prefix or if the destination is not a memory operand,
317	// then the instruction is invalid and produces the #UD exception.
318	// However, that is the end of any semblance of agreement.
319	//
320	// What happens if prefixes are given that conflict with other prefixes?
321	// For example, the memory segment overrides CS, DS, ES, FS, GS, SS
322	// conflict with each other: only one segment can be in effect.
323	// Disassemblers seem to agree that later prefixes take priority over
324	// earlier ones. I have not taken the time to write assembly programs
325	// to check to see if the hardware agrees.
326	//
327	// What happens if prefixes are given that have no meaning for the
328	// specific instruction to which they are attached? It depends.
329	// If they really have no meaning, they are ignored. However, a future
330	// processor may assign a different meaning. As a disassembler, we
331	// don't really know whether we're seeing a meaningless prefix or one
332	// whose meaning we simply haven't been told yet.
333	//
334	// Combining the two questions, what happens when conflicting
335	// extension prefixes are given? No one seems to know for sure.
336	// For example, MOVQ is 66 0F D6 /r, MOVDQ2Q is F2 0F D6 /r,
337	// and MOVQ2DQ is F3 0F D6 /r. What is '66 F2 F3 0F D6 /r'?
	// Which prefix wins? See the xCondPrefix case for more.
339	//
340	// Writing assembly test cases to divine which interpretation the
341	// CPU uses might clarify the situation, but more likely it would
342	// make the situation even less clear.
343
344	// Read non-REX prefixes.
345ReadPrefixes:
346	for ; pos < len(src); pos++ {
347		p := Prefix(src[pos])
348		switch p {
349		default:
350			nprefix = pos
351			break ReadPrefixes
352
353		// Group 1 - lock and repeat prefixes
354		// According to Intel, there should only be one from this set,
355		// but according to AMD both can be present.
356		case 0xF0:
357			if lockIndex >= 0 {
358				inst.Prefix[lockIndex] |= PrefixIgnored
359			}
360			lockIndex = pos
361		case 0xF2, 0xF3:
362			if repIndex >= 0 {
363				inst.Prefix[repIndex] |= PrefixIgnored
364			}
365			repIndex = pos
366
367		// Group 2 - segment override / branch hints
368		case 0x26, 0x2E, 0x36, 0x3E:
369			if mode == 64 {
370				p |= PrefixIgnored
371				break
372			}
373			fallthrough
374		case 0x64, 0x65:
375			if segIndex >= 0 {
376				inst.Prefix[segIndex] |= PrefixIgnored
377			}
378			segIndex = pos
379
380		// Group 3 - operand size override
381		case 0x66:
382			if mode == 16 {
383				dataMode = 32
384				p = PrefixData32
385			} else {
386				dataMode = 16
387				p = PrefixData16
388			}
389			if dataSizeIndex >= 0 {
390				inst.Prefix[dataSizeIndex] |= PrefixIgnored
391			}
392			dataSizeIndex = pos
393
394		// Group 4 - address size override
395		case 0x67:
396			if mode == 32 {
397				addrMode = 16
398				p = PrefixAddr16
399			} else {
400				addrMode = 32
401				p = PrefixAddr32
402			}
403			if addrSizeIndex >= 0 {
404				inst.Prefix[addrSizeIndex] |= PrefixIgnored
405			}
406			addrSizeIndex = pos
407
408		//Group 5 - Vex encoding
409		case 0xC5:
410			if pos == 0 && pos+1 < len(src) && (mode == 64 || (mode == 32 && src[pos+1]&0xc0 == 0xc0)) {
411				vex = p
412				vexIndex = pos
413				inst.Prefix[pos] = p
414				inst.Prefix[pos+1] = Prefix(src[pos+1])
415				pos += 1
416				continue
417			} else {
418				nprefix = pos
419				break ReadPrefixes
420			}
421		case 0xC4:
422			if pos == 0 && pos+2 < len(src) && (mode == 64 || (mode == 32 && src[pos+1]&0xc0 == 0xc0)) {
423				vex = p
424				vexIndex = pos
425				inst.Prefix[pos] = p
426				inst.Prefix[pos+1] = Prefix(src[pos+1])
427				inst.Prefix[pos+2] = Prefix(src[pos+2])
428				pos += 2
429				continue
430			} else {
431				nprefix = pos
432				break ReadPrefixes
433			}
434		}
435
436		if pos >= len(inst.Prefix) {
437			return instPrefix(src[0], mode) // too long
438		}
439
440		inst.Prefix[pos] = p
441	}
442
443	// Read REX prefix.
444	if pos < len(src) && mode == 64 && Prefix(src[pos]).IsREX() && vex == 0 {
445		rex = Prefix(src[pos])
446		rexIndex = pos
447		if pos >= len(inst.Prefix) {
448			return instPrefix(src[0], mode) // too long
449		}
450		inst.Prefix[pos] = rex
451		pos++
452		if rex&PrefixREXW != 0 {
453			dataMode = 64
454			if dataSizeIndex >= 0 {
455				inst.Prefix[dataSizeIndex] |= PrefixIgnored
456			}
457		}
458	}
459
460	// Decode instruction stream, interpreting decoding instructions.
461	// opshift gives the shift to use when saving the next
462	// opcode byte into inst.Opcode.
463	opshift = 24
464
465	// Decode loop, executing decoder program.
466	var oldPC, prevPC int
467Decode:
468	for pc := 1; ; { // TODO uint
469		oldPC = prevPC
470		prevPC = pc
471		if trace {
472			println("run", pc)
473		}
474		x := decoder[pc]
475		if decoderCover != nil {
476			decoderCover[pc] = true
477		}
478		pc++
479
480		// Read and decode ModR/M if needed by opcode.
481		switch decodeOp(x) {
482		case xCondSlashR, xReadSlashR:
483			if haveModrm {
484				return Inst{Len: pos}, errInternal
485			}
486			haveModrm = true
487			if pos >= len(src) {
488				return truncated(src, mode)
489			}
490			modrm = int(src[pos])
491			pos++
492			if opshift >= 0 {
493				inst.Opcode |= uint32(modrm) << uint(opshift)
494				opshift -= 8
495			}
496			mod = modrm >> 6
497			regop = (modrm >> 3) & 07
498			rm = modrm & 07
499			if rex&PrefixREXR != 0 {
500				rexUsed |= PrefixREXR
501				regop |= 8
502			}
503			if addrMode == 16 {
504				// 16-bit modrm form
505				if mod != 3 {
506					haveMem = true
507					mem = addr16[rm]
508					if rm == 6 && mod == 0 {
509						mem.Base = 0
510					}
511
512					// Consume disp16 if present.
513					if mod == 0 && rm == 6 || mod == 2 {
514						if pos+2 > len(src) {
515							return truncated(src, mode)
516						}
517						mem.Disp = int64(binary.LittleEndian.Uint16(src[pos:]))
518						pos += 2
519					}
520
521					// Consume disp8 if present.
522					if mod == 1 {
523						if pos >= len(src) {
524							return truncated(src, mode)
525						}
526						mem.Disp = int64(int8(src[pos]))
527						pos++
528					}
529				}
530			} else {
531				haveMem = mod != 3
532
533				// 32-bit or 64-bit form
534				// Consume SIB encoding if present.
535				if rm == 4 && mod != 3 {
536					haveSIB = true
537					if pos >= len(src) {
538						return truncated(src, mode)
539					}
540					sib = int(src[pos])
541					pos++
542					if opshift >= 0 {
543						inst.Opcode |= uint32(sib) << uint(opshift)
544						opshift -= 8
545					}
546					scale = sib >> 6
547					index = (sib >> 3) & 07
548					base = sib & 07
549					if rex&PrefixREXB != 0 || vex == 0xC4 && inst.Prefix[vexIndex+1]&0x20 == 0 {
550						rexUsed |= PrefixREXB
551						base |= 8
552					}
553					if rex&PrefixREXX != 0 || vex == 0xC4 && inst.Prefix[vexIndex+1]&0x40 == 0 {
554						rexUsed |= PrefixREXX
555						index |= 8
556					}
557
558					mem.Scale = 1 << uint(scale)
559					if index == 4 {
560						// no mem.Index
561					} else {
562						mem.Index = baseRegForBits(addrMode) + Reg(index)
563					}
564					if base&7 == 5 && mod == 0 {
565						// no mem.Base
566					} else {
567						mem.Base = baseRegForBits(addrMode) + Reg(base)
568					}
569				} else {
570					if rex&PrefixREXB != 0 {
571						rexUsed |= PrefixREXB
572						rm |= 8
573					}
574					if mod == 0 && rm&7 == 5 || rm&7 == 4 {
575						// base omitted
576					} else if mod != 3 {
577						mem.Base = baseRegForBits(addrMode) + Reg(rm)
578					}
579				}
580
581				// Consume disp32 if present.
582				if mod == 0 && (rm&7 == 5 || haveSIB && base&7 == 5) || mod == 2 {
583					if pos+4 > len(src) {
584						return truncated(src, mode)
585					}
586					dispoff = pos
587					displen = 4
588					mem.Disp = int64(binary.LittleEndian.Uint32(src[pos:]))
589					pos += 4
590				}
591
592				// Consume disp8 if present.
593				if mod == 1 {
594					if pos >= len(src) {
595						return truncated(src, mode)
596					}
597					dispoff = pos
598					displen = 1
599					mem.Disp = int64(int8(src[pos]))
600					pos++
601				}
602
603				// In 64-bit, mod=0 rm=5 is PC-relative instead of just disp.
604				// See Vol 2A. Table 2-7.
605				if mode == 64 && mod == 0 && rm&7 == 5 {
606					if addrMode == 32 {
607						mem.Base = EIP
608					} else {
609						mem.Base = RIP
610					}
611				}
612			}
613
614			if segIndex >= 0 {
615				mem.Segment = prefixToSegment(inst.Prefix[segIndex])
616			}
617		}
618
619		// Execute single opcode.
620		switch decodeOp(x) {
621		default:
622			println("bad op", x, "at", pc-1, "from", oldPC)
623			return Inst{Len: pos}, errInternal
624
625		case xFail:
626			inst.Op = 0
627			break Decode
628
629		case xMatch:
630			break Decode
631
632		case xJump:
633			pc = int(decoder[pc])
634
635		// Conditional branches.
636
637		case xCondByte:
638			if pos >= len(src) {
639				return truncated(src, mode)
640			}
641			b := src[pos]
642			n := int(decoder[pc])
643			pc++
644			for i := 0; i < n; i++ {
645				xb, xpc := decoder[pc], int(decoder[pc+1])
646				pc += 2
647				if b == byte(xb) {
648					pc = xpc
649					pos++
650					if opshift >= 0 {
651						inst.Opcode |= uint32(b) << uint(opshift)
652						opshift -= 8
653					}
654					continue Decode
655				}
656			}
657			// xCondByte is the only conditional with a fall through,
658			// so that it can be used to pick off special cases before
659			// an xCondSlash. If the fallthrough instruction is xFail,
660			// advance the position so that the decoded instruction
661			// size includes the byte we just compared against.
662			if decodeOp(decoder[pc]) == xJump {
663				pc = int(decoder[pc+1])
664			}
665			if decodeOp(decoder[pc]) == xFail {
666				pos++
667			}
668
669		case xCondIs64:
670			if mode == 64 {
671				pc = int(decoder[pc+1])
672			} else {
673				pc = int(decoder[pc])
674			}
675
676		case xCondIsMem:
677			mem := haveMem
678			if !haveModrm {
679				if pos >= len(src) {
680					return instPrefix(src[0], mode) // too long
681				}
682				mem = src[pos]>>6 != 3
683			}
684			if mem {
685				pc = int(decoder[pc+1])
686			} else {
687				pc = int(decoder[pc])
688			}
689
690		case xCondDataSize:
691			switch dataMode {
692			case 16:
693				if dataSizeIndex >= 0 {
694					inst.Prefix[dataSizeIndex] |= PrefixImplicit
695				}
696				pc = int(decoder[pc])
697			case 32:
698				if dataSizeIndex >= 0 {
699					inst.Prefix[dataSizeIndex] |= PrefixImplicit
700				}
701				pc = int(decoder[pc+1])
702			case 64:
703				rexUsed |= PrefixREXW
704				pc = int(decoder[pc+2])
705			}
706
707		case xCondAddrSize:
708			switch addrMode {
709			case 16:
710				if addrSizeIndex >= 0 {
711					inst.Prefix[addrSizeIndex] |= PrefixImplicit
712				}
713				pc = int(decoder[pc])
714			case 32:
715				if addrSizeIndex >= 0 {
716					inst.Prefix[addrSizeIndex] |= PrefixImplicit
717				}
718				pc = int(decoder[pc+1])
719			case 64:
720				pc = int(decoder[pc+2])
721			}
722
723		case xCondPrefix:
724			// Conditional branch based on presence or absence of prefixes.
725			// The conflict cases here are completely undocumented and
726			// differ significantly between GNU libopcodes and Intel xed.
727			// I have not written assembly code to divine what various CPUs
728			// do, but it wouldn't surprise me if they are not consistent either.
729			//
730			// The basic idea is to switch on the presence of a prefix, so that
731			// for example:
732			//
733			//	xCondPrefix, 4
734			//	0xF3, 123,
735			//	0xF2, 234,
736			//	0x66, 345,
737			//	0, 456
738			//
			// branch to 123 if the F3 prefix is present, 234 if the F2 prefix
			// is present, 345 if the 66 prefix is present, and 456 otherwise.
741			// The prefixes are given in descending order so that the 0 will be last.
742			//
743			// It is unclear what should happen if multiple conditions are
744			// satisfied: what if F2 and F3 are both present, or if 66 and F2
745			// are present, or if all three are present? The one chosen becomes
746			// part of the opcode and the others do not. Perhaps the answer
747			// depends on the specific opcodes in question.
748			//
749			// The only clear example is that CRC32 is F2 0F 38 F1 /r, and
750			// it comes in 16-bit and 32-bit forms based on the 66 prefix,
751			// so 66 F2 0F 38 F1 /r should be treated as F2 taking priority,
752			// with the 66 being only an operand size override, and probably
753			// F2 66 0F 38 F1 /r should be treated the same.
754			// Perhaps that rule is specific to the case of CRC32, since no
755			// 66 0F 38 F1 instruction is defined (today) (that we know of).
756			// However, both libopcodes and xed seem to generalize this
757			// example and choose F2/F3 in preference to 66, and we
758			// do the same.
759			//
760			// Next, what if both F2 and F3 are present? Which wins?
761			// The Intel xed rule, and ours, is that the one that occurs last wins.
762			// The GNU libopcodes rule, which we implement only in gnuCompat mode,
			// is that F3 beats F2 unless F3 has no special meaning, in which
			// case F3 can be a modifier on an F2 special meaning.
765			//
766			// Concretely,
767			//	66 0F D6 /r is MOVQ
768			//	F2 0F D6 /r is MOVDQ2Q
769			//	F3 0F D6 /r is MOVQ2DQ.
770			//
771			//	F2 66 0F D6 /r is 66 + MOVDQ2Q always.
772			//	66 F2 0F D6 /r is 66 + MOVDQ2Q always.
773			//	F3 66 0F D6 /r is 66 + MOVQ2DQ always.
774			//	66 F3 0F D6 /r is 66 + MOVQ2DQ always.
775			//	F2 F3 0F D6 /r is F2 + MOVQ2DQ always.
			//	F3 F2 0F D6 /r is F3 + MOVDQ2Q in Intel xed, but F2 + MOVQ2DQ in GNU libopcodes.
777			//	Adding 66 anywhere in the prefix section of the
778			//	last two cases does not change the outcome.
779			//
780			// Finally, what if there is a variant in which 66 is a mandatory
781			// prefix rather than an operand size override, but we know of
782			// no corresponding F2/F3 form, and we see both F2/F3 and 66.
783			// Does F2/F3 still take priority, so that the result is an unknown
784			// instruction, or does the 66 take priority, so that the extended
785			// 66 instruction should be interpreted as having a REP/REPN prefix?
786			// Intel xed does the former and GNU libopcodes does the latter.
787			// We side with Intel xed, unless we are trying to match libopcodes
788			// more closely during the comparison-based test suite.
789			//
790			// In 64-bit mode REX.W is another valid prefix to test for, but
791			// there is less ambiguity about that. When present, REX.W is
792			// always the first entry in the table.
793			n := int(decoder[pc])
794			pc++
795			sawF3 := false
796			for j := 0; j < n; j++ {
797				prefix := Prefix(decoder[pc+2*j])
798				if prefix.IsREX() {
799					rexUsed |= prefix
800					if rex&prefix == prefix {
801						pc = int(decoder[pc+2*j+1])
802						continue Decode
803					}
804					continue
805				}
806				ok := false
807				if prefix == 0 {
808					ok = true
809				} else if prefix.IsREX() {
810					rexUsed |= prefix
811					if rex&prefix == prefix {
812						ok = true
813					}
814				} else if prefix == 0xC5 || prefix == 0xC4 {
815					if vex == prefix {
816						ok = true
817					}
818				} else if vex != 0 && (prefix == 0x0F || prefix == 0x0F38 || prefix == 0x0F3A ||
819					prefix == 0x66 || prefix == 0xF2 || prefix == 0xF3) {
820					var vexM, vexP Prefix
821					if vex == 0xC5 {
822						vexM = 1 // 2 byte vex always implies 0F
823						vexP = inst.Prefix[vexIndex+1]
824					} else {
825						vexM = inst.Prefix[vexIndex+1]
826						vexP = inst.Prefix[vexIndex+2]
827					}
828					switch prefix {
829					case 0x66:
830						ok = vexP&3 == 1
831					case 0xF3:
832						ok = vexP&3 == 2
833					case 0xF2:
834						ok = vexP&3 == 3
835					case 0x0F:
836						ok = vexM&3 == 1
837					case 0x0F38:
838						ok = vexM&3 == 2
839					case 0x0F3A:
840						ok = vexM&3 == 3
841					}
842				} else {
843					if prefix == 0xF3 {
844						sawF3 = true
845					}
846					switch prefix {
847					case PrefixLOCK:
848						if lockIndex >= 0 {
849							inst.Prefix[lockIndex] |= PrefixImplicit
850							ok = true
851						}
852					case PrefixREP, PrefixREPN:
853						if repIndex >= 0 && inst.Prefix[repIndex]&0xFF == prefix {
854							inst.Prefix[repIndex] |= PrefixImplicit
855							ok = true
856						}
857						if gnuCompat && !ok && prefix == 0xF3 && repIndex >= 0 && (j+1 >= n || decoder[pc+2*(j+1)] != 0xF2) {
858							// Check to see if earlier prefix F3 is present.
859							for i := repIndex - 1; i >= 0; i-- {
860								if inst.Prefix[i]&0xFF == prefix {
861									inst.Prefix[i] |= PrefixImplicit
862									ok = true
863								}
864							}
865						}
866						if gnuCompat && !ok && prefix == 0xF2 && repIndex >= 0 && !sawF3 && inst.Prefix[repIndex]&0xFF == 0xF3 {
867							// Check to see if earlier prefix F2 is present.
868							for i := repIndex - 1; i >= 0; i-- {
869								if inst.Prefix[i]&0xFF == prefix {
870									inst.Prefix[i] |= PrefixImplicit
871									ok = true
872								}
873							}
874						}
875					case PrefixCS, PrefixDS, PrefixES, PrefixFS, PrefixGS, PrefixSS:
876						if segIndex >= 0 && inst.Prefix[segIndex]&0xFF == prefix {
877							inst.Prefix[segIndex] |= PrefixImplicit
878							ok = true
879						}
880					case PrefixDataSize:
881						// Looking for 66 mandatory prefix.
882						// The F2/F3 mandatory prefixes take priority when both are present.
883						// If we got this far in the xCondPrefix table and an F2/F3 is present,
884						// it means the table didn't have any entry for that prefix. But if 66 has
885						// special meaning, perhaps F2/F3 have special meaning that we don't know.
886						// Intel xed works this way, treating the F2/F3 as inhibiting the 66.
887						// GNU libopcodes allows the 66 to match. We do what Intel xed does
888						// except in gnuCompat mode.
889						if repIndex >= 0 && !gnuCompat {
890							inst.Op = 0
891							break Decode
892						}
893						if dataSizeIndex >= 0 {
894							inst.Prefix[dataSizeIndex] |= PrefixImplicit
895							ok = true
896						}
897					case PrefixAddrSize:
898						if addrSizeIndex >= 0 {
899							inst.Prefix[addrSizeIndex] |= PrefixImplicit
900							ok = true
901						}
902					}
903				}
904				if ok {
905					pc = int(decoder[pc+2*j+1])
906					continue Decode
907				}
908			}
909			inst.Op = 0
910			break Decode
911
912		case xCondSlashR:
913			pc = int(decoder[pc+regop&7])
914
915		// Input.
916
917		case xReadSlashR:
918			// done above
919
920		case xReadIb:
921			if pos >= len(src) {
922				return truncated(src, mode)
923			}
924			imm8 = int8(src[pos])
925			pos++
926
927		case xReadIw:
928			if pos+2 > len(src) {
929				return truncated(src, mode)
930			}
931			imm = int64(binary.LittleEndian.Uint16(src[pos:]))
932			pos += 2
933
934		case xReadId:
935			if pos+4 > len(src) {
936				return truncated(src, mode)
937			}
938			imm = int64(binary.LittleEndian.Uint32(src[pos:]))
939			pos += 4
940
941		case xReadIo:
942			if pos+8 > len(src) {
943				return truncated(src, mode)
944			}
945			imm = int64(binary.LittleEndian.Uint64(src[pos:]))
946			pos += 8
947
948		case xReadCb:
949			if pos >= len(src) {
950				return truncated(src, mode)
951			}
952			immcpos = pos
953			immc = int64(src[pos])
954			pos++
955
956		case xReadCw:
957			if pos+2 > len(src) {
958				return truncated(src, mode)
959			}
960			immcpos = pos
961			immc = int64(binary.LittleEndian.Uint16(src[pos:]))
962			pos += 2
963
964		case xReadCm:
965			immcpos = pos
966			if addrMode == 16 {
967				if pos+2 > len(src) {
968					return truncated(src, mode)
969				}
970				immc = int64(binary.LittleEndian.Uint16(src[pos:]))
971				pos += 2
972			} else if addrMode == 32 {
973				if pos+4 > len(src) {
974					return truncated(src, mode)
975				}
976				immc = int64(binary.LittleEndian.Uint32(src[pos:]))
977				pos += 4
978			} else {
979				if pos+8 > len(src) {
980					return truncated(src, mode)
981				}
982				immc = int64(binary.LittleEndian.Uint64(src[pos:]))
983				pos += 8
984			}
985		case xReadCd:
986			immcpos = pos
987			if pos+4 > len(src) {
988				return truncated(src, mode)
989			}
990			immc = int64(binary.LittleEndian.Uint32(src[pos:]))
991			pos += 4
992
993		case xReadCp:
994			immcpos = pos
995			if pos+6 > len(src) {
996				return truncated(src, mode)
997			}
998			w := binary.LittleEndian.Uint32(src[pos:])
999			w2 := binary.LittleEndian.Uint16(src[pos+4:])
1000			immc = int64(w2)<<32 | int64(w)
1001			pos += 6
1002
1003		// Output.
1004
1005		case xSetOp:
1006			inst.Op = Op(decoder[pc])
1007			pc++
1008
1009		case xArg1,
1010			xArg3,
1011			xArgAL,
1012			xArgAX,
1013			xArgCL,
1014			xArgCS,
1015			xArgDS,
1016			xArgDX,
1017			xArgEAX,
1018			xArgEDX,
1019			xArgES,
1020			xArgFS,
1021			xArgGS,
1022			xArgRAX,
1023			xArgRDX,
1024			xArgSS,
1025			xArgST,
1026			xArgXMM0:
1027			inst.Args[narg] = fixedArg[x]
1028			narg++
1029
1030		case xArgImm8:
1031			inst.Args[narg] = Imm(imm8)
1032			narg++
1033
1034		case xArgImm8u:
1035			inst.Args[narg] = Imm(uint8(imm8))
1036			narg++
1037
1038		case xArgImm16:
1039			inst.Args[narg] = Imm(int16(imm))
1040			narg++
1041
1042		case xArgImm16u:
1043			inst.Args[narg] = Imm(uint16(imm))
1044			narg++
1045
1046		case xArgImm32:
1047			inst.Args[narg] = Imm(int32(imm))
1048			narg++
1049
1050		case xArgImm64:
1051			inst.Args[narg] = Imm(imm)
1052			narg++
1053
1054		case xArgM,
1055			xArgM128,
1056			xArgM256,
1057			xArgM1428byte,
1058			xArgM16,
1059			xArgM16and16,
1060			xArgM16and32,
1061			xArgM16and64,
1062			xArgM16colon16,
1063			xArgM16colon32,
1064			xArgM16colon64,
1065			xArgM16int,
1066			xArgM2byte,
1067			xArgM32,
1068			xArgM32and32,
1069			xArgM32fp,
1070			xArgM32int,
1071			xArgM512byte,
1072			xArgM64,
1073			xArgM64fp,
1074			xArgM64int,
1075			xArgM8,
1076			xArgM80bcd,
1077			xArgM80dec,
1078			xArgM80fp,
1079			xArgM94108byte,
1080			xArgMem:
1081			if !haveMem {
1082				inst.Op = 0
1083				break Decode
1084			}
1085			inst.Args[narg] = mem
1086			inst.MemBytes = int(memBytes[decodeOp(x)])
1087			if mem.Base == RIP {
1088				inst.PCRel = displen
1089				inst.PCRelOff = dispoff
1090			}
1091			narg++
1092
1093		case xArgPtr16colon16:
1094			inst.Args[narg] = Imm(immc >> 16)
1095			inst.Args[narg+1] = Imm(immc & (1<<16 - 1))
1096			narg += 2
1097
1098		case xArgPtr16colon32:
1099			inst.Args[narg] = Imm(immc >> 32)
1100			inst.Args[narg+1] = Imm(immc & (1<<32 - 1))
1101			narg += 2
1102
1103		case xArgMoffs8, xArgMoffs16, xArgMoffs32, xArgMoffs64:
1104			// TODO(rsc): Can address be 64 bits?
1105			mem = Mem{Disp: int64(immc)}
1106			if segIndex >= 0 {
1107				mem.Segment = prefixToSegment(inst.Prefix[segIndex])
1108				inst.Prefix[segIndex] |= PrefixImplicit
1109			}
1110			inst.Args[narg] = mem
1111			inst.MemBytes = int(memBytes[decodeOp(x)])
1112			if mem.Base == RIP {
1113				inst.PCRel = displen
1114				inst.PCRelOff = dispoff
1115			}
1116			narg++
1117
1118		case xArgYmm1:
1119			base := baseReg[x]
1120			index := Reg(regop)
1121			if inst.Prefix[vexIndex+1]&0x80 == 0 {
1122				index += 8
1123			}
1124			inst.Args[narg] = base + index
1125			narg++
1126
1127		case xArgR8, xArgR16, xArgR32, xArgR64, xArgXmm, xArgXmm1, xArgDR0dashDR7:
1128			base := baseReg[x]
1129			index := Reg(regop)
1130			if rex != 0 && base == AL && index >= 4 {
1131				rexUsed |= PrefixREX
1132				index -= 4
1133				base = SPB
1134			}
1135			inst.Args[narg] = base + index
1136			narg++
1137
1138		case xArgMm, xArgMm1, xArgTR0dashTR7:
1139			inst.Args[narg] = baseReg[x] + Reg(regop&7)
1140			narg++
1141
1142		case xArgCR0dashCR7:
1143			// AMD documents an extension that the LOCK prefix
1144			// can be used in place of a REX prefix in order to access
1145			// CR8 from 32-bit mode. The LOCK prefix is allowed in
1146			// all modes, provided the corresponding CPUID bit is set.
1147			if lockIndex >= 0 {
1148				inst.Prefix[lockIndex] |= PrefixImplicit
1149				regop += 8
1150			}
1151			inst.Args[narg] = CR0 + Reg(regop)
1152			narg++
1153
1154		case xArgSreg:
1155			regop &= 7
1156			if regop >= 6 {
1157				inst.Op = 0
1158				break Decode
1159			}
1160			inst.Args[narg] = ES + Reg(regop)
1161			narg++
1162
1163		case xArgRmf16, xArgRmf32, xArgRmf64:
1164			base := baseReg[x]
1165			index := Reg(modrm & 07)
1166			if rex&PrefixREXB != 0 {
1167				rexUsed |= PrefixREXB
1168				index += 8
1169			}
1170			inst.Args[narg] = base + index
1171			narg++
1172
1173		case xArgR8op, xArgR16op, xArgR32op, xArgR64op, xArgSTi:
1174			n := inst.Opcode >> uint(opshift+8) & 07
1175			base := baseReg[x]
1176			index := Reg(n)
1177			if rex&PrefixREXB != 0 && decodeOp(x) != xArgSTi {
1178				rexUsed |= PrefixREXB
1179				index += 8
1180			}
1181			if rex != 0 && base == AL && index >= 4 {
1182				rexUsed |= PrefixREX
1183				index -= 4
1184				base = SPB
1185			}
1186			inst.Args[narg] = base + index
1187			narg++
1188		case xArgRM8, xArgRM16, xArgRM32, xArgRM64, xArgR32M16, xArgR32M8, xArgR64M16,
1189			xArgMmM32, xArgMmM64, xArgMm2M64,
1190			xArgXmm2M16, xArgXmm2M32, xArgXmm2M64, xArgXmmM64, xArgXmmM128, xArgXmmM32, xArgXmm2M128,
1191			xArgYmm2M256:
1192			if haveMem {
1193				inst.Args[narg] = mem
1194				inst.MemBytes = int(memBytes[decodeOp(x)])
1195				if mem.Base == RIP {
1196					inst.PCRel = displen
1197					inst.PCRelOff = dispoff
1198				}
1199			} else {
1200				base := baseReg[x]
1201				index := Reg(rm)
1202				switch decodeOp(x) {
1203				case xArgMmM32, xArgMmM64, xArgMm2M64:
1204					// There are only 8 MMX registers, so these ignore the REX.X bit.
1205					index &= 7
1206				case xArgRM8:
1207					if rex != 0 && index >= 4 {
1208						rexUsed |= PrefixREX
1209						index -= 4
1210						base = SPB
1211					}
1212				case xArgYmm2M256:
1213					if vex == 0xC4 && inst.Prefix[vexIndex+1]&0x40 == 0x40 {
1214						index += 8
1215					}
1216				}
1217				inst.Args[narg] = base + index
1218			}
1219			narg++
1220
1221		case xArgMm2: // register only; TODO(rsc): Handle with tag modrm_regonly tag
1222			if haveMem {
1223				inst.Op = 0
1224				break Decode
1225			}
1226			inst.Args[narg] = baseReg[x] + Reg(rm&7)
1227			narg++
1228
1229		case xArgXmm2: // register only; TODO(rsc): Handle with tag modrm_regonly tag
1230			if haveMem {
1231				inst.Op = 0
1232				break Decode
1233			}
1234			inst.Args[narg] = baseReg[x] + Reg(rm)
1235			narg++
1236
1237		case xArgRel8:
1238			inst.PCRelOff = immcpos
1239			inst.PCRel = 1
1240			inst.Args[narg] = Rel(int8(immc))
1241			narg++
1242
1243		case xArgRel16:
1244			inst.PCRelOff = immcpos
1245			inst.PCRel = 2
1246			inst.Args[narg] = Rel(int16(immc))
1247			narg++
1248
1249		case xArgRel32:
1250			inst.PCRelOff = immcpos
1251			inst.PCRel = 4
1252			inst.Args[narg] = Rel(int32(immc))
1253			narg++
1254		}
1255	}
1256
1257	if inst.Op == 0 {
1258		// Invalid instruction.
1259		if nprefix > 0 {
1260			return instPrefix(src[0], mode) // invalid instruction
1261		}
1262		return Inst{Len: pos}, ErrUnrecognized
1263	}
1264
1265	// Matched! Hooray!
1266
1267	// 90 decodes as XCHG EAX, EAX but is NOP.
1268	// 66 90 decodes as XCHG AX, AX and is NOP too.
1269	// 48 90 decodes as XCHG RAX, RAX and is NOP too.
1270	// 43 90 decodes as XCHG R8D, EAX and is *not* NOP.
1271	// F3 90 decodes as REP XCHG EAX, EAX but is PAUSE.
1272	// It's all too special to handle in the decoding tables, at least for now.
1273	if inst.Op == XCHG && inst.Opcode>>24 == 0x90 {
1274		if inst.Args[0] == RAX || inst.Args[0] == EAX || inst.Args[0] == AX {
1275			inst.Op = NOP
1276			if dataSizeIndex >= 0 {
1277				inst.Prefix[dataSizeIndex] &^= PrefixImplicit
1278			}
1279			inst.Args[0] = nil
1280			inst.Args[1] = nil
1281		}
1282		if repIndex >= 0 && inst.Prefix[repIndex] == 0xF3 {
1283			inst.Prefix[repIndex] |= PrefixImplicit
1284			inst.Op = PAUSE
1285			inst.Args[0] = nil
1286			inst.Args[1] = nil
1287		} else if gnuCompat {
1288			for i := nprefix - 1; i >= 0; i-- {
1289				if inst.Prefix[i]&0xFF == 0xF3 {
1290					inst.Prefix[i] |= PrefixImplicit
1291					inst.Op = PAUSE
1292					inst.Args[0] = nil
1293					inst.Args[1] = nil
1294					break
1295				}
1296			}
1297		}
1298	}
1299
1300	// defaultSeg returns the default segment for an implicit
1301	// memory reference: the final override if present, or else DS.
1302	defaultSeg := func() Reg {
1303		if segIndex >= 0 {
1304			inst.Prefix[segIndex] |= PrefixImplicit
1305			return prefixToSegment(inst.Prefix[segIndex])
1306		}
1307		return DS
1308	}
1309
1310	// Add implicit arguments not present in the tables.
1311	// Normally we shy away from making implicit arguments explicit,
1312	// following the Intel manuals, but adding the arguments seems
1313	// the best way to express the effect of the segment override prefixes.
1314	// TODO(rsc): Perhaps add these to the tables and
1315	// create bytecode instructions for them.
1316	usedAddrSize := false
1317	switch inst.Op {
1318	case INSB, INSW, INSD:
1319		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
1320		inst.Args[1] = DX
1321		usedAddrSize = true
1322
1323	case OUTSB, OUTSW, OUTSD:
1324		inst.Args[0] = DX
1325		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
1326		usedAddrSize = true
1327
1328	case MOVSB, MOVSW, MOVSD, MOVSQ:
1329		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
1330		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
1331		usedAddrSize = true
1332
1333	case CMPSB, CMPSW, CMPSD, CMPSQ:
1334		inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
1335		inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
1336		usedAddrSize = true
1337
1338	case LODSB, LODSW, LODSD, LODSQ:
1339		switch inst.Op {
1340		case LODSB:
1341			inst.Args[0] = AL
1342		case LODSW:
1343			inst.Args[0] = AX
1344		case LODSD:
1345			inst.Args[0] = EAX
1346		case LODSQ:
1347			inst.Args[0] = RAX
1348		}
1349		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
1350		usedAddrSize = true
1351
1352	case STOSB, STOSW, STOSD, STOSQ:
1353		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
1354		switch inst.Op {
1355		case STOSB:
1356			inst.Args[1] = AL
1357		case STOSW:
1358			inst.Args[1] = AX
1359		case STOSD:
1360			inst.Args[1] = EAX
1361		case STOSQ:
1362			inst.Args[1] = RAX
1363		}
1364		usedAddrSize = true
1365
1366	case SCASB, SCASW, SCASD, SCASQ:
1367		inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
1368		switch inst.Op {
1369		case SCASB:
1370			inst.Args[0] = AL
1371		case SCASW:
1372			inst.Args[0] = AX
1373		case SCASD:
1374			inst.Args[0] = EAX
1375		case SCASQ:
1376			inst.Args[0] = RAX
1377		}
1378		usedAddrSize = true
1379
1380	case XLATB:
1381		inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + BX - AX}
1382		usedAddrSize = true
1383	}
1384
1385	// If we used the address size annotation to construct the
1386	// argument list, mark that prefix as implicit: it doesn't need
1387	// to be shown when printing the instruction.
1388	if haveMem || usedAddrSize {
1389		if addrSizeIndex >= 0 {
1390			inst.Prefix[addrSizeIndex] |= PrefixImplicit
1391		}
1392	}
1393
1394	// Similarly, if there's some memory operand, the segment
1395	// will be shown there and doesn't need to be shown as an
1396	// explicit prefix.
1397	if haveMem {
1398		if segIndex >= 0 {
1399			inst.Prefix[segIndex] |= PrefixImplicit
1400		}
1401	}
1402
1403	// Branch predict prefixes are overloaded segment prefixes,
1404	// since segment prefixes don't make sense on conditional jumps.
1405	// Rewrite final instance to prediction prefix.
	// The set of instructions to which the prefixes apply (other than the
1407	// Jcc conditional jumps) is not 100% clear from the manuals, but
1408	// the disassemblers seem to agree about the LOOP and JCXZ instructions,
1409	// so we'll follow along.
1410	// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
1411	if isCondJmp[inst.Op] || isLoop[inst.Op] || inst.Op == JCXZ || inst.Op == JECXZ || inst.Op == JRCXZ {
1412	PredictLoop:
1413		for i := nprefix - 1; i >= 0; i-- {
1414			p := inst.Prefix[i]
1415			switch p & 0xFF {
1416			case PrefixCS:
1417				inst.Prefix[i] = PrefixPN
1418				break PredictLoop
1419			case PrefixDS:
1420				inst.Prefix[i] = PrefixPT
1421				break PredictLoop
1422			}
1423		}
1424	}
1425
1426	// The BND prefix is part of the Intel Memory Protection Extensions (MPX).
1427	// A REPN applied to certain control transfers is a BND prefix to bound
1428	// the range of possible destinations. There's surprisingly little documentation
1429	// about this, so we just do what libopcodes and xed agree on.
1430	// In particular, it's unclear why a REPN applied to LOOP or JCXZ instructions
1431	// does not turn into a BND.
1432	// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
1433	if isCondJmp[inst.Op] || inst.Op == JMP || inst.Op == CALL || inst.Op == RET {
1434		for i := nprefix - 1; i >= 0; i-- {
1435			p := inst.Prefix[i]
1436			if p&^PrefixIgnored == PrefixREPN {
1437				inst.Prefix[i] = PrefixBND
1438				break
1439			}
1440		}
1441	}
1442
1443	// The LOCK prefix only applies to certain instructions, and then only
1444	// to instances of the instruction with a memory destination.
1445	// Other uses of LOCK are invalid and cause a processor exception,
1446	// in contrast to the "just ignore it" spirit applied to all other prefixes.
1447	// Mark invalid lock prefixes.
1448	hasLock := false
1449	if lockIndex >= 0 && inst.Prefix[lockIndex]&PrefixImplicit == 0 {
1450		switch inst.Op {
1451		// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
1452		case ADD, ADC, AND, BTC, BTR, BTS, CMPXCHG, CMPXCHG8B, CMPXCHG16B, DEC, INC, NEG, NOT, OR, SBB, SUB, XOR, XADD, XCHG:
1453			if isMem(inst.Args[0]) {
1454				hasLock = true
1455				break
1456			}
1457			fallthrough
1458		default:
1459			inst.Prefix[lockIndex] |= PrefixInvalid
1460		}
1461	}
1462
1463	// In certain cases, all of which require a memory destination,
1464	// the REPN and REP prefixes are interpreted as XACQUIRE and XRELEASE
	// from the Intel Transactional Synchronization Extensions (TSX).
1466	//
1467	// The specific rules are:
1468	// (1) Any instruction with a valid LOCK prefix can have XACQUIRE or XRELEASE.
1469	// (2) Any XCHG, which always has an implicit LOCK, can have XACQUIRE or XRELEASE.
1470	// (3) Any 0x88-, 0x89-, 0xC6-, or 0xC7-opcode MOV can have XRELEASE.
1471	if isMem(inst.Args[0]) {
1472		if inst.Op == XCHG {
1473			hasLock = true
1474		}
1475
1476		for i := len(inst.Prefix) - 1; i >= 0; i-- {
1477			p := inst.Prefix[i] &^ PrefixIgnored
1478			switch p {
1479			case PrefixREPN:
1480				if hasLock {
1481					inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXACQUIRE
1482				}
1483
1484			case PrefixREP:
1485				if hasLock {
1486					inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
1487				}
1488
1489				if inst.Op == MOV {
1490					op := (inst.Opcode >> 24) &^ 1
1491					if op == 0x88 || op == 0xC6 {
1492						inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
1493					}
1494				}
1495			}
1496		}
1497	}
1498
1499	// If REP is used on a non-REP-able instruction, mark the prefix as ignored.
1500	if repIndex >= 0 {
1501		switch inst.Prefix[repIndex] {
1502		case PrefixREP, PrefixREPN:
1503			switch inst.Op {
1504			// According to the manuals, the REP/REPE prefix applies to all of these,
1505			// while the REPN applies only to some of them. However, both libopcodes
1506			// and xed show both prefixes explicitly for all instructions, so we do the same.
1507			// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
1508			case INSB, INSW, INSD,
1509				MOVSB, MOVSW, MOVSD, MOVSQ,
1510				OUTSB, OUTSW, OUTSD,
1511				LODSB, LODSW, LODSD, LODSQ,
1512				CMPSB, CMPSW, CMPSD, CMPSQ,
1513				SCASB, SCASW, SCASD, SCASQ,
1514				STOSB, STOSW, STOSD, STOSQ:
1515				// ok
1516			default:
1517				inst.Prefix[repIndex] |= PrefixIgnored
1518			}
1519		}
1520	}
1521
1522	// If REX was present, mark implicit if all the 1 bits were consumed.
1523	if rexIndex >= 0 {
1524		if rexUsed != 0 {
1525			rexUsed |= PrefixREX
1526		}
1527		if rex&^rexUsed == 0 {
1528			inst.Prefix[rexIndex] |= PrefixImplicit
1529		}
1530	}
1531
1532	inst.DataSize = dataMode
1533	inst.AddrSize = addrMode
1534	inst.Mode = mode
1535	inst.Len = pos
1536	return inst, nil
1537}
1538
// errInternal is a package-level error value whose message is "internal error".
var errInternal = errors.New("internal error")
1540
1541// addr16 records the eight 16-bit addressing modes.
1542var addr16 = [8]Mem{
1543	{Base: BX, Scale: 1, Index: SI},
1544	{Base: BX, Scale: 1, Index: DI},
1545	{Base: BP, Scale: 1, Index: SI},
1546	{Base: BP, Scale: 1, Index: DI},
1547	{Base: SI},
1548	{Base: DI},
1549	{Base: BP},
1550	{Base: BX},
1551}
1552
1553// baseRegForBits returns the base register for a given register size in bits.
1554func baseRegForBits(bits int) Reg {
1555	switch bits {
1556	case 8:
1557		return AL
1558	case 16:
1559		return AX
1560	case 32:
1561		return EAX
1562	case 64:
1563		return RAX
1564	}
1565	return 0
1566}
1567
1568// baseReg records the base register for argument types that specify
1569// a range of registers indexed by op, regop, or rm.
1570var baseReg = [...]Reg{
1571	xArgDR0dashDR7: DR0,
1572	xArgMm1:        M0,
1573	xArgMm2:        M0,
1574	xArgMm2M64:     M0,
1575	xArgMm:         M0,
1576	xArgMmM32:      M0,
1577	xArgMmM64:      M0,
1578	xArgR16:        AX,
1579	xArgR16op:      AX,
1580	xArgR32:        EAX,
1581	xArgR32M16:     EAX,
1582	xArgR32M8:      EAX,
1583	xArgR32op:      EAX,
1584	xArgR64:        RAX,
1585	xArgR64M16:     RAX,
1586	xArgR64op:      RAX,
1587	xArgR8:         AL,
1588	xArgR8op:       AL,
1589	xArgRM16:       AX,
1590	xArgRM32:       EAX,
1591	xArgRM64:       RAX,
1592	xArgRM8:        AL,
1593	xArgRmf16:      AX,
1594	xArgRmf32:      EAX,
1595	xArgRmf64:      RAX,
1596	xArgSTi:        F0,
1597	xArgTR0dashTR7: TR0,
1598	xArgXmm1:       X0,
1599	xArgYmm1:       X0,
1600	xArgXmm2:       X0,
1601	xArgXmm2M128:   X0,
1602	xArgYmm2M256:   X0,
1603	xArgXmm2M16:    X0,
1604	xArgXmm2M32:    X0,
1605	xArgXmm2M64:    X0,
1606	xArgXmm:        X0,
1607	xArgXmmM128:    X0,
1608	xArgXmmM32:     X0,
1609	xArgXmmM64:     X0,
1610}
1611
1612// prefixToSegment returns the segment register
1613// corresponding to a particular segment prefix.
1614func prefixToSegment(p Prefix) Reg {
1615	switch p &^ PrefixImplicit {
1616	case PrefixCS:
1617		return CS
1618	case PrefixDS:
1619		return DS
1620	case PrefixES:
1621		return ES
1622	case PrefixFS:
1623		return FS
1624	case PrefixGS:
1625		return GS
1626	case PrefixSS:
1627		return SS
1628	}
1629	return 0
1630}
1631
1632// fixedArg records the fixed arguments corresponding to the given bytecodes.
1633var fixedArg = [...]Arg{
1634	xArg1:    Imm(1),
1635	xArg3:    Imm(3),
1636	xArgAL:   AL,
1637	xArgAX:   AX,
1638	xArgDX:   DX,
1639	xArgEAX:  EAX,
1640	xArgEDX:  EDX,
1641	xArgRAX:  RAX,
1642	xArgRDX:  RDX,
1643	xArgCL:   CL,
1644	xArgCS:   CS,
1645	xArgDS:   DS,
1646	xArgES:   ES,
1647	xArgFS:   FS,
1648	xArgGS:   GS,
1649	xArgSS:   SS,
1650	xArgST:   F0,
1651	xArgXMM0: X0,
1652}
1653
1654// memBytes records the size of the memory pointed at
1655// by a memory argument of the given form.
1656var memBytes = [...]int8{
1657	xArgM128:       128 / 8,
1658	xArgM256:       256 / 8,
1659	xArgM16:        16 / 8,
1660	xArgM16and16:   (16 + 16) / 8,
1661	xArgM16colon16: (16 + 16) / 8,
1662	xArgM16colon32: (16 + 32) / 8,
1663	xArgM16int:     16 / 8,
1664	xArgM2byte:     2,
1665	xArgM32:        32 / 8,
1666	xArgM32and32:   (32 + 32) / 8,
1667	xArgM32fp:      32 / 8,
1668	xArgM32int:     32 / 8,
1669	xArgM64:        64 / 8,
1670	xArgM64fp:      64 / 8,
1671	xArgM64int:     64 / 8,
1672	xArgMm2M64:     64 / 8,
1673	xArgMmM32:      32 / 8,
1674	xArgMmM64:      64 / 8,
1675	xArgMoffs16:    16 / 8,
1676	xArgMoffs32:    32 / 8,
1677	xArgMoffs64:    64 / 8,
1678	xArgMoffs8:     8 / 8,
1679	xArgR32M16:     16 / 8,
1680	xArgR32M8:      8 / 8,
1681	xArgR64M16:     16 / 8,
1682	xArgRM16:       16 / 8,
1683	xArgRM32:       32 / 8,
1684	xArgRM64:       64 / 8,
1685	xArgRM8:        8 / 8,
1686	xArgXmm2M128:   128 / 8,
1687	xArgYmm2M256:   256 / 8,
1688	xArgXmm2M16:    16 / 8,
1689	xArgXmm2M32:    32 / 8,
1690	xArgXmm2M64:    64 / 8,
1691	xArgXmm:        128 / 8,
1692	xArgXmmM128:    128 / 8,
1693	xArgXmmM32:     32 / 8,
1694	xArgXmmM64:     64 / 8,
1695}
1696
1697// isCondJmp records the conditional jumps.
1698var isCondJmp = [maxOp + 1]bool{
1699	JA:  true,
1700	JAE: true,
1701	JB:  true,
1702	JBE: true,
1703	JE:  true,
1704	JG:  true,
1705	JGE: true,
1706	JL:  true,
1707	JLE: true,
1708	JNE: true,
1709	JNO: true,
1710	JNP: true,
1711	JNS: true,
1712	JO:  true,
1713	JP:  true,
1714	JS:  true,
1715}
1716
1717// isLoop records the loop operators.
1718var isLoop = [maxOp + 1]bool{
1719	LOOP:   true,
1720	LOOPE:  true,
1721	LOOPNE: true,
1722	JECXZ:  true,
1723	JRCXZ:  true,
1724}
1725