xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/aco_opcodes.py (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1#
2# Copyright (c) 2018 Valve Corporation
3#
4# SPDX-License-Identifier: MIT
5
6# Class that represents all the information we have about the opcode
7# NOTE: this must be kept in sync with aco_op_info
8
9import sys
10import itertools
11import collections
12from enum import Enum, IntEnum, auto
13from collections import namedtuple
14
class InstrClass(Enum):
   """Instruction class attached to every instruction as its `cls`.

   Each member's value is a lowercase string identifier.
   NOTE(review): presumably these strings are consumed by the code generators
   and must match names on the C++ side -- confirm before renaming any.
   """
   Valu32 = "valu32"
   ValuConvert32 = "valu_convert32"
   Valu64 = "valu64"
   ValuQuarterRate32 = "valu_quarter_rate32"
   ValuFma = "valu_fma"
   ValuTranscendental32 = "valu_transcendental32"
   ValuDouble = "valu_double"
   ValuDoubleAdd = "valu_double_add"
   ValuDoubleConvert = "valu_double_convert"
   ValuDoubleTranscendental = "valu_double_transcendental"
   ValuPseudoScalarTrans = "valu_pseudo_scalar_trans"
   WMMA = "wmma"
   Salu = "salu"
   SFPU = "sfpu"
   SMem = "smem"
   Barrier = "barrier"
   Branch = "branch"
   Sendmsg = "sendmsg"
   DS = "ds"
   Export = "exp"
   VMem = "vmem"
   Waitcnt = "waitcnt"
   # Default class used by insn() when none is given.
   Other = "other"
39
40# Representation of the instruction's microcode encoding format
41# Note: Some Vector ALU Formats can be combined, such that:
42# - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
43# - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
44# - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
45#
46# (*) The same is applicable for VOP1 and VOPC instructions.
class Format(IntEnum):
   """Microcode encoding format of an instruction.

   Members up to VOPD are numbered sequentially starting at 0. VOP1 and the
   members after it are distinct single bits (1 << 7 .. 1 << 14).
   """
   # Pseudo Instruction Formats
   PSEUDO = 0
   PSEUDO_BRANCH = auto()
   PSEUDO_BARRIER = auto()
   PSEUDO_REDUCTION = auto()
   # Scalar ALU & Control Formats
   SOP1 = auto()
   SOP2 = auto()
   SOPK = auto()
   SOPP = auto()
   SOPC = auto()
   # Scalar Memory Format
   SMEM = auto()
   # LDS/GDS Format
   DS = auto()
   LDSDIR = auto()
   # Vector Memory Buffer Formats
   MTBUF = auto()
   MUBUF = auto()
   # Vector Memory Image Format
   MIMG = auto()
   # Export Format
   EXP = auto()
   # Flat Formats
   FLAT = auto()
   GLOBAL = auto()
   SCRATCH = auto()
   # Vector Parameter Interpolation Formats
   VINTRP = auto()
   # Vector ALU Formats
   VINTERP_INREG = auto()
   VOPD = auto()
   VOP1 = 1 << 7
   VOP2 = 1 << 8
   VOPC = 1 << 9
   VOP3 = 1 << 10
   VOP3P = 1 << 11
   SDWA = 1 << 12
   DPP16 = 1 << 13
   DPP8 = 1 << 14

   def get_accessor(self):
      """Return the lowercase accessor name for this format.

      VOP3/VOP3P map to "valu", SOPP/SOPK to "salu" and the three flat-like
      formats to "flatlike". PSEUDO_BRANCH, PSEUDO_REDUCTION and
      PSEUDO_BARRIER map to the part after the underscore ("branch",
      "reduction", "barrier"). Every other format uses its own name in lower
      case.
      """
      if self in (Format.VOP3, Format.VOP3P):
         return "valu"
      elif self in (Format.SOPP, Format.SOPK):
         return "salu"
      elif self in (Format.FLAT, Format.GLOBAL, Format.SCRATCH):
         return "flatlike"
      elif self in (Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER):
         return self.name.split("_")[-1].lower()
      else:
         return self.name.lower()

   def get_builder_fields(self):
      """Return the format-specific builder fields as a list of tuples.

      Each tuple is (type, name, default) or (type, name, default, dest):
      - default is None when the field has no default value (see
        get_builder_field_decls());
      - dest, when present, is the name reported by get_builder_field_dests()
        instead of `name`.
      Formats without extra fields return an empty list.
      """
      if self == Format.SOPK:
         return [('uint32_t', 'imm', '0')]
      elif self == Format.SOPP:
         return [('uint32_t', 'imm', '0')]
      elif self == Format.SMEM:
         return [('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}')]
      elif self == Format.DS:
         return [('uint16_t', 'offset0', '0'),
                 ('uint8_t', 'offset1', '0'),
                 ('bool', 'gds', 'false')]
      elif self == Format.LDSDIR:
         return [('uint8_t', 'attr', 0),
                 ('uint8_t', 'attr_chan', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('uint8_t', 'wait_vdst', 15),
                 ('uint8_t', 'wait_vsrc', 1)]
      elif self == Format.MTBUF:
         return [('unsigned', 'dfmt', None),
                 ('unsigned', 'nfmt', None),
                 ('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                 ('bool', 'tfe', 'false')]
      elif self == Format.MUBUF:
         return [('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'addr64', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lds', 'false')]
      elif self == Format.MIMG:
         return [('unsigned', 'dmask', '0xF'),
                 ('bool', 'da', 'false'),
                 ('bool', 'unrm', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lwe', 'false'),
                 ('bool', 'r128', 'false'),
                 ('bool', 'a16', 'false'),
                 ('bool', 'd16', 'false')]
      elif self == Format.EXP:
         return [('unsigned', 'enabled_mask', None),
                 ('unsigned', 'dest', None),
                 ('bool', 'compr', 'false', 'compressed'),
                 ('bool', 'done', 'false'),
                 ('bool', 'vm', 'false', 'valid_mask')]
      elif self == Format.PSEUDO_BRANCH:
         return [('uint32_t', 'target0', '0', 'target[0]'),
                 ('uint32_t', 'target1', '0', 'target[1]')]
      elif self == Format.PSEUDO_REDUCTION:
         return [('ReduceOp', 'op', None, 'reduce_op'),
                 ('unsigned', 'cluster_size', '0')]
      elif self == Format.PSEUDO_BARRIER:
         return [('memory_sync_info', 'sync', None),
                 ('sync_scope', 'exec_scope', 'scope_invocation')]
      elif self == Format.VINTRP:
         return [('unsigned', 'attribute', None),
                 ('unsigned', 'component', None),
                 ('bool', 'high_16bits', 'false')]
      elif self == Format.DPP16:
         return [('uint16_t', 'dpp_ctrl', None),
                 ('uint8_t', 'row_mask', '0xF'),
                 ('uint8_t', 'bank_mask', '0xF'),
                 ('bool', 'bound_ctrl', 'true'),
                 ('bool', 'fetch_inactive', 'true')]
      elif self == Format.DPP8:
         return [('uint32_t', 'lane_sel', 0),
                 ('bool', 'fetch_inactive', 'true')]
      elif self == Format.VOP3P:
         return [('uint8_t', 'opsel_lo', None),
                 ('uint8_t', 'opsel_hi', None)]
      elif self == Format.VOPD:
         return [('aco_opcode', 'opy', None)]
      elif self == Format.VINTERP_INREG:
         return [('uint8_t', 'opsel', 0),
                 ('unsigned', 'wait_exp', 7)]
      elif self in (Format.FLAT, Format.GLOBAL, Format.SCRATCH):
         return [('int16_t', 'offset', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                 ('bool', 'lds', 'false'),
                 ('bool', 'nv', 'false')]
      else:
         return []

   def get_builder_field_names(self):
      """Return the parameter name (second tuple element) of every builder field."""
      return [field[1] for field in self.get_builder_fields()]

   def get_builder_field_dests(self):
      """Return, per builder field, its destination name.

      That is the optional fourth tuple element when present, otherwise the
      parameter name itself.
      """
      return [(field[3] if len(field) >= 4 else field[1]) for field in self.get_builder_fields()]

   def get_builder_field_decls(self):
      """Return one declaration string per builder field.

      The form is "type name=default", or just "type name" when the field's
      default is None.
      """
      decls = []
      for field in self.get_builder_fields():
         if field[2] is not None:
            decls.append('%s %s=%s' % (field[0], field[1], field[2]))
         else:
            decls.append('%s %s' % (field[0], field[1]))
      return decls

   def get_builder_initialization(self, num_operands):
      """Return source text that initialises format-specific instruction fields.

      Only SDWA, DPP16 and DPP8 produce any text; every other format returns
      an empty string.

      num_operands: operand count of the instruction. For SDWA at most the
      first two operands get a `sel[i]` assignment.
      """
      res = ''
      if self == Format.SDWA:
         for i in range(min(num_operands, 2)):
            res += 'instr->sdwa().sel[{0}] = SubdwordSel(op{0}.op.bytes(), 0, false);'.format(i)
         res += 'instr->sdwa().dst_sel = SubdwordSel(def0.bytes(), 0, false);\n'
      elif self == Format.DPP16:
         res += 'instr->dpp16().fetch_inactive &= program->gfx_level >= GFX10;\n'
      elif self == Format.DPP8:
         res += 'instr->dpp8().fetch_inactive &= program->gfx_level >= GFX10;\n'
      return res
215
216
# One encoding number per hardware generation field.
Opcode = namedtuple('Opcode', 'gfx6 gfx7 gfx8 gfx9 gfx10 gfx11 gfx12')
# The namedtuple 'defaults' keyword needs python 3.7+, so the defaults are set
# on __new__ directly to keep older versions working. Every field defaults
# to -1.
Opcode.__new__.__defaults__ = (-1,) * len(Opcode._fields)
221
class Instruction(object):
   """Everything that is known about one opcode.

   NOTE: this must be kept in sync with aco_op_info
   """
   def __init__(self, name, opcode, format, input_mod, output_mod, is_atomic, cls, definitions, operands):
      assert isinstance(name, str)
      assert isinstance(opcode, Opcode)
      assert isinstance(format, Format)
      assert isinstance(input_mod, bool)
      assert isinstance(output_mod, bool)
      assert isinstance(definitions, int)
      assert isinstance(operands, int)
      # gfx6/gfx7 (and gfx8/gfx9) must agree whenever both are given.
      assert -1 in (opcode.gfx6, opcode.gfx7) or opcode.gfx6 == opcode.gfx7
      assert -1 in (opcode.gfx8, opcode.gfx9) or opcode.gfx8 == opcode.gfx9

      def as_flag(value):
         # Booleans are stored as the strings "1" / "0".
         return "1" if value else "0"

      self.name = name
      self.op = opcode
      self.format = format
      self.cls = cls
      self.definitions = definitions
      self.operands = operands
      self.input_mod = as_flag(input_mod)
      self.output_mod = as_flag(output_mod)
      self.is_atomic = as_flag(is_atomic)

      # The data type is the last '_'-separated token of the name once any
      # '_e64' has been removed.
      op_dtype = name.replace('_e64', '').rsplit('_', 2)[-1]

      # Known data type suffixes (b/i/u/f followed by a bit width) and their size.
      dtype_sizes = {}
      for prefix in 'biuf':
         for size in (64, 32, 24, 16):
            dtype_sizes[prefix + str(size)] = size
      # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
      for prefix in 'biu':
         dtype_sizes[prefix + '16'] = 32

      # If we can't tell the operand size, default to 32.
      operand_size = dtype_sizes.get(op_dtype, 32)

      # exceptions for operands (first matching rule wins):
      zero_size_names = ('v_mad_u64_u32', 'v_mad_i64_i32',
                         'v_interp_p10_f16_f32_inreg', 'v_interp_p10_rtz_f16_f32_inreg',
                         'v_interp_p2_f16_f32_inreg', 'v_interp_p2_rtz_f16_f32_inreg')
      ubyte_cvt_names = ('v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
                         'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3')
      if 'qsad_' in name:
         operand_size = 0
      elif 'sad_' in name:
         operand_size = 32
      elif name in zero_size_names:
         operand_size = 0
      elif operand_size == 24:
         operand_size = 32
      elif op_dtype in ('u8', 'i8'):
         operand_size = 32
      elif name in ubyte_cvt_names:
         operand_size = 32

      self.operand_size = operand_size
275
276
# Matches PhysReg
# These constants are passed to src()/dst() in the instruction tables below.
VCC = 106
M0 = 124
EXEC_LO = 126
EXEC = 127 # Some instructions only write lo, so use exec_hi encoding here
SCC = 253
283
def src(op1 = 0, op2 = 0, op3 = 0, op4 = 0):
   """Pack up to four operand descriptors into one integer.

   op1 occupies the lowest byte position, op2 is shifted left by 8 bits,
   op3 by 16 and op4 by 24; the shifted values are OR-ed together.
   """
   packed = 0
   for slot, value in enumerate((op1, op2, op3, op4)):
      packed |= value << (8 * slot)
   return packed
286
def dst(def1 = 0, def2 = 0, def3 = 0, def4 = 0):
   """Pack up to four definition descriptors into one integer.

   def1 occupies the lowest byte position, def2 is shifted left by 8 bits,
   def3 by 16 and def4 by 24; the shifted values are OR-ed together.
   """
   packed = 0
   for value, shift in zip((def1, def2, def3, def4), (0, 8, 16, 24)):
      packed |= value << shift
   return packed
289
def op(*args, **kwargs):
   """Build an Opcode from per-generation encoding numbers.

   At most one positional argument is accepted; it is the value for the first
   Opcode field (gfx6). Keyword arguments name an Opcode field, e.g.
   gfx10=0x35. A field that is not given takes the value of the field before
   it; if the first field is not given it becomes -1.

   Raises ValueError if a keyword is not one of Opcode's fields.
   """
   enc = [None] * len(Opcode._fields)

   if args:
      assert len(args) == 1
      enc[0] = args[0]

   for gen, val in kwargs.items():
      enc[Opcode._fields.index(gen)] = val

   # Forward-fill: unspecified generations inherit the previous one's value.
   previous = -1
   for i, val in enumerate(enc):
      if val is None:
         enc[i] = previous
      previous = enc[i]

   return Opcode(*enc)
306
# global dictionary of instructions, keyed by instruction name
instructions = {}

def insn(name, opcode = Opcode(), format = Format.PSEUDO, cls = InstrClass.Other, input_mod = False, output_mod = False, is_atomic = False, definitions = 0, operands = 0):
   """Create an Instruction from the arguments and register it under `name`.

   A name may only be registered once; this is checked with an assert.
   """
   assert name not in instructions
   entry = Instruction(name = name,
                       opcode = opcode,
                       format = format,
                       input_mod = input_mod,
                       output_mod = output_mod,
                       is_atomic = is_atomic,
                       cls = cls,
                       definitions = definitions,
                       operands = operands)
   instructions[name] = entry
313
def default_class(instructions, cls):
   """Yield every entry of `instructions` with an InstrClass as last element.

   Entries (tuples) whose last element already is an InstrClass are yielded
   unchanged; all others are yielded with `cls` appended.
   """
   for entry in instructions:
      has_class = isinstance(entry[-1], InstrClass)
      yield entry if has_class else entry + (cls,)
320
# The export instruction followed by the pseudo-instructions. Calls that only
# pass a name use insn()'s defaults: Opcode(), Format.PSEUDO, InstrClass.Other
# and no definitions/operands.
insn("exp", op(0), format = Format.EXP, cls = InstrClass.Export)
insn("p_parallelcopy")
insn("p_startpgm")
insn("p_return")
insn("p_phi")
insn("p_linear_phi")
insn("p_boolean_phi")
insn("p_as_uniform")
insn("p_unit_test")

insn("p_create_vector")
insn("p_extract_vector")
insn("p_split_vector")

# start/end the parts where we can use exec based instructions
# implicitly
insn("p_logical_start")
insn("p_logical_end")

# e.g. subgroupMin() in SPIR-V
insn("p_reduce", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupInclusiveMin()
insn("p_inclusive_scan", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupExclusiveMin()
insn("p_exclusive_scan", format=Format.PSEUDO_REDUCTION)

insn("p_branch", format=Format.PSEUDO_BRANCH)
insn("p_cbranch", format=Format.PSEUDO_BRANCH)
insn("p_cbranch_z", format=Format.PSEUDO_BRANCH)
insn("p_cbranch_nz", format=Format.PSEUDO_BRANCH)

insn("p_barrier", format=Format.PSEUDO_BARRIER)

# Primitive Ordered Pixel Shading pseudo-instructions.

# For querying whether the current wave can enter the ordered section on GFX9-10.3, doing
# s_add_i32(pops_exiting_wave_id, op0), but in a way that it's different from a usual SALU
# instruction so that it's easier to maintain the volatility of pops_exiting_wave_id and to handle
# the polling specially in scheduling.
# Definitions:
# - Result SGPR;
# - Clobbered SCC.
# Operands:
# - s1 value to add, usually -(current_wave_ID + 1) (or ~current_wave_ID) to remap the exiting wave
#   ID from wrapping [0, 0x3FF] to monotonic [0, 0xFFFFFFFF].
insn("p_pops_gfx9_add_exiting_wave_id")

# Indicates that the wait for the completion of the ordered section in overlapped waves has been
# finished on GFX9-10.3. Not lowered to any hardware instructions.
insn("p_pops_gfx9_overlapped_wave_wait_done")

# Indicates that a POPS ordered section has ended, hints that overlapping waves can possibly
# continue execution. The overlapping waves may actually be resumed by this instruction or anywhere
# later, however, especially taking into account the fact that there can be multiple ordered
# sections in a wave (for instance, if one is chosen in divergent control flow in the source
# shader), thus multiple p_pops_gfx9_ordered_section_done instructions. At least one must be present
# in the program if POPS is used, however, otherwise the location of the end of the ordered section
# will be undefined. Only needed on GFX9-10.3 (GFX11+ ordered section is until the last export,
# can't be exited early). Not lowered to any hardware instructions.
insn("p_pops_gfx9_ordered_section_done")

insn("p_spill")
insn("p_reload")

# Start/end linear vgprs. p_start_linear_vgpr can take an operand to copy from, into the linear vgpr
insn("p_start_linear_vgpr")
insn("p_end_linear_vgpr")

insn("p_end_wqm")
insn("p_discard_if")
insn("p_demote_to_helper")
insn("p_is_helper")
insn("p_exit_early_if")

# simulates proper bpermute behavior using v_readlane_b32
# definitions: result VGPR, temp EXEC, clobbered VCC
# operands: index, input data
insn("p_bpermute_readlane")

# simulates proper wave64 bpermute behavior using shared vgprs (for GFX10/10.3)
# definitions: result VGPR, temp EXEC, clobbered SCC
# operands: index * 4, input data, same half (bool)
insn("p_bpermute_shared_vgpr")

# simulates proper wave64 bpermute behavior using v_permlane64_b32 (for GFX11+)
# definitions: result VGPR, temp EXEC, clobbered SCC
# operands: linear VGPR, index * 4, input data, same half (bool)
insn("p_bpermute_permlane")

# creates a lane mask where only the first active lane is selected
insn("p_elect")

insn("p_constaddr")
insn("p_resume_shader_address")

# These don't have to be pseudo-ops, but it makes optimization easier to only
# have to consider two instructions.
# (src0 >> (index * bits)) & ((1 << bits) - 1) with optional sign extension
insn("p_extract") # src1=index, src2=bits, src3=signext
# (src0 & ((1 << bits) - 1)) << (index * bits)
insn("p_insert") # src1=index, src2=bits

insn("p_init_scratch")

# jumps to a shader epilog
insn("p_jump_to_epilog")

# loads and interpolates a fragment shader input with a correct exec mask
# (two operand layouts are listed)
# dst0=result, src0=linear_vgpr, src1=attribute, src2=component, src3=high_16bits, src4=coord1, src5=coord2, src6=m0
# dst0=result, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
insn("p_interp_gfx11")

# performs dual source MRTs swizzling and emits exports on GFX11
insn("p_dual_src_export_gfx11")

# Lets a shader end with specific registers set to the wanted values; used by
# multi-part shaders to pass arguments to the next part.
insn("p_end_with_regs")

insn("p_shader_cycles_hi_lo_hi")
441
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
# Each entry is (name, dst(...), src(...), op(...)) with an optional trailing
# InstrClass; entries without one get InstrClass.Salu from default_class().
# NOTE(review): the dst()/src() arguments look like either a size (1, 2) or one
# of the fixed register constants (SCC, ...) -- confirm against the consumer of
# Instruction.definitions / Instruction.operands.
# NOTE(review): this is a set literal, so the order in which the loop below
# registers the entries is not a defined order (str hashing varies per process
# unless PYTHONHASHSEED is fixed) -- confirm nothing relies on the insertion
# order of `instructions`.
SOP2 = {
   ("s_add_u32",            dst(1, SCC), src(1, 1), op(0x00)),
   ("s_sub_u32",            dst(1, SCC), src(1, 1), op(0x01)),
   ("s_add_i32",            dst(1, SCC), src(1, 1), op(0x02)),
   ("s_sub_i32",            dst(1, SCC), src(1, 1), op(0x03)),
   ("s_addc_u32",           dst(1, SCC), src(1, 1, SCC), op(0x04)),
   ("s_subb_u32",           dst(1, SCC), src(1, 1, SCC), op(0x05)),
   ("s_min_i32",            dst(1, SCC), src(1, 1), op(0x06, gfx11=0x12)),
   ("s_min_u32",            dst(1, SCC), src(1, 1), op(0x07, gfx11=0x13)),
   ("s_max_i32",            dst(1, SCC), src(1, 1), op(0x08, gfx11=0x14)),
   ("s_max_u32",            dst(1, SCC), src(1, 1), op(0x09, gfx11=0x15)),
   ("s_cselect_b32",        dst(1), src(1, 1, SCC), op(0x0a, gfx11=0x30)),
   ("s_cselect_b64",        dst(2), src(2, 2, SCC), op(0x0b, gfx11=0x31)),
   ("s_and_b32",            dst(1, SCC), src(1, 1), op(0x0e, gfx8=0x0c, gfx10=0x0e, gfx11=0x16)),
   ("s_and_b64",            dst(2, SCC), src(2, 2), op(0x0f, gfx8=0x0d, gfx10=0x0f, gfx11=0x17)),
   ("s_or_b32",             dst(1, SCC), src(1, 1), op(0x10, gfx8=0x0e, gfx10=0x10, gfx11=0x18)),
   ("s_or_b64",             dst(2, SCC), src(2, 2), op(0x11, gfx8=0x0f, gfx10=0x11, gfx11=0x19)),
   ("s_xor_b32",            dst(1, SCC), src(1, 1), op(0x12, gfx8=0x10, gfx10=0x12, gfx11=0x1a)),
   ("s_xor_b64",            dst(2, SCC), src(2, 2), op(0x13, gfx8=0x11, gfx10=0x13, gfx11=0x1b)),
   ("s_andn2_b32",          dst(1, SCC), src(1, 1), op(0x14, gfx8=0x12, gfx10=0x14, gfx11=0x22)), #s_and_not1_b32 in GFX11
   ("s_andn2_b64",          dst(2, SCC), src(2, 2), op(0x15, gfx8=0x13, gfx10=0x15, gfx11=0x23)), #s_and_not1_b64 in GFX11
   ("s_orn2_b32",           dst(1, SCC), src(1, 1), op(0x16, gfx8=0x14, gfx10=0x16, gfx11=0x24)), #s_or_not1_b32 in GFX11
   ("s_orn2_b64",           dst(2, SCC), src(2, 2), op(0x17, gfx8=0x15, gfx10=0x17, gfx11=0x25)), #s_or_not1_b64 in GFX11
   ("s_nand_b32",           dst(1, SCC), src(1, 1), op(0x18, gfx8=0x16, gfx10=0x18, gfx11=0x1c)),
   ("s_nand_b64",           dst(2, SCC), src(2, 2), op(0x19, gfx8=0x17, gfx10=0x19, gfx11=0x1d)),
   ("s_nor_b32",            dst(1, SCC), src(1, 1), op(0x1a, gfx8=0x18, gfx10=0x1a, gfx11=0x1e)),
   ("s_nor_b64",            dst(2, SCC), src(2, 2), op(0x1b, gfx8=0x19, gfx10=0x1b, gfx11=0x1f)),
   ("s_xnor_b32",           dst(1, SCC), src(1, 1), op(0x1c, gfx8=0x1a, gfx10=0x1c, gfx11=0x20)),
   ("s_xnor_b64",           dst(2, SCC), src(2, 2), op(0x1d, gfx8=0x1b, gfx10=0x1d, gfx11=0x21)),
   ("s_lshl_b32",           dst(1, SCC), src(1, 1), op(0x1e, gfx8=0x1c, gfx10=0x1e, gfx11=0x08)),
   ("s_lshl_b64",           dst(2, SCC), src(2, 1), op(0x1f, gfx8=0x1d, gfx10=0x1f, gfx11=0x09)),
   ("s_lshr_b32",           dst(1, SCC), src(1, 1), op(0x20, gfx8=0x1e, gfx10=0x20, gfx11=0x0a)),
   ("s_lshr_b64",           dst(2, SCC), src(2, 1), op(0x21, gfx8=0x1f, gfx10=0x21, gfx11=0x0b)),
   ("s_ashr_i32",           dst(1, SCC), src(1, 1), op(0x22, gfx8=0x20, gfx10=0x22, gfx11=0x0c)),
   ("s_ashr_i64",           dst(2, SCC), src(2, 1), op(0x23, gfx8=0x21, gfx10=0x23, gfx11=0x0d)),
   ("s_bfm_b32",            dst(1), src(1, 1), op(0x24, gfx8=0x22, gfx10=0x24, gfx11=0x2a)),
   ("s_bfm_b64",            dst(2), src(1, 1), op(0x25, gfx8=0x23, gfx10=0x25, gfx11=0x2b)),
   ("s_mul_i32",            dst(1), src(1, 1), op(0x26, gfx8=0x24, gfx10=0x26, gfx11=0x2c)),
   ("s_bfe_u32",            dst(1, SCC), src(1, 1), op(0x27, gfx8=0x25, gfx10=0x27, gfx11=0x26)),
   ("s_bfe_i32",            dst(1, SCC), src(1, 1), op(0x28, gfx8=0x26, gfx10=0x28, gfx11=0x27)),
   ("s_bfe_u64",            dst(2, SCC), src(2, 1), op(0x29, gfx8=0x27, gfx10=0x29, gfx11=0x28)),
   ("s_bfe_i64",            dst(2, SCC), src(2, 1), op(0x2a, gfx8=0x28, gfx10=0x2a, gfx11=0x29)),
   ("s_cbranch_g_fork",     dst(), src(), op(0x2b, gfx8=0x29, gfx10=-1), InstrClass.Branch),
   ("s_absdiff_i32",        dst(1, SCC), src(1, 1), op(0x2c, gfx8=0x2a, gfx10=0x2c, gfx11=0x06)),
   ("s_rfe_restore_b64",    dst(), src(), op(gfx8=0x2b, gfx10=-1), InstrClass.Branch),
   ("s_lshl1_add_u32",      dst(1, SCC), src(1, 1), op(gfx9=0x2e, gfx11=0x0e)),
   ("s_lshl2_add_u32",      dst(1, SCC), src(1, 1), op(gfx9=0x2f, gfx11=0x0f)),
   ("s_lshl3_add_u32",      dst(1, SCC), src(1, 1), op(gfx9=0x30, gfx11=0x10)),
   ("s_lshl4_add_u32",      dst(1, SCC), src(1, 1), op(gfx9=0x31, gfx11=0x11)),
   ("s_pack_ll_b32_b16",    dst(1), src(1, 1), op(gfx9=0x32)),
   ("s_pack_lh_b32_b16",    dst(1), src(1, 1), op(gfx9=0x33)),
   ("s_pack_hh_b32_b16",    dst(1), src(1, 1), op(gfx9=0x34)),
   ("s_pack_hl_b32_b16",    dst(1), src(1, 1), op(gfx11=0x35)),
   ("s_mul_hi_u32",         dst(1), src(1, 1), op(gfx9=0x2c, gfx10=0x35, gfx11=0x2d)),
   ("s_mul_hi_i32",         dst(1), src(1, 1), op(gfx9=0x2d, gfx10=0x36, gfx11=0x2e)),
   ("s_add_f32",            dst(1), src(1, 1), op(gfx11=0x40), InstrClass.SFPU),
   ("s_sub_f32",            dst(1), src(1, 1), op(gfx11=0x41), InstrClass.SFPU),
   ("s_min_f32",            dst(1), src(1, 1), op(gfx11=0x42), InstrClass.SFPU),
   ("s_max_f32",            dst(1), src(1, 1), op(gfx11=0x43), InstrClass.SFPU),
   ("s_mul_f32",            dst(1), src(1, 1), op(gfx11=0x44), InstrClass.SFPU),
   ("s_fmaak_f32",          dst(1), src(1, 1, 1), op(gfx11=0x45), InstrClass.SFPU),
   ("s_fmamk_f32",          dst(1), src(1, 1, 1), op(gfx11=0x46), InstrClass.SFPU),
   ("s_fmac_f32",           dst(1), src(1, 1, 1), op(gfx11=0x47), InstrClass.SFPU),
   ("s_cvt_pk_rtz_f16_f32", dst(1), src(1, 1), op(gfx11=0x48), InstrClass.SFPU),
   ("s_add_f16",            dst(1), src(1, 1), op(gfx11=0x49), InstrClass.SFPU),
   ("s_sub_f16",            dst(1), src(1, 1), op(gfx11=0x4a), InstrClass.SFPU),
   ("s_min_f16",            dst(1), src(1, 1), op(gfx11=0x4b), InstrClass.SFPU),
   ("s_max_f16",            dst(1), src(1, 1), op(gfx11=0x4c), InstrClass.SFPU),
   ("s_mul_f16",            dst(1), src(1, 1), op(gfx11=0x4d), InstrClass.SFPU),
   ("s_fmac_f16",           dst(1), src(1, 1, 1), op(gfx11=0x4e), InstrClass.SFPU),
   ("s_minimum_f32",        dst(1), src(1, 1), op(gfx12=0x4f), InstrClass.SFPU),
   ("s_maximum_f32",        dst(1), src(1, 1), op(gfx12=0x50), InstrClass.SFPU),
   ("s_minimum_f16",        dst(1), src(1, 1), op(gfx12=0x51), InstrClass.SFPU),
   ("s_maximum_f16",        dst(1), src(1, 1), op(gfx12=0x52), InstrClass.SFPU),
   ("s_add_u64",            dst(2), src(2, 2), op(gfx12=0x53)),
   ("s_sub_u64",            dst(2), src(2, 2), op(gfx12=0x54)),
   ("s_mul_u64",            dst(2), src(2, 2), op(gfx12=0x55)),
   # Actually pseudo-instructions. They are lowered to SALU during assembly though, so it's useful to identify them as SOP2.
   ("p_constaddr_addlo",    dst(1, SCC), src(1, 1, 1), op(-1)),
   ("p_resumeaddr_addlo",   dst(1, SCC), src(1, 1, 1), op(-1)),
}
# Register every entry as a Format.SOP2 instruction.
for (name, defs, ops, num, cls) in default_class(SOP2, InstrClass.Salu):
    insn(name, num, Format.SOP2, cls, definitions = defs, operands = ops)
526
527
# SOPK instructions: 0 input (+ imm), 1 output + optional scc
# Each entry is (name, dst(...), src(...), op(...)) with an optional trailing
# InstrClass; entries without one get InstrClass.Salu from default_class().
# NOTE(review): this is a set literal, so the order in which the loop below
# registers the entries is not a defined order -- confirm nothing relies on
# the insertion order of `instructions`.
SOPK = {
   ("s_movk_i32",             dst(1), src(), op(0x00)),
   ("s_version",              dst(), src(), op(gfx10=0x01)),
   ("s_cmovk_i32",            dst(1), src(1, SCC), op(0x02, gfx8=0x01, gfx10=0x02)),
   ("s_cmpk_eq_i32",          dst(SCC), src(1), op(0x03, gfx8=0x02, gfx10=0x03, gfx12=-1)),
   ("s_cmpk_lg_i32",          dst(SCC), src(1), op(0x04, gfx8=0x03, gfx10=0x04, gfx12=-1)),
   ("s_cmpk_gt_i32",          dst(SCC), src(1), op(0x05, gfx8=0x04, gfx10=0x05, gfx12=-1)),
   ("s_cmpk_ge_i32",          dst(SCC), src(1), op(0x06, gfx8=0x05, gfx10=0x06, gfx12=-1)),
   ("s_cmpk_lt_i32",          dst(SCC), src(1), op(0x07, gfx8=0x06, gfx10=0x07, gfx12=-1)),
   ("s_cmpk_le_i32",          dst(SCC), src(1), op(0x08, gfx8=0x07, gfx10=0x08, gfx12=-1)),
   ("s_cmpk_eq_u32",          dst(SCC), src(1), op(0x09, gfx8=0x08, gfx10=0x09, gfx12=-1)),
   ("s_cmpk_lg_u32",          dst(SCC), src(1), op(0x0a, gfx8=0x09, gfx10=0x0a, gfx12=-1)),
   ("s_cmpk_gt_u32",          dst(SCC), src(1), op(0x0b, gfx8=0x0a, gfx10=0x0b, gfx12=-1)),
   ("s_cmpk_ge_u32",          dst(SCC), src(1), op(0x0c, gfx8=0x0b, gfx10=0x0c, gfx12=-1)),
   ("s_cmpk_lt_u32",          dst(SCC), src(1), op(0x0d, gfx8=0x0c, gfx10=0x0d, gfx12=-1)),
   ("s_cmpk_le_u32",          dst(SCC), src(1), op(0x0e, gfx8=0x0d, gfx10=0x0e, gfx12=-1)),
   ("s_addk_i32",             dst(1, SCC), src(1), op(0x0f, gfx8=0x0e, gfx10=0x0f)),
   ("s_mulk_i32",             dst(1), src(1), op(0x10, gfx8=0x0f, gfx10=0x10)),
   ("s_cbranch_i_fork",       dst(), src(), op(0x11, gfx8=0x10, gfx10=-1), InstrClass.Branch),
   ("s_getreg_b32",           dst(1), src(), op(0x12, gfx8=0x11, gfx10=0x12, gfx11=0x11)),
   ("s_setreg_b32",           dst(), src(1), op(0x13, gfx8=0x12, gfx10=0x13, gfx11=0x12)),
   ("s_setreg_imm32_b32",     dst(), src(1), op(0x15, gfx8=0x14, gfx10=0x15, gfx11=0x13)), # requires 32bit literal
   ("s_call_b64",             dst(2), src(), op(gfx8=0x15, gfx10=0x16, gfx11=0x14), InstrClass.Branch),
   ("s_waitcnt_vscnt",        dst(), src(1), op(gfx10=0x17, gfx11=0x18, gfx12=-1), InstrClass.Waitcnt),
   ("s_waitcnt_vmcnt",        dst(), src(1), op(gfx10=0x18, gfx11=0x19, gfx12=-1), InstrClass.Waitcnt),
   ("s_waitcnt_expcnt",       dst(), src(1), op(gfx10=0x19, gfx11=0x1a, gfx12=-1), InstrClass.Waitcnt),
   ("s_waitcnt_lgkmcnt",      dst(), src(1), op(gfx10=0x1a, gfx11=0x1b, gfx12=-1), InstrClass.Waitcnt),
   ("s_subvector_loop_begin", dst(), src(), op(gfx10=0x1b, gfx11=0x16, gfx12=-1), InstrClass.Branch),
   ("s_subvector_loop_end",   dst(), src(), op(gfx10=0x1c, gfx11=0x17, gfx12=-1), InstrClass.Branch),
}
# Register every entry as a Format.SOPK instruction.
for (name, defs, ops, num, cls) in default_class(SOPK, InstrClass.Salu):
   insn(name, num, Format.SOPK, cls, definitions = defs, operands = ops)
561
562
563# SOP1 instructions: 1 input, 1 output (+optional SCC)
564SOP1 = {
565   ("s_mov_b32",                dst(1), src(1), op(0x03, gfx8=0x00, gfx10=0x03, gfx11=0x00)),
566   ("s_mov_b64",                dst(2), src(2), op(0x04, gfx8=0x01, gfx10=0x04, gfx11=0x01)),
567   ("s_cmov_b32",               dst(1), src(1, 1, SCC), op(0x05, gfx8=0x02, gfx10=0x05, gfx11=0x02)),
568   ("s_cmov_b64",               dst(2), src(2, 2, SCC), op(0x06, gfx8=0x03, gfx10=0x06, gfx11=0x03)),
569   ("s_not_b32",                dst(1, SCC), src(1), op(0x07, gfx8=0x04, gfx10=0x07, gfx11=0x1e)),
570   ("s_not_b64",                dst(2, SCC), src(2), op(0x08, gfx8=0x05, gfx10=0x08, gfx11=0x1f)),
571   ("s_wqm_b32",                dst(1, SCC), src(1), op(0x09, gfx8=0x06, gfx10=0x09, gfx11=0x1c)),
572   ("s_wqm_b64",                dst(2, SCC), src(2), op(0x0a, gfx8=0x07, gfx10=0x0a, gfx11=0x1d)),
573   ("s_brev_b32",               dst(1), src(1), op(0x0b, gfx8=0x08, gfx10=0x0b, gfx11=0x04)),
574   ("s_brev_b64",               dst(2), src(2), op(0x0c, gfx8=0x09, gfx10=0x0c, gfx11=0x05)),
575   ("s_bcnt0_i32_b32",          dst(1, SCC), src(1), op(0x0d, gfx8=0x0a, gfx10=0x0d, gfx11=0x16)),
576   ("s_bcnt0_i32_b64",          dst(1, SCC), src(2), op(0x0e, gfx8=0x0b, gfx10=0x0e, gfx11=0x17)),
577   ("s_bcnt1_i32_b32",          dst(1, SCC), src(1), op(0x0f, gfx8=0x0c, gfx10=0x0f, gfx11=0x18)),
578   ("s_bcnt1_i32_b64",          dst(1, SCC), src(2), op(0x10, gfx8=0x0d, gfx10=0x10, gfx11=0x19)),
579   ("s_ff0_i32_b32",            dst(1), src(1), op(0x11, gfx8=0x0e, gfx10=0x11, gfx11=-1)),
580   ("s_ff0_i32_b64",            dst(1), src(2), op(0x12, gfx8=0x0f, gfx10=0x12, gfx11=-1)),
581   ("s_ff1_i32_b32",            dst(1), src(1), op(0x13, gfx8=0x10, gfx10=0x13, gfx11=0x08)), #s_ctz_i32_b32 in GFX11
582   ("s_ff1_i32_b64",            dst(1), src(2), op(0x14, gfx8=0x11, gfx10=0x14, gfx11=0x09)), #s_ctz_i32_b64 in GFX11
583   ("s_flbit_i32_b32",          dst(1), src(1), op(0x15, gfx8=0x12, gfx10=0x15, gfx11=0x0a)), #s_clz_i32_u32 in GFX11
584   ("s_flbit_i32_b64",          dst(1), src(2), op(0x16, gfx8=0x13, gfx10=0x16, gfx11=0x0b)), #s_clz_i32_u64 in GFX11
585   ("s_flbit_i32",              dst(1), src(1), op(0x17, gfx8=0x14, gfx10=0x17, gfx11=0x0c)), #s_cls_i32 in GFX11
586   ("s_flbit_i32_i64",          dst(1), src(2), op(0x18, gfx8=0x15, gfx10=0x18, gfx11=0x0d)), #s_cls_i32_i64 in GFX11
587   ("s_sext_i32_i8",            dst(1), src(1), op(0x19, gfx8=0x16, gfx10=0x19, gfx11=0x0e)),
588   ("s_sext_i32_i16",           dst(1), src(1), op(0x1a, gfx8=0x17, gfx10=0x1a, gfx11=0x0f)),
589   ("s_bitset0_b32",            dst(1), src(1, 1), op(0x1b, gfx8=0x18, gfx10=0x1b, gfx11=0x10)),
590   ("s_bitset0_b64",            dst(2), src(1, 2), op(0x1c, gfx8=0x19, gfx10=0x1c, gfx11=0x11)),
591   ("s_bitset1_b32",            dst(1), src(1, 1), op(0x1d, gfx8=0x1a, gfx10=0x1d, gfx11=0x12)),
592   ("s_bitset1_b64",            dst(2), src(1, 2), op(0x1e, gfx8=0x1b, gfx10=0x1e, gfx11=0x13)),
593   ("s_getpc_b64",              dst(2), src(), op(0x1f, gfx8=0x1c, gfx10=0x1f, gfx11=0x47)),
594   ("s_setpc_b64",              dst(), src(2), op(0x20, gfx8=0x1d, gfx10=0x20, gfx11=0x48), InstrClass.Branch),
595   ("s_swappc_b64",             dst(2), src(2), op(0x21, gfx8=0x1e, gfx10=0x21, gfx11=0x49), InstrClass.Branch),
596   ("s_rfe_b64",                dst(), src(2), op(0x22, gfx8=0x1f, gfx10=0x22, gfx11=0x4a), InstrClass.Branch),
597   ("s_and_saveexec_b64",       dst(2, SCC, EXEC), src(2, EXEC), op(0x24, gfx8=0x20, gfx10=0x24, gfx11=0x21)),
598   ("s_or_saveexec_b64",        dst(2, SCC, EXEC), src(2, EXEC), op(0x25, gfx8=0x21, gfx10=0x25, gfx11=0x23)),
599   ("s_xor_saveexec_b64",       dst(2, SCC, EXEC), src(2, EXEC), op(0x26, gfx8=0x22, gfx10=0x26, gfx11=0x25)),
600   ("s_andn2_saveexec_b64",     dst(2, SCC, EXEC), src(2, EXEC), op(0x27, gfx8=0x23, gfx10=0x27, gfx11=0x31)), #s_and_not1_saveexec_b64 in GFX11
601   ("s_orn2_saveexec_b64",      dst(2, SCC, EXEC), src(2, EXEC), op(0x28, gfx8=0x24, gfx10=0x28, gfx11=0x33)), #s_or_not1_saveexec_b64 in GFX11
602   ("s_nand_saveexec_b64",      dst(2, SCC, EXEC), src(2, EXEC), op(0x29, gfx8=0x25, gfx10=0x29, gfx11=0x27)),
603   ("s_nor_saveexec_b64",       dst(2, SCC, EXEC), src(2, EXEC), op(0x2a, gfx8=0x26, gfx10=0x2a, gfx11=0x29)),
604   ("s_xnor_saveexec_b64",      dst(2, SCC, EXEC), src(2, EXEC), op(0x2b, gfx8=0x27, gfx10=0x2b)),
605   ("s_quadmask_b32",           dst(1, SCC), src(1), op(0x2c, gfx8=0x28, gfx10=0x2c, gfx11=0x1a)),
606   ("s_quadmask_b64",           dst(2, SCC), src(2), op(0x2d, gfx8=0x29, gfx10=0x2d, gfx11=0x1b)), # Always writes 0 to the second SGPR
607   ("s_movrels_b32",            dst(1), src(1, M0), op(0x2e, gfx8=0x2a, gfx10=0x2e, gfx11=0x40)),
608   ("s_movrels_b64",            dst(2), src(2, M0), op(0x2f, gfx8=0x2b, gfx10=0x2f, gfx11=0x41)),
609   ("s_movreld_b32",            dst(1), src(1, M0), op(0x30, gfx8=0x2c, gfx10=0x30, gfx11=0x42)),
610   ("s_movreld_b64",            dst(2), src(2, M0), op(0x31, gfx8=0x2d, gfx10=0x31, gfx11=0x43)),
611   ("s_cbranch_join",           dst(), src(), op(0x32, gfx8=0x2e, gfx10=-1), InstrClass.Branch),
612   ("s_abs_i32",                dst(1, SCC), src(1), op(0x34, gfx8=0x30, gfx10=0x34, gfx11=0x15)),
613   ("s_mov_fed_b32",            dst(), src(), op(0x35, gfx8=-1, gfx10=0x35, gfx11=-1)),
614   ("s_set_gpr_idx_idx",        dst(M0), src(1, M0), op(gfx8=0x32, gfx10=-1)),
615   ("s_andn1_saveexec_b64",     dst(2, SCC, EXEC), src(2, EXEC), op(gfx9=0x33, gfx10=0x37, gfx11=0x2d)), #s_and_not0_savexec_b64 in GFX11
616   ("s_orn1_saveexec_b64",      dst(2, SCC, EXEC), src(2, EXEC), op(gfx9=0x34, gfx10=0x38, gfx11=0x2f)), #s_or_not0_savexec_b64 in GFX11
617   ("s_andn1_wrexec_b64",       dst(2, SCC, EXEC), src(2, EXEC), op(gfx9=0x35, gfx10=0x39, gfx11=0x35)), #s_and_not0_wrexec_b64 in GFX11
618   ("s_andn2_wrexec_b64",       dst(2, SCC, EXEC), src(2, EXEC), op(gfx9=0x36, gfx10=0x3a, gfx11=0x37)), #s_and_not1_wrexec_b64 in GFX11
619   ("s_bitreplicate_b64_b32",   dst(2), src(1), op(gfx9=0x37, gfx10=0x3b, gfx11=0x14)),
620   ("s_and_saveexec_b32",       dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x3c, gfx11=0x20)),
621   ("s_or_saveexec_b32",        dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x3d, gfx11=0x22)),
622   ("s_xor_saveexec_b32",       dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x3e, gfx11=0x24)),
623   ("s_andn2_saveexec_b32",     dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x3f, gfx11=0x30)), #s_and_not1_saveexec_b32 in GFX11
624   ("s_orn2_saveexec_b32",      dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x40, gfx11=0x32)), #s_or_not1_saveexec_b32 in GFX11
625   ("s_nand_saveexec_b32",      dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x41, gfx11=0x26)),
626   ("s_nor_saveexec_b32",       dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x42, gfx11=0x28)),
627   ("s_xnor_saveexec_b32",      dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x43, gfx11=0x2a)),
628   ("s_andn1_saveexec_b32",     dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x44, gfx11=0x2c)), #s_and_not0_savexec_b32 in GFX11
629   ("s_orn1_saveexec_b32",      dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x45, gfx11=0x2e)), #s_or_not0_savexec_b32 in GFX11
630   ("s_andn1_wrexec_b32",       dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x46, gfx11=0x34)), #s_and_not0_wrexec_b32 in GFX11
631   ("s_andn2_wrexec_b32",       dst(1, SCC, EXEC_LO), src(1, EXEC_LO), op(gfx10=0x47, gfx11=0x36)), #s_and_not1_wrexec_b32 in GFX11
632   ("s_movrelsd_2_b32",         dst(1), src(1, M0), op(gfx10=0x49, gfx11=0x44)),
633   ("s_sendmsg_rtn_b32",        dst(1), src(1), op(gfx11=0x4c)),
634   ("s_sendmsg_rtn_b64",        dst(2), src(1), op(gfx11=0x4d)),
635   ("s_ceil_f32",               dst(1), src(1), op(gfx11=0x60), InstrClass.SFPU),
636   ("s_floor_f32",              dst(1), src(1), op(gfx11=0x61), InstrClass.SFPU),
637   ("s_trunc_f32",              dst(1), src(1), op(gfx11=0x62), InstrClass.SFPU),
638   ("s_rndne_f32",              dst(1), src(1), op(gfx11=0x63), InstrClass.SFPU),
639   ("s_cvt_f32_i32",            dst(1), src(1), op(gfx11=0x64), InstrClass.SFPU),
640   ("s_cvt_f32_u32",            dst(1), src(1), op(gfx11=0x65), InstrClass.SFPU),
641   ("s_cvt_i32_f32",            dst(1), src(1), op(gfx11=0x66), InstrClass.SFPU),
642   ("s_cvt_u32_f32",            dst(1), src(1), op(gfx11=0x67), InstrClass.SFPU),
643   ("s_cvt_f16_f32",            dst(1), src(1), op(gfx11=0x68), InstrClass.SFPU),
644   ("p_s_cvt_f16_f32_rtne",     dst(1), src(1), op(-1), InstrClass.SFPU),
645   ("s_cvt_f32_f16",            dst(1), src(1), op(gfx11=0x69), InstrClass.SFPU),
646   ("s_cvt_hi_f32_f16",         dst(1), src(1), op(gfx11=0x6a), InstrClass.SFPU),
647   ("s_ceil_f16",               dst(1), src(1), op(gfx11=0x6b), InstrClass.SFPU),
648   ("s_floor_f16",              dst(1), src(1), op(gfx11=0x6c), InstrClass.SFPU),
649   ("s_trunc_f16",              dst(1), src(1), op(gfx11=0x6d), InstrClass.SFPU),
650   ("s_rndne_f16",              dst(1), src(1), op(gfx11=0x6e), InstrClass.SFPU),
651   ("s_barrier_signal",         dst(), src(1), op(gfx12=0x4e), InstrClass.Barrier),
652   ("s_barrier_signal_isfirst", dst(SCC), src(1), op(gfx12=0x4f), InstrClass.Barrier),
653   ("s_get_barrier_state",      dst(1), src(1), op(gfx12=0x50), InstrClass.Barrier),
654   ("s_barrier_init",           dst(), src(1), op(gfx12=0x51), InstrClass.Barrier),
655   ("s_barrier_join",           dst(), src(1), op(gfx12=0x52), InstrClass.Barrier),
656   ("s_wakeup_barrier",         dst(), src(1), op(gfx12=0x57), InstrClass.Barrier),
657   ("s_sleep_var",              dst(), src(1), op(gfx12=0x58)),
658   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
659   ("p_constaddr_getpc",        dst(2), src(1), op(-1)),
660   ("p_resumeaddr_getpc",       dst(2), src(1), op(-1)),
661   ("p_load_symbol",            dst(1), src(1), op(-1)),
662}
# Register every SOP1 table entry as a Format.SOP1 instruction.
# The loop unpacks five fields per entry although most entries only carry four,
# so default_class() presumably appends InstrClass.Salu to entries that do not
# name an InstrClass themselves -- TODO confirm against default_class().
for (name, defs, ops, num, cls) in default_class(SOP1, InstrClass.Salu):
   insn(name, num, Format.SOP1, cls, definitions = defs, operands = ops)
665
666
# SOPC instructions: 2 inputs and 0 outputs (+SCC)
#
# Entry layout: (name, definitions, operands, opcode numbers[, InstrClass]).
# - dst(...) / src(...) describe the definitions and operands that are handed
#   to insn() below.
# - op(...) lists the hardware opcode number per GPU generation; judging by the
#   entries, a positional value is the baseline, gfxN= keywords override it
#   from that generation on, and -1 marks a generation without the
#   instruction -- TODO confirm against op()'s definition.
# - The optional trailing InstrClass overrides the default class used by the
#   registration loop (here only the scalar float compares, tagged SFPU).
SOPC = {
   ("s_cmp_eq_i32",     dst(SCC), src(1, 1), op(0x00)),
   ("s_cmp_lg_i32",     dst(SCC), src(1, 1), op(0x01)),
   ("s_cmp_gt_i32",     dst(SCC), src(1, 1), op(0x02)),
   ("s_cmp_ge_i32",     dst(SCC), src(1, 1), op(0x03)),
   ("s_cmp_lt_i32",     dst(SCC), src(1, 1), op(0x04)),
   ("s_cmp_le_i32",     dst(SCC), src(1, 1), op(0x05)),
   ("s_cmp_eq_u32",     dst(SCC), src(1, 1), op(0x06)),
   ("s_cmp_lg_u32",     dst(SCC), src(1, 1), op(0x07)),
   ("s_cmp_gt_u32",     dst(SCC), src(1, 1), op(0x08)),
   ("s_cmp_ge_u32",     dst(SCC), src(1, 1), op(0x09)),
   ("s_cmp_lt_u32",     dst(SCC), src(1, 1), op(0x0a)),
   ("s_cmp_le_u32",     dst(SCC), src(1, 1), op(0x0b)),
   ("s_bitcmp0_b32",    dst(SCC), src(1, 1), op(0x0c)),
   ("s_bitcmp1_b32",    dst(SCC), src(1, 1), op(0x0d)),
   ("s_bitcmp0_b64",    dst(SCC), src(2, 1), op(0x0e)),
   ("s_bitcmp1_b64",    dst(SCC), src(2, 1), op(0x0f)),
   ("s_setvskip",       dst(), src(1, 1), op(0x10, gfx10=-1)),
   ("s_set_gpr_idx_on", dst(M0), src(1, 1, M0), op(gfx8=0x11, gfx10=-1)),
   ("s_cmp_eq_u64",     dst(SCC), src(2, 2), op(gfx8=0x12, gfx11=0x10)),
   ("s_cmp_lg_u64",     dst(SCC), src(2, 2), op(gfx8=0x13, gfx11=0x11)),
   ("s_cmp_lt_f32",     dst(SCC), src(1, 1), op(gfx11=0x41), InstrClass.SFPU),
   ("s_cmp_eq_f32",     dst(SCC), src(1, 1), op(gfx11=0x42), InstrClass.SFPU),
   ("s_cmp_le_f32",     dst(SCC), src(1, 1), op(gfx11=0x43), InstrClass.SFPU),
   ("s_cmp_gt_f32",     dst(SCC), src(1, 1), op(gfx11=0x44), InstrClass.SFPU),
   ("s_cmp_lg_f32",     dst(SCC), src(1, 1), op(gfx11=0x45), InstrClass.SFPU),
   ("s_cmp_ge_f32",     dst(SCC), src(1, 1), op(gfx11=0x46), InstrClass.SFPU),
   ("s_cmp_o_f32",      dst(SCC), src(1, 1), op(gfx11=0x47), InstrClass.SFPU),
   ("s_cmp_u_f32",      dst(SCC), src(1, 1), op(gfx11=0x48), InstrClass.SFPU),
   ("s_cmp_nge_f32",    dst(SCC), src(1, 1), op(gfx11=0x49), InstrClass.SFPU),
   ("s_cmp_nlg_f32",    dst(SCC), src(1, 1), op(gfx11=0x4a), InstrClass.SFPU),
   ("s_cmp_ngt_f32",    dst(SCC), src(1, 1), op(gfx11=0x4b), InstrClass.SFPU),
   ("s_cmp_nle_f32",    dst(SCC), src(1, 1), op(gfx11=0x4c), InstrClass.SFPU),
   ("s_cmp_neq_f32",    dst(SCC), src(1, 1), op(gfx11=0x4d), InstrClass.SFPU),
   ("s_cmp_nlt_f32",    dst(SCC), src(1, 1), op(gfx11=0x4e), InstrClass.SFPU),
   ("s_cmp_lt_f16",     dst(SCC), src(1, 1), op(gfx11=0x51), InstrClass.SFPU),
   ("s_cmp_eq_f16",     dst(SCC), src(1, 1), op(gfx11=0x52), InstrClass.SFPU),
   ("s_cmp_le_f16",     dst(SCC), src(1, 1), op(gfx11=0x53), InstrClass.SFPU),
   ("s_cmp_gt_f16",     dst(SCC), src(1, 1), op(gfx11=0x54), InstrClass.SFPU),
   ("s_cmp_lg_f16",     dst(SCC), src(1, 1), op(gfx11=0x55), InstrClass.SFPU),
   ("s_cmp_ge_f16",     dst(SCC), src(1, 1), op(gfx11=0x56), InstrClass.SFPU),
   ("s_cmp_o_f16",      dst(SCC), src(1, 1), op(gfx11=0x57), InstrClass.SFPU),
   ("s_cmp_u_f16",      dst(SCC), src(1, 1), op(gfx11=0x58), InstrClass.SFPU),
   ("s_cmp_nge_f16",    dst(SCC), src(1, 1), op(gfx11=0x59), InstrClass.SFPU),
   ("s_cmp_nlg_f16",    dst(SCC), src(1, 1), op(gfx11=0x5a), InstrClass.SFPU),
   ("s_cmp_ngt_f16",    dst(SCC), src(1, 1), op(gfx11=0x5b), InstrClass.SFPU),
   ("s_cmp_nle_f16",    dst(SCC), src(1, 1), op(gfx11=0x5c), InstrClass.SFPU),
   ("s_cmp_neq_f16",    dst(SCC), src(1, 1), op(gfx11=0x5d), InstrClass.SFPU),
   ("s_cmp_nlt_f16",    dst(SCC), src(1, 1), op(gfx11=0x5e), InstrClass.SFPU),
}
# Register every entry as a Format.SOPC instruction. Five fields are unpacked
# per entry, so default_class() presumably appends InstrClass.Salu to the
# 4-tuples above -- TODO confirm against default_class().
for (name, defs, ops, num, cls) in default_class(SOPC, InstrClass.Salu):
   insn(name, num, Format.SOPC, cls, definitions = defs, operands = ops)
720
721
# SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs
#
# Entry layout: (name, definitions, operands, opcode numbers[, InstrClass]).
# - dst(...) / src(...) describe the definitions and operands that are handed
#   to insn() below; most entries have neither.
# - op(...) lists the hardware opcode number per GPU generation; judging by the
#   entries, a positional value is the baseline, gfxN= keywords override it
#   from that generation on, and -1 marks a generation without the
#   instruction -- TODO confirm against op()'s definition.
# - The optional trailing InstrClass (Branch, Barrier, Waitcnt, Sendmsg, Other)
#   overrides the default class used by the registration loop.
SOPP = {
   ("s_nop",                      dst(), src(), op(0x00)),
   ("s_endpgm",                   dst(), src(), op(0x01, gfx11=0x30)),
   ("s_branch",                   dst(), src(), op(0x02, gfx11=0x20), InstrClass.Branch),
   ("s_wakeup",                   dst(), src(), op(gfx8=0x03, gfx11=0x34)),
   ("s_cbranch_scc0",             dst(), src(), op(0x04, gfx11=0x21), InstrClass.Branch),
   ("s_cbranch_scc1",             dst(), src(), op(0x05, gfx11=0x22), InstrClass.Branch),
   ("s_cbranch_vccz",             dst(), src(), op(0x06, gfx11=0x23), InstrClass.Branch),
   ("s_cbranch_vccnz",            dst(), src(), op(0x07, gfx11=0x24), InstrClass.Branch),
   ("s_cbranch_execz",            dst(), src(), op(0x08, gfx11=0x25), InstrClass.Branch),
   ("s_cbranch_execnz",           dst(), src(), op(0x09, gfx11=0x26), InstrClass.Branch),
   ("s_barrier",                  dst(), src(), op(0x0a, gfx11=0x3d), InstrClass.Barrier),
   ("s_setkill",                  dst(), src(), op(gfx7=0x0b, gfx11=0x01)),
   ("s_waitcnt",                  dst(), src(), op(0x0c, gfx11=0x09), InstrClass.Waitcnt),
   ("s_sethalt",                  dst(), src(), op(0x0d, gfx11=0x02)),
   ("s_sleep",                    dst(), src(), op(0x0e, gfx11=0x03)),
   ("s_setprio",                  dst(), src(), op(0x0f, gfx11=0x35)),
   ("s_sendmsg",                  dst(), src(), op(0x10, gfx11=0x36), InstrClass.Sendmsg),
   ("s_sendmsghalt",              dst(), src(), op(0x11, gfx11=0x37), InstrClass.Sendmsg),
   ("s_trap",                     dst(), src(), op(0x12, gfx11=0x10), InstrClass.Other),
   ("s_icache_inv",               dst(), src(), op(0x13, gfx11=0x3c)),
   ("s_incperflevel",             dst(), src(), op(0x14, gfx11=0x38)),
   ("s_decperflevel",             dst(), src(), op(0x15, gfx11=0x39)),
   ("s_ttracedata",               dst(), src(M0), op(0x16, gfx11=0x3a)),
   ("s_cbranch_cdbgsys",          dst(), src(), op(gfx7=0x17, gfx11=0x27), InstrClass.Branch),
   ("s_cbranch_cdbguser",         dst(), src(), op(gfx7=0x18, gfx11=0x28), InstrClass.Branch),
   ("s_cbranch_cdbgsys_or_user",  dst(), src(), op(gfx7=0x19, gfx11=0x29), InstrClass.Branch),
   ("s_cbranch_cdbgsys_and_user", dst(), src(), op(gfx7=0x1a, gfx11=0x2a), InstrClass.Branch),
   ("s_endpgm_saved",             dst(), src(), op(gfx8=0x1b, gfx11=0x31)),
   ("s_set_gpr_idx_off",          dst(), src(), op(gfx8=0x1c, gfx10=-1)),
   ("s_set_gpr_idx_mode",         dst(M0), src(M0), op(gfx8=0x1d, gfx10=-1)),
   ("s_endpgm_ordered_ps_done",   dst(), src(), op(gfx9=0x1e, gfx11=-1)),
   ("s_code_end",                 dst(), src(), op(gfx10=0x1f)),
   ("s_inst_prefetch",            dst(), src(), op(gfx10=0x20, gfx11=0x04)), #s_set_inst_prefetch_distance in GFX11
   ("s_clause",                   dst(), src(), op(gfx10=0x21, gfx11=0x05)),
   ("s_wait_idle",                dst(), src(), op(gfx10=0x22, gfx11=0x0a), InstrClass.Waitcnt),
   ("s_waitcnt_depctr",           dst(), src(), op(gfx10=0x23, gfx11=0x08), InstrClass.Waitcnt),
   ("s_round_mode",               dst(), src(), op(gfx10=0x24, gfx11=0x11)),
   ("s_denorm_mode",              dst(), src(), op(gfx10=0x25, gfx11=0x12)),
   ("s_ttracedata_imm",           dst(), src(), op(gfx10=0x26, gfx11=0x3b)),
   ("s_delay_alu",                dst(), src(), op(gfx11=0x07), InstrClass.Waitcnt),
   ("s_wait_event",               dst(), src(), op(gfx11=0x0b)),
   ("s_singleuse_vdst",           dst(), src(), op(gfx11=0x13), InstrClass.Waitcnt),
   ("s_barrier_wait",             dst(), src(), op(gfx12=0x14), InstrClass.Barrier),
   ("s_barrier_leave",            dst(), src(), op(gfx12=0x15), InstrClass.Barrier),
   ("s_wait_loadcnt",             dst(), src(), op(gfx12=0x40), InstrClass.Waitcnt),
   ("s_wait_storecnt",            dst(), src(), op(gfx12=0x41), InstrClass.Waitcnt),
   ("s_wait_samplecnt",           dst(), src(), op(gfx12=0x42), InstrClass.Waitcnt),
   ("s_wait_bvhcnt",              dst(), src(), op(gfx12=0x43), InstrClass.Waitcnt),
   ("s_wait_expcnt",              dst(), src(), op(gfx12=0x44), InstrClass.Waitcnt),
   ("s_wait_dscnt",               dst(), src(), op(gfx12=0x46), InstrClass.Waitcnt),
   ("s_wait_kmcnt",               dst(), src(), op(gfx12=0x47), InstrClass.Waitcnt),
   ("s_wait_loadcnt_dscnt",       dst(), src(), op(gfx12=0x48), InstrClass.Waitcnt),
   ("s_wait_storecnt_dscnt",      dst(), src(), op(gfx12=0x49), InstrClass.Waitcnt),
}
# Register every entry as a Format.SOPP instruction. Five fields are unpacked
# per entry, so default_class() presumably appends InstrClass.Salu to the
# 4-tuples above -- TODO confirm against default_class().
for (name, defs, ops, num, cls) in default_class(SOPP, InstrClass.Salu):
   insn(name, num, Format.SOPP, cls, definitions = defs, operands = ops)
780
781
# SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output
# Unlike GFX10, GFX10.3 does not have SMEM store, atomic or scratch instructions
#
# Entry layout: (name, opcode numbers). No definitions/operands are listed in
# this table; the registration loop below passes none to insn().
# op(...) lists the hardware opcode number per GPU generation; judging by the
# entries, a positional value is the baseline, gfxN= keywords override it from
# that generation on, and -1 marks a generation without the instruction --
# TODO confirm against op()'s definition.
SMEM = {
   ("s_load_dword",               op(0x00)), #s_load_b32 in GFX11
   ("s_load_dwordx2",             op(0x01)), #s_load_b64 in GFX11
   ("s_load_dwordx3",             op(gfx12=0x05)), #s_load_b96 in GFX12
   ("s_load_dwordx4",             op(0x02)), #s_load_b128 in GFX11
   ("s_load_dwordx8",             op(0x03)), #s_load_b256 in GFX11
   ("s_load_dwordx16",            op(0x04)), #s_load_b512 in GFX11
   ("s_load_sbyte",               op(gfx12=0x08)), #s_load_i8 in GFX12
   ("s_load_ubyte",               op(gfx12=0x09)), #s_load_u8 in GFX12
   ("s_load_sshort",              op(gfx12=0x0a)), #s_load_i16 in GFX12
   ("s_load_ushort",              op(gfx12=0x0b)), #s_load_u16 in GFX12
   ("s_scratch_load_dword",       op(gfx9=0x05, gfx11=-1)),
   ("s_scratch_load_dwordx2",     op(gfx9=0x06, gfx11=-1)),
   ("s_scratch_load_dwordx4",     op(gfx9=0x07, gfx11=-1)),
   ("s_buffer_load_dword",        op(0x08, gfx12=0x10)), #s_buffer_load_b32 in GFX11
   ("s_buffer_load_dwordx2",      op(0x09, gfx12=0x11)), #s_buffer_load_b64 in GFX11
   ("s_buffer_load_dwordx3",      op(gfx12=0x15)), #s_buffer_load_b96 in GFX12
   ("s_buffer_load_dwordx4",      op(0x0a, gfx12=0x12)), #s_buffer_load_b128 in GFX11
   ("s_buffer_load_dwordx8",      op(0x0b, gfx12=0x13)), #s_buffer_load_b256 in GFX11
   ("s_buffer_load_dwordx16",     op(0x0c, gfx12=0x14)), #s_buffer_load_b512 in GFX11
   ("s_buffer_load_sbyte",        op(gfx12=0x18)), #s_buffer_load_i8 in GFX12
   ("s_buffer_load_ubyte",        op(gfx12=0x19)), #s_buffer_load_u8 in GFX12
   ("s_buffer_load_sshort",       op(gfx12=0x1a)), #s_buffer_load_i16 in GFX12
   ("s_buffer_load_ushort",       op(gfx12=0x1b)), #s_buffer_load_u16 in GFX12
   ("s_store_dword",              op(gfx8=0x10, gfx11=-1)),
   ("s_store_dwordx2",            op(gfx8=0x11, gfx11=-1)),
   ("s_store_dwordx4",            op(gfx8=0x12, gfx11=-1)),
   ("s_scratch_store_dword",      op(gfx9=0x15, gfx11=-1)),
   ("s_scratch_store_dwordx2",    op(gfx9=0x16, gfx11=-1)),
   ("s_scratch_store_dwordx4",    op(gfx9=0x17, gfx11=-1)),
   ("s_buffer_store_dword",       op(gfx8=0x18, gfx11=-1)),
   ("s_buffer_store_dwordx2",     op(gfx8=0x19, gfx11=-1)),
   ("s_buffer_store_dwordx4",     op(gfx8=0x1a, gfx11=-1)),
   # NOTE(review): the GL1 cache looks like a GFX10+ feature, so starting this
   # opcode at gfx8 may be unintended -- confirm against the ISA documents.
   ("s_gl1_inv",                  op(gfx8=0x1f, gfx11=0x20, gfx12=-1)),
   ("s_dcache_inv",               op(0x1f, gfx8=0x20, gfx11=0x21)),
   ("s_dcache_wb",                op(gfx8=0x21, gfx11=-1)),
   ("s_dcache_inv_vol",           op(gfx7=0x1d, gfx8=0x22, gfx10=-1)),
   ("s_dcache_wb_vol",            op(gfx8=0x23, gfx10=-1)),
   ("s_memtime",                  op(0x1e, gfx8=0x24, gfx11=-1)), #GFX6-GFX10
   ("s_memrealtime",              op(gfx8=0x25, gfx11=-1)),
   ("s_atc_probe",                op(gfx8=0x26, gfx11=0x22)),
   ("s_atc_probe_buffer",         op(gfx8=0x27, gfx11=0x23)),
   ("s_dcache_discard",           op(gfx9=0x28, gfx11=-1)),
   ("s_dcache_discard_x2",        op(gfx9=0x29, gfx11=-1)),
   ("s_get_waveid_in_workgroup",  op(gfx10=0x2a, gfx11=-1)),
   ("s_buffer_atomic_swap",       op(gfx9=0x40, gfx11=-1)),
   ("s_buffer_atomic_cmpswap",    op(gfx9=0x41, gfx11=-1)),
   ("s_buffer_atomic_add",        op(gfx9=0x42, gfx11=-1)),
   ("s_buffer_atomic_sub",        op(gfx9=0x43, gfx11=-1)),
   ("s_buffer_atomic_smin",       op(gfx9=0x44, gfx11=-1)),
   ("s_buffer_atomic_umin",       op(gfx9=0x45, gfx11=-1)),
   ("s_buffer_atomic_smax",       op(gfx9=0x46, gfx11=-1)),
   ("s_buffer_atomic_umax",       op(gfx9=0x47, gfx11=-1)),
   ("s_buffer_atomic_and",        op(gfx9=0x48, gfx11=-1)),
   ("s_buffer_atomic_or",         op(gfx9=0x49, gfx11=-1)),
   ("s_buffer_atomic_xor",        op(gfx9=0x4a, gfx11=-1)),
   ("s_buffer_atomic_inc",        op(gfx9=0x4b, gfx11=-1)),
   ("s_buffer_atomic_dec",        op(gfx9=0x4c, gfx11=-1)),
   ("s_buffer_atomic_swap_x2",    op(gfx9=0x60, gfx11=-1)),
   ("s_buffer_atomic_cmpswap_x2", op(gfx9=0x61, gfx11=-1)),
   ("s_buffer_atomic_add_x2",     op(gfx9=0x62, gfx11=-1)),
   ("s_buffer_atomic_sub_x2",     op(gfx9=0x63, gfx11=-1)),
   ("s_buffer_atomic_smin_x2",    op(gfx9=0x64, gfx11=-1)),
   ("s_buffer_atomic_umin_x2",    op(gfx9=0x65, gfx11=-1)),
   ("s_buffer_atomic_smax_x2",    op(gfx9=0x66, gfx11=-1)),
   ("s_buffer_atomic_umax_x2",    op(gfx9=0x67, gfx11=-1)),
   ("s_buffer_atomic_and_x2",     op(gfx9=0x68, gfx11=-1)),
   ("s_buffer_atomic_or_x2",      op(gfx9=0x69, gfx11=-1)),
   ("s_buffer_atomic_xor_x2",     op(gfx9=0x6a, gfx11=-1)),
   ("s_buffer_atomic_inc_x2",     op(gfx9=0x6b, gfx11=-1)),
   ("s_buffer_atomic_dec_x2",     op(gfx9=0x6c, gfx11=-1)),
   ("s_atomic_swap",              op(gfx9=0x80, gfx11=-1)),
   ("s_atomic_cmpswap",           op(gfx9=0x81, gfx11=-1)),
   ("s_atomic_add",               op(gfx9=0x82, gfx11=-1)),
   ("s_atomic_sub",               op(gfx9=0x83, gfx11=-1)),
   ("s_atomic_smin",              op(gfx9=0x84, gfx11=-1)),
   ("s_atomic_umin",              op(gfx9=0x85, gfx11=-1)),
   ("s_atomic_smax",              op(gfx9=0x86, gfx11=-1)),
   ("s_atomic_umax",              op(gfx9=0x87, gfx11=-1)),
   ("s_atomic_and",               op(gfx9=0x88, gfx11=-1)),
   ("s_atomic_or",                op(gfx9=0x89, gfx11=-1)),
   ("s_atomic_xor",               op(gfx9=0x8a, gfx11=-1)),
   ("s_atomic_inc",               op(gfx9=0x8b, gfx11=-1)),
   ("s_atomic_dec",               op(gfx9=0x8c, gfx11=-1)),
   ("s_atomic_swap_x2",           op(gfx9=0xa0, gfx11=-1)),
   ("s_atomic_cmpswap_x2",        op(gfx9=0xa1, gfx11=-1)),
   ("s_atomic_add_x2",            op(gfx9=0xa2, gfx11=-1)),
   ("s_atomic_sub_x2",            op(gfx9=0xa3, gfx11=-1)),
   ("s_atomic_smin_x2",           op(gfx9=0xa4, gfx11=-1)),
   ("s_atomic_umin_x2",           op(gfx9=0xa5, gfx11=-1)),
   ("s_atomic_smax_x2",           op(gfx9=0xa6, gfx11=-1)),
   ("s_atomic_umax_x2",           op(gfx9=0xa7, gfx11=-1)),
   ("s_atomic_and_x2",            op(gfx9=0xa8, gfx11=-1)),
   ("s_atomic_or_x2",             op(gfx9=0xa9, gfx11=-1)),
   ("s_atomic_xor_x2",            op(gfx9=0xaa, gfx11=-1)),
   ("s_atomic_inc_x2",            op(gfx9=0xab, gfx11=-1)),
   ("s_atomic_dec_x2",            op(gfx9=0xac, gfx11=-1)),
   ("s_prefetch_inst",            op(gfx12=0x24)),
   ("s_prefetch_inst_pc_rel",     op(gfx12=0x25)),
   ("s_prefetch_data",            op(gfx12=0x26)),
   ("s_buffer_prefetch_data",     op(gfx12=0x27)),
   ("s_prefetch_data_pc_rel",     op(gfx12=0x28)),
}
# Register every entry as a Format.SMEM instruction of class InstrClass.SMem.
# Atomicity is derived purely from the mnemonic: any name containing the
# substring "atomic" is registered with is_atomic=True.
for (name, num) in SMEM:
   insn(name, num, Format.SMEM, InstrClass.SMem, is_atomic = "atomic" in name)
889
890
# VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
# TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8
#
# Entry layout:
#   (name, in_mod, out_mod, definitions, operands, opcode numbers[, InstrClass])
# - in_mod / out_mod are booleans forwarded positionally to insn(); presumably
#   they state whether input / output modifiers are supported -- TODO confirm
#   against insn()'s signature.
# - dst(...) / src(...) describe the definitions and operands handed to insn().
# - op(...) lists the hardware opcode number per GPU generation; judging by the
#   entries, a positional value is the baseline, gfxN= keywords override it
#   from that generation on, and -1 marks a generation without the
#   instruction -- TODO confirm against op()'s definition.
# - The optional trailing InstrClass overrides the default class used by the
#   registration loop (here only the GFX12 64-bit entries).
VOP2 = {
   ("v_cndmask_b32",       True, False, dst(1), src(1, 1, VCC), op(0x00, gfx10=0x01)),
   ("v_readlane_b32",      False, False, dst(1), src(1, 1), op(0x01, gfx8=-1)),
   ("v_writelane_b32",     False, False, dst(1), src(1, 1, 1), op(0x02, gfx8=-1)),
   ("v_add_f32",           True, True, dst(1), src(1, 1), op(0x03, gfx8=0x01, gfx10=0x03)),
   ("v_sub_f32",           True, True, dst(1), src(1, 1), op(0x04, gfx8=0x02, gfx10=0x04)),
   ("v_subrev_f32",        True, True, dst(1), src(1, 1), op(0x05, gfx8=0x03, gfx10=0x05)),
   ("v_mac_legacy_f32",    True, True, dst(1), src(1, 1, 1), op(0x06, gfx8=-1, gfx10=0x06, gfx11=-1)), #GFX6,7,10
   ("v_fmac_legacy_f32",   True, True, dst(1), src(1, 1, 1), op(gfx10=0x06, gfx12=-1)), #GFX10.3+, v_fmac_dx9_zero_f32 in GFX11
   ("v_mul_legacy_f32",    True, True, dst(1), src(1, 1), op(0x07, gfx8=0x04, gfx10=0x07)), #v_mul_dx9_zero_f32 in GFX11
   ("v_mul_f32",           True, True, dst(1), src(1, 1), op(0x08, gfx8=0x05, gfx10=0x08)),
   ("v_mul_i32_i24",       False, False, dst(1), src(1, 1), op(0x09, gfx8=0x06, gfx10=0x09)),
   ("v_mul_hi_i32_i24",    False, False, dst(1), src(1, 1), op(0x0a, gfx8=0x07, gfx10=0x0a)),
   ("v_mul_u32_u24",       False, False, dst(1), src(1, 1), op(0x0b, gfx8=0x08, gfx10=0x0b)),
   ("v_mul_hi_u32_u24",    False, False, dst(1), src(1, 1), op(0x0c, gfx8=0x09, gfx10=0x0c)),
   ("v_dot4c_i32_i8",      False, False, dst(1), src(1, 1, 1), op(gfx9=0x39, gfx10=0x0d, gfx11=-1)),
   ("v_min_legacy_f32",    True, True, dst(1), src(1, 1), op(0x0d, gfx8=-1)),
   ("v_max_legacy_f32",    True, True, dst(1), src(1, 1), op(0x0e, gfx8=-1)),
   ("v_min_f32",           True, True, dst(1), src(1, 1), op(0x0f, gfx8=0x0a, gfx10=0x0f, gfx12=0x15)), #called v_min_num_f32 in GFX12
   ("v_max_f32",           True, True, dst(1), src(1, 1), op(0x10, gfx8=0x0b, gfx10=0x10, gfx12=0x16)), #called v_max_num_f32 in GFX12
   ("v_min_i32",           False, False, dst(1), src(1, 1), op(0x11, gfx8=0x0c, gfx10=0x11)),
   ("v_max_i32",           False, False, dst(1), src(1, 1), op(0x12, gfx8=0x0d, gfx10=0x12)),
   ("v_min_u32",           False, False, dst(1), src(1, 1), op(0x13, gfx8=0x0e, gfx10=0x13)),
   ("v_max_u32",           False, False, dst(1), src(1, 1), op(0x14, gfx8=0x0f, gfx10=0x14)),
   ("v_lshr_b32",          False, False, dst(1), src(1, 1), op(0x15, gfx8=-1)),
   ("v_lshrrev_b32",       False, False, dst(1), src(1, 1), op(0x16, gfx8=0x10, gfx10=0x16, gfx11=0x19)),
   ("v_ashr_i32",          False, False, dst(1), src(1, 1), op(0x17, gfx8=-1)),
   ("v_ashrrev_i32",       False, False, dst(1), src(1, 1), op(0x18, gfx8=0x11, gfx10=0x18, gfx11=0x1a)),
   ("v_lshl_b32",          False, False, dst(1), src(1, 1), op(0x19, gfx8=-1)),
   ("v_lshlrev_b32",       False, False, dst(1), src(1, 1), op(0x1a, gfx8=0x12, gfx10=0x1a, gfx11=0x18)),
   ("v_and_b32",           False, False, dst(1), src(1, 1), op(0x1b, gfx8=0x13, gfx10=0x1b)),
   ("v_or_b32",            False, False, dst(1), src(1, 1), op(0x1c, gfx8=0x14, gfx10=0x1c)),
   ("v_xor_b32",           False, False, dst(1), src(1, 1), op(0x1d, gfx8=0x15, gfx10=0x1d)),
   ("v_xnor_b32",          False, False, dst(1), src(1, 1), op(gfx10=0x1e)),
   ("v_mac_f32",           True, True, dst(1), src(1, 1, 1), op(0x1f, gfx8=0x16, gfx10=0x1f, gfx11=-1)),
   ("v_madmk_f32",         False, False, dst(1), src(1, 1, 1), op(0x20, gfx8=0x17, gfx10=0x20, gfx11=-1)),
   ("v_madak_f32",         False, False, dst(1), src(1, 1, 1), op(0x21, gfx8=0x18, gfx10=0x21, gfx11=-1)),
   ("v_mbcnt_hi_u32_b32",  False, False, dst(1), src(1, 1), op(0x24, gfx8=-1)),
   ("v_add_co_u32",        False, False, dst(1, VCC), src(1, 1), op(0x25, gfx8=0x19, gfx10=-1)), # VOP3B only in RDNA
   ("v_sub_co_u32",        False, False, dst(1, VCC), src(1, 1), op(0x26, gfx8=0x1a, gfx10=-1)), # VOP3B only in RDNA
   ("v_subrev_co_u32",     False, False, dst(1, VCC), src(1, 1), op(0x27, gfx8=0x1b, gfx10=-1)), # VOP3B only in RDNA
   ("v_addc_co_u32",       False, False, dst(1, VCC), src(1, 1, VCC), op(0x28, gfx8=0x1c, gfx10=0x28, gfx11=0x20)), # v_add_co_ci_u32 in RDNA
   ("v_subb_co_u32",       False, False, dst(1, VCC), src(1, 1, VCC), op(0x29, gfx8=0x1d, gfx10=0x29, gfx11=0x21)), # v_sub_co_ci_u32 in RDNA
   ("v_subbrev_co_u32",    False, False, dst(1, VCC), src(1, 1, VCC), op(0x2a, gfx8=0x1e, gfx10=0x2a, gfx11=0x22)), # v_subrev_co_ci_u32 in RDNA
   ("v_fmac_f32",          True, True, dst(1), src(1, 1, 1), op(gfx10=0x2b)),
   ("v_fmamk_f32",         False, False, dst(1), src(1, 1, 1), op(gfx10=0x2c)),
   ("v_fmaak_f32",         False, False, dst(1), src(1, 1, 1), op(gfx10=0x2d)),
   ("v_cvt_pkrtz_f16_f32", True, False, dst(1), src(1, 1), op(0x2f, gfx8=-1, gfx10=0x2f)), #v_cvt_pk_rtz_f16_f32 in GFX11
   ("v_add_f16",           True, True, dst(1), src(1, 1), op(gfx8=0x1f, gfx10=0x32)),
   ("v_sub_f16",           True, True, dst(1), src(1, 1), op(gfx8=0x20, gfx10=0x33)),
   ("v_subrev_f16",        True, True, dst(1), src(1, 1), op(gfx8=0x21, gfx10=0x34)),
   ("v_mul_f16",           True, True, dst(1), src(1, 1), op(gfx8=0x22, gfx10=0x35)),
   ("v_mac_f16",           True, True, dst(1), src(1, 1, 1), op(gfx8=0x23, gfx10=-1)),
   ("v_madmk_f16",         False, False, dst(1), src(1, 1, 1), op(gfx8=0x24, gfx10=-1)),
   ("v_madak_f16",         False, False, dst(1), src(1, 1, 1), op(gfx8=0x25, gfx10=-1)),
   ("v_add_u16",           False, False, dst(1), src(1, 1), op(gfx8=0x26, gfx10=-1)),
   ("v_sub_u16",           False, False, dst(1), src(1, 1), op(gfx8=0x27, gfx10=-1)),
   ("v_subrev_u16",        False, False, dst(1), src(1, 1), op(gfx8=0x28, gfx10=-1)),
   ("v_mul_lo_u16",        False, False, dst(1), src(1, 1), op(gfx8=0x29, gfx10=-1)),
   ("v_lshlrev_b16",       False, False, dst(1), src(1, 1), op(gfx8=0x2a, gfx10=-1)),
   ("v_lshrrev_b16",       False, False, dst(1), src(1, 1), op(gfx8=0x2b, gfx10=-1)),
   ("v_ashrrev_i16",       False, False, dst(1), src(1, 1), op(gfx8=0x2c, gfx10=-1)),
   ("v_max_f16",           True, True, dst(1), src(1, 1), op(gfx8=0x2d, gfx10=0x39, gfx12=0x31)), #called v_max_num_f16 in GFX12
   ("v_min_f16",           True, True, dst(1), src(1, 1), op(gfx8=0x2e, gfx10=0x3a, gfx12=0x30)), #called v_min_num_f16 in GFX12
   ("v_max_u16",           False, False, dst(1), src(1, 1), op(gfx8=0x2f, gfx10=-1)),
   ("v_max_i16",           False, False, dst(1), src(1, 1), op(gfx8=0x30, gfx10=-1)),
   ("v_min_u16",           False, False, dst(1), src(1, 1), op(gfx8=0x31, gfx10=-1)),
   ("v_min_i16",           False, False, dst(1), src(1, 1), op(gfx8=0x32, gfx10=-1)),
   ("v_ldexp_f16",         False, True, dst(1), src(1, 1), op(gfx8=0x33, gfx10=0x3b)),
   ("v_add_u32",           False, False, dst(1), src(1, 1), op(gfx9=0x34, gfx10=0x25)), # called v_add_nc_u32 in RDNA
   ("v_sub_u32",           False, False, dst(1), src(1, 1), op(gfx9=0x35, gfx10=0x26)), # called v_sub_nc_u32 in RDNA
   ("v_subrev_u32",        False, False, dst(1), src(1, 1), op(gfx9=0x36, gfx10=0x27)), # called v_subrev_nc_u32 in RDNA
   ("v_fmac_f16",          True, True, dst(1), src(1, 1, 1), op(gfx10=0x36)),
   ("v_fmamk_f16",         False, False, dst(1), src(1, 1, 1), op(gfx10=0x37)),
   ("v_fmaak_f16",         False, False, dst(1), src(1, 1, 1), op(gfx10=0x38)),
   ("v_pk_fmac_f16",       False, False, dst(1), src(1, 1, 1), op(gfx10=0x3c)),
   ("v_dot2c_f32_f16",     False, False, dst(1), src(1, 1, 1), op(gfx9=0x37, gfx10=0x02, gfx12=-1)), #v_dot2acc_f32_f16 in GFX11
   ("v_add_f64",           True, True, dst(2), src(2, 2), op(gfx12=0x02), InstrClass.ValuDoubleAdd),
   ("v_mul_f64",           True, True, dst(2), src(2, 2), op(gfx12=0x06), InstrClass.ValuDoubleAdd),
   ("v_lshlrev_b64",       False, False, dst(2), src(1, 2), op(gfx12=0x1f), InstrClass.Valu64),
   ("v_min_f64",           True, True, dst(2), src(2, 2), op(gfx12=0x0d), InstrClass.ValuDoubleAdd),
   ("v_max_f64",           True, True, dst(2), src(2, 2), op(gfx12=0x0e), InstrClass.ValuDoubleAdd),
}
# Register every entry as a Format.VOP2 instruction. Seven fields are unpacked
# per entry, so default_class() presumably appends InstrClass.Valu32 to the
# 6-tuples above -- TODO confirm against default_class().
for (name, in_mod, out_mod, defs, ops, num, cls) in default_class(VOP2, InstrClass.Valu32):
   insn(name, num, Format.VOP2, cls, in_mod, out_mod, definitions = defs, operands = ops)
978
979
# VOP1 instructions: instructions with 1 input and 1 output
#
# Each entry is a tuple of
#    (name, in_mod, out_mod, definitions, operands, opcode[, instruction class])
# in the order unpacked by the registration loop below. Entries that omit the
# trailing InstrClass rely on default_class() to supply InstrClass.Valu32.
# NOTE(review): in_mod/out_mod presumably flag input/output modifier support,
# and the dst()/src() arguments presumably are register sizes in dwords;
# confirm against insn(), dst() and src().
# NOTE(review): -1 in an op() field appears to mean "no encoding on that
# generation" (p_* pseudo-opcodes use op(-1)); confirm against op().
VOP1 = {
   ("v_nop",                      False, False, dst(), src(), op(0x00)),
   ("v_mov_b32",                  False, False, dst(1), src(1), op(0x01)),
   ("v_readfirstlane_b32",        False, False, dst(1), src(1), op(0x02)),
   ("v_cvt_i32_f64",              True, False, dst(1), src(2), op(0x03), InstrClass.ValuDoubleConvert),
   ("v_cvt_f64_i32",              False, True, dst(2), src(1), op(0x04), InstrClass.ValuDoubleConvert),
   ("v_cvt_f32_i32",              False, True, dst(1), src(1), op(0x05)),
   ("v_cvt_f32_u32",              False, True, dst(1), src(1), op(0x06)),
   ("v_cvt_u32_f32",              True, False, dst(1), src(1), op(0x07)),
   ("v_cvt_i32_f32",              True, False, dst(1), src(1), op(0x08)),
   ("v_cvt_f16_f32",              True, True, dst(1), src(1), op(0x0a)),
   ("p_v_cvt_f16_f32_rtne",       True, True, dst(1), src(1), op(-1)),
   ("v_cvt_f32_f16",              True, True, dst(1), src(1), op(0x0b)),
   ("v_cvt_rpi_i32_f32",          True, False, dst(1), src(1), op(0x0c)), # v_cvt_nearest_i32_f32 in GFX11
   ("v_cvt_flr_i32_f32",          True, False, dst(1), src(1), op(0x0d)), # v_cvt_floor_i32_f32 in GFX11
   ("v_cvt_off_f32_i4",           False, True, dst(1), src(1), op(0x0e)),
   ("v_cvt_f32_f64",              True, True, dst(1), src(2), op(0x0f), InstrClass.ValuDoubleConvert),
   ("v_cvt_f64_f32",              True, True, dst(2), src(1), op(0x10), InstrClass.ValuDoubleConvert),
   ("v_cvt_f32_ubyte0",           False, True, dst(1), src(1), op(0x11)),
   ("v_cvt_f32_ubyte1",           False, True, dst(1), src(1), op(0x12)),
   ("v_cvt_f32_ubyte2",           False, True, dst(1), src(1), op(0x13)),
   ("v_cvt_f32_ubyte3",           False, True, dst(1), src(1), op(0x14)),
   ("v_cvt_u32_f64",              True, False, dst(1), src(2), op(0x15), InstrClass.ValuDoubleConvert),
   ("v_cvt_f64_u32",              False, True, dst(2), src(1), op(0x16), InstrClass.ValuDoubleConvert),
   ("v_trunc_f64",                True, True, dst(2), src(2), op(gfx7=0x17), InstrClass.ValuDouble),
   ("v_ceil_f64",                 True, True, dst(2), src(2), op(gfx7=0x18), InstrClass.ValuDouble),
   ("v_rndne_f64",                True, True, dst(2), src(2), op(gfx7=0x19), InstrClass.ValuDouble),
   ("v_floor_f64",                True, True, dst(2), src(2), op(gfx7=0x1a), InstrClass.ValuDouble),
   ("v_pipeflush",                False, False, dst(), src(), op(gfx10=0x1b)),
   ("v_fract_f32",                True, True, dst(1), src(1), op(0x20, gfx8=0x1b, gfx10=0x20)),
   ("v_trunc_f32",                True, True, dst(1), src(1), op(0x21, gfx8=0x1c, gfx10=0x21)),
   ("v_ceil_f32",                 True, True, dst(1), src(1), op(0x22, gfx8=0x1d, gfx10=0x22)),
   ("v_rndne_f32",                True, True, dst(1), src(1), op(0x23, gfx8=0x1e, gfx10=0x23)),
   ("v_floor_f32",                True, True, dst(1), src(1), op(0x24, gfx8=0x1f, gfx10=0x24)),
   ("v_exp_f32",                  True, True, dst(1), src(1), op(0x25, gfx8=0x20, gfx10=0x25), InstrClass.ValuTranscendental32),
   ("v_log_clamp_f32",            True, True, dst(1), src(1), op(0x26, gfx8=-1), InstrClass.ValuTranscendental32),
   ("v_log_f32",                  True, True, dst(1), src(1), op(0x27, gfx8=0x21, gfx10=0x27), InstrClass.ValuTranscendental32),
   ("v_rcp_clamp_f32",            True, True, dst(1), src(1), op(0x28, gfx8=-1), InstrClass.ValuTranscendental32),
   ("v_rcp_legacy_f32",           True, True, dst(1), src(1), op(0x29, gfx8=-1), InstrClass.ValuTranscendental32),
   ("v_rcp_f32",                  True, True, dst(1), src(1), op(0x2a, gfx8=0x22, gfx10=0x2a), InstrClass.ValuTranscendental32),
   ("v_rcp_iflag_f32",            True, True, dst(1), src(1), op(0x2b, gfx8=0x23, gfx10=0x2b), InstrClass.ValuTranscendental32),
   ("v_rsq_clamp_f32",            True, True, dst(1), src(1), op(0x2c, gfx8=-1), InstrClass.ValuTranscendental32),
   ("v_rsq_legacy_f32",           True, True, dst(1), src(1), op(0x2d, gfx8=-1), InstrClass.ValuTranscendental32),
   ("v_rsq_f32",                  True, True, dst(1), src(1), op(0x2e, gfx8=0x24, gfx10=0x2e), InstrClass.ValuTranscendental32),
   ("v_rcp_f64",                  True, True, dst(2), src(2), op(0x2f, gfx8=0x25, gfx10=0x2f), InstrClass.ValuDoubleTranscendental),
   ("v_rcp_clamp_f64",            True, True, dst(2), src(2), op(0x30, gfx8=-1), InstrClass.ValuDoubleTranscendental),
   ("v_rsq_f64",                  True, True, dst(2), src(2), op(0x31, gfx8=0x26, gfx10=0x31), InstrClass.ValuDoubleTranscendental),
   ("v_rsq_clamp_f64",            True, True, dst(2), src(2), op(0x32, gfx8=-1), InstrClass.ValuDoubleTranscendental),
   ("v_sqrt_f32",                 True, True, dst(1), src(1), op(0x33, gfx8=0x27, gfx10=0x33), InstrClass.ValuTranscendental32),
   ("v_sqrt_f64",                 True, True, dst(2), src(2), op(0x34, gfx8=0x28, gfx10=0x34), InstrClass.ValuDoubleTranscendental),
   ("v_sin_f32",                  True, True, dst(1), src(1), op(0x35, gfx8=0x29, gfx10=0x35), InstrClass.ValuTranscendental32),
   ("v_cos_f32",                  True, True, dst(1), src(1), op(0x36, gfx8=0x2a, gfx10=0x36), InstrClass.ValuTranscendental32),
   ("v_not_b32",                  False, False, dst(1), src(1), op(0x37, gfx8=0x2b, gfx10=0x37)),
   ("v_bfrev_b32",                False, False, dst(1), src(1), op(0x38, gfx8=0x2c, gfx10=0x38)),
   ("v_ffbh_u32",                 False, False, dst(1), src(1), op(0x39, gfx8=0x2d, gfx10=0x39)), # v_clz_i32_u32 in GFX11
   ("v_ffbl_b32",                 False, False, dst(1), src(1), op(0x3a, gfx8=0x2e, gfx10=0x3a)), # v_ctz_i32_b32 in GFX11
   ("v_ffbh_i32",                 False, False, dst(1), src(1), op(0x3b, gfx8=0x2f, gfx10=0x3b)), # v_cls_i32 in GFX11
   ("v_frexp_exp_i32_f64",        True, False, dst(1), src(2), op(0x3c, gfx8=0x30, gfx10=0x3c), InstrClass.ValuDouble),
   ("v_frexp_mant_f64",           True, False, dst(2), src(2), op(0x3d, gfx8=0x31, gfx10=0x3d), InstrClass.ValuDouble),
   ("v_fract_f64",                True, True, dst(2), src(2), op(0x3e, gfx8=0x32, gfx10=0x3e), InstrClass.ValuDouble),
   ("v_frexp_exp_i32_f32",        True, False, dst(1), src(1), op(0x3f, gfx8=0x33, gfx10=0x3f)),
   ("v_frexp_mant_f32",           True, False, dst(1), src(1), op(0x40, gfx8=0x34, gfx10=0x40)),
   ("v_clrexcp",                  False, False, dst(), src(), op(0x41, gfx8=0x35, gfx10=0x41, gfx11=-1)),
   ("v_movreld_b32",              False, False, dst(1), src(1, M0), op(0x42, gfx8=0x36, gfx9=-1, gfx10=0x42)),
   ("v_movrels_b32",              False, False, dst(1), src(1, M0), op(0x43, gfx8=0x37, gfx9=-1, gfx10=0x43)),
   ("v_movrelsd_b32",             False, False, dst(1), src(1, M0), op(0x44, gfx8=0x38, gfx9=-1, gfx10=0x44)),
   ("v_movrelsd_2_b32",           False, False, dst(1), src(1, M0), op(gfx10=0x48)),
   ("v_screen_partition_4se_b32", False, False, dst(1), src(1), op(gfx9=0x37, gfx10=-1)),
   ("v_cvt_f16_u16",              False, True, dst(1), src(1), op(gfx8=0x39, gfx10=0x50)),
   ("v_cvt_f16_i16",              False, True, dst(1), src(1), op(gfx8=0x3a, gfx10=0x51)),
   ("v_cvt_u16_f16",              True, False, dst(1), src(1), op(gfx8=0x3b, gfx10=0x52)),
   ("v_cvt_i16_f16",              True, False, dst(1), src(1), op(gfx8=0x3c, gfx10=0x53)),
   ("v_rcp_f16",                  True, True, dst(1), src(1), op(gfx8=0x3d, gfx10=0x54), InstrClass.ValuTranscendental32),
   ("v_sqrt_f16",                 True, True, dst(1), src(1), op(gfx8=0x3e, gfx10=0x55), InstrClass.ValuTranscendental32),
   ("v_rsq_f16",                  True, True, dst(1), src(1), op(gfx8=0x3f, gfx10=0x56), InstrClass.ValuTranscendental32),
   ("v_log_f16",                  True, True, dst(1), src(1), op(gfx8=0x40, gfx10=0x57), InstrClass.ValuTranscendental32),
   ("v_exp_f16",                  True, True, dst(1), src(1), op(gfx8=0x41, gfx10=0x58), InstrClass.ValuTranscendental32),
   ("v_frexp_mant_f16",           True, False, dst(1), src(1), op(gfx8=0x42, gfx10=0x59)),
   ("v_frexp_exp_i16_f16",        True, False, dst(1), src(1), op(gfx8=0x43, gfx10=0x5a)),
   ("v_floor_f16",                True, True, dst(1), src(1), op(gfx8=0x44, gfx10=0x5b)),
   ("v_ceil_f16",                 True, True, dst(1), src(1), op(gfx8=0x45, gfx10=0x5c)),
   ("v_trunc_f16",                True, True, dst(1), src(1), op(gfx8=0x46, gfx10=0x5d)),
   ("v_rndne_f16",                True, True, dst(1), src(1), op(gfx8=0x47, gfx10=0x5e)),
   ("v_fract_f16",                True, True, dst(1), src(1), op(gfx8=0x48, gfx10=0x5f)),
   ("v_sin_f16",                  True, True, dst(1), src(1), op(gfx8=0x49, gfx10=0x60), InstrClass.ValuTranscendental32),
   ("v_cos_f16",                  True, True, dst(1), src(1), op(gfx8=0x4a, gfx10=0x61), InstrClass.ValuTranscendental32),
   ("v_exp_legacy_f32",           True, True, dst(1), src(1), op(gfx7=0x46, gfx8=0x4b, gfx10=-1), InstrClass.ValuTranscendental32),
   ("v_log_legacy_f32",           True, True, dst(1), src(1), op(gfx7=0x45, gfx8=0x4c, gfx10=-1), InstrClass.ValuTranscendental32),
   ("v_sat_pk_u8_i16",            False, False, dst(1), src(1), op(gfx9=0x4f, gfx10=0x62)),
   ("v_cvt_norm_i16_f16",         True, False, dst(1), src(1), op(gfx9=0x4d, gfx10=0x63)),
   ("v_cvt_norm_u16_f16",         True, False, dst(1), src(1), op(gfx9=0x4e, gfx10=0x64)),
   ("v_swap_b32",                 False, False, dst(1, 1), src(1, 1), op(gfx9=0x51, gfx10=0x65)),
   ("v_swaprel_b32",              False, False, dst(1, 1), src(1, 1, M0), op(gfx10=0x68)),
   ("v_permlane64_b32",           False, False, dst(1), src(1), op(gfx11=0x67)), # cannot use VOP3
   ("v_not_b16",                  False, False, dst(1), src(1), op(gfx11=0x69)),
   ("v_cvt_i32_i16",              False, False, dst(1), src(1), op(gfx11=0x6a)),
   ("v_cvt_u32_u16",              False, False, dst(1), src(1), op(gfx11=0x6b)),
   ("v_mov_b16",                  True, False, dst(1), src(1), op(gfx11=0x1c)),
   ("v_swap_b16",                 False, False, dst(1, 1), src(1, 1), op(gfx11=0x66)),
   ("v_cvt_f32_fp8",              False, False, dst(1), src(1), op(gfx12=0x6c)),
   ("v_cvt_f32_bf8",              False, False, dst(1), src(1), op(gfx12=0x6d)),
   ("v_cvt_pk_f32_fp8",           False, False, dst(2), src(1), op(gfx12=0x6e)),
   ("v_cvt_pk_f32_bf8",           False, False, dst(2), src(1), op(gfx12=0x6f)),
}
# Register every VOP1 entry under Format.VOP1.
for (name, in_mod, out_mod, defs, ops, num, cls) in default_class(VOP1, InstrClass.Valu32):
   insn(name, num, Format.VOP1, cls, in_mod, out_mod, definitions = defs, operands = ops)
1087
1088
# VOPC instructions:

# Floating-point class-test compares, listed by hand; the remaining VOPC
# compares are generated programmatically further below.
# Each entry is (name, definitions, operands, opcode[, instruction class]).
# The v_cmp_* forms define VCC and the v_cmpx_* forms define EXEC.
VOPC_CLASS = {
   ("v_cmp_class_f32",  dst(VCC), src(1, 1), op(0x88, gfx8=0x10, gfx10=0x88, gfx11=0x7e)),
   ("v_cmp_class_f16",  dst(VCC), src(1, 1), op(gfx8=0x14, gfx10=0x8f, gfx11=0x7d)),
   ("v_cmpx_class_f32", dst(EXEC), src(1, 1), op(0x98, gfx8=0x11, gfx10=0x98, gfx11=0xfe)),
   ("v_cmpx_class_f16", dst(EXEC), src(1, 1), op(gfx8=0x15, gfx10=0x9f, gfx11=0xfd)),
   ("v_cmp_class_f64",  dst(VCC), src(2, 1), op(0xa8, gfx8=0x12, gfx10=0xa8, gfx11=0x7f), InstrClass.ValuDouble),
   ("v_cmpx_class_f64", dst(EXEC), src(2, 1), op(0xb8, gfx8=0x13, gfx10=0xb8, gfx11=0xff), InstrClass.ValuDouble),
}
# Every class compare is registered with in_mod=True and out_mod=False
# (the two positional flags after cls, matching the other insn() calls).
for (name, defs, ops, num, cls) in default_class(VOPC_CLASS, InstrClass.Valu32):
   insn(name, num, Format.VOPC, cls, True, False, definitions = defs, operands = ops)
1101
# Per-data-type description used to generate the regular VOPC compares:
# 'kind' is the type letter used in the mnemonic ('f', 'i' or 'u'), 'size' the
# bit width, and gfx6/gfx8/gfx10/gfx11 the per-generation opcode base that is
# OR-ed with the comparison index below.
VopcDataType = collections.namedtuple('VopcDataTypeInfo',
                                      ['kind', 'size', 'gfx6', 'gfx8', 'gfx10', 'gfx11'])

#                  kind, size, gfx6, gfx8, gfx10,gfx11
F16 = VopcDataType('f',  16,      0, 0x20, 0xc8, 0x00)
F32 = VopcDataType('f',  32,   0x00, 0x40, 0x00, 0x10)
F64 = VopcDataType('f',  64,   0x20, 0x60, 0x20, 0x20)
I16 = VopcDataType('i',  16,      0, 0xa0, 0x88, 0x30)
I32 = VopcDataType('i',  32,   0x80, 0xc0, 0x80, 0x40)
I64 = VopcDataType('i',  64,   0xa0, 0xe0, 0xa0, 0x50)
U16 = VopcDataType('u',  16,      0, 0xa8, 0xa8, 0x38)
U32 = VopcDataType('u',  32,   0xc0, 0xc8, 0xc0, 0x48)
U64 = VopcDataType('u',  64,   0xe0, 0xe8, 0xe0, 0x58)
dtypes = [F16, F32, F64, I16, I32, I64, U16, U32, U64]

# Comparison mnemonics indexed by 'comp': 16 for floats, 8 for integers.
COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"]
COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"]
# Generate v_cmp[x]_<comp>_<kind><size> for every comparison/type pair.
# range(1) pins cmps to 0, so no v_cmps* names are produced and the cmps
# handling below is currently inert.
for comp, dtype, cmps, cmpx in itertools.product(range(16), dtypes, range(1), range(2)):
   # Integer types only have the first 8 comparisons (and no cmps variants).
   if (comp >= 8 or cmps) and dtype.kind != 'f':
      continue

   name = COMPF[comp] if dtype.kind == 'f' else COMPI[comp]
   name = 'v_cmp{}{}_{}_{}{}'.format('s' if cmps else '', 'x' if cmpx else '', name, dtype.kind, dtype.size)

   # Opcode = comparison index | cmpx bit | per-type base.
   gfx6 = comp | (cmpx<<4) | (cmps<<6) | dtype.gfx6
   gfx8 = comp | (cmpx<<4) | dtype.gfx8
   if dtype == F16:
      # F16 on GFX10 is irregular: bit 3 of the comparison index is moved
      # up to bit 5 of the opcode.
      gfx10 = (comp & 0x7) | ((comp & 0x8) << 2) | (cmpx<<4) | dtype.gfx10
   else:
      gfx10 = comp | (cmpx<<4) | dtype.gfx10
   # GFX11 places the cmpx flag at bit 7 instead of bit 4.
   gfx11 = comp | (cmpx<<7) | dtype.gfx11

   if cmps:
      gfx8 = -1
      gfx10 = -1
      gfx11 = -1

   # 16-bit compares get no GFX6 encoding.
   if dtype.size == 16:
      gfx6 = -1

   # 16-bit integer "f"/"tru" compares get no GFX10/GFX11 encoding.
   if dtype in [I16, U16] and comp in [0, 7]:
      gfx10 = -1
      gfx11 = -1

   # GFX12 reuses the GFX11 encoding, minus the constant-result compares.
   # NOTE(review): only kind 'i' drops comp 7 ("tru") here; kind 'u' takes
   # the else branch, so v_cmp[x]_tru_u32/u64 keep a GFX12 encoding while
   # their signed counterparts do not - confirm against the GFX12 ISA.
   if dtype.kind == 'i':
      gfx12 = -1 if comp in [0, 7] else gfx11
   else:
      gfx12 = -1 if comp in [0, 15] else gfx11

   cls = InstrClass.Valu32
   if dtype == F64:
      cls = InstrClass.ValuDouble
   elif dtype in [I64, U64]:
      cls = InstrClass.Valu64

   # The gfx6 and gfx8 values are each passed twice.
   # NOTE(review): presumably they fill the (gfx6, gfx7) and (gfx8, gfx9)
   # slots of Opcode; confirm against the Opcode definition.
   enc = Opcode(gfx6, gfx6, gfx8, gfx8, gfx10, gfx11, gfx12)
   # Only float compares are registered with in_mod=True; cmpx forms define
   # EXEC, the others VCC.
   insn(name, enc, Format.VOPC, cls, dtype.kind == 'f', False,
        definitions = dst(EXEC if cmpx else VCC),
        operands = src(2, 2) if dtype.size == 64 else src(1, 1))
1161
1162
# VOPP instructions: packed 16bit instructions - 2 or 3 inputs and 1 output
#
# Each entry is (name, modifiers, definitions, operands, opcode[, class]).
# The single 'modifiers' flag is passed to insn() as both in_mod and out_mod.
# The WMMA/SWMMAC entries are declared with empty dst()/src() lists.
VOPP = {
   ("v_pk_mad_i16",     False, dst(1), src(1, 1, 1), op(gfx9=0x00)),
   ("v_pk_mul_lo_u16",  False, dst(1), src(1, 1), op(gfx9=0x01)),
   ("v_pk_add_i16",     False, dst(1), src(1, 1), op(gfx9=0x02)),
   ("v_pk_sub_i16",     False, dst(1), src(1, 1), op(gfx9=0x03)),
   ("v_pk_lshlrev_b16", False, dst(1), src(1, 1), op(gfx9=0x04)),
   ("v_pk_lshrrev_b16", False, dst(1), src(1, 1), op(gfx9=0x05)),
   ("v_pk_ashrrev_i16", False, dst(1), src(1, 1), op(gfx9=0x06)),
   ("v_pk_max_i16",     False, dst(1), src(1, 1), op(gfx9=0x07)),
   ("v_pk_min_i16",     False, dst(1), src(1, 1), op(gfx9=0x08)),
   ("v_pk_mad_u16",     False, dst(1), src(1, 1, 1), op(gfx9=0x09)),
   ("v_pk_add_u16",     False, dst(1), src(1, 1), op(gfx9=0x0a)),
   ("v_pk_sub_u16",     False, dst(1), src(1, 1), op(gfx9=0x0b)),
   ("v_pk_max_u16",     False, dst(1), src(1, 1), op(gfx9=0x0c)),
   ("v_pk_min_u16",     False, dst(1), src(1, 1), op(gfx9=0x0d)),
   ("v_pk_fma_f16",     True, dst(1), src(1, 1, 1), op(gfx9=0x0e)),
   ("v_pk_add_f16",     True, dst(1), src(1, 1), op(gfx9=0x0f)),
   ("v_pk_mul_f16",     True, dst(1), src(1, 1), op(gfx9=0x10)),
   ("v_pk_min_f16",     True, dst(1), src(1, 1), op(gfx9=0x11, gfx12=0x1b)), # called v_pk_min_num_f16 in GFX12
   ("v_pk_max_f16",     True, dst(1), src(1, 1), op(gfx9=0x12, gfx12=0x1c)), # called v_pk_max_num_f16 in GFX12
   ("v_pk_minimum_f16", True, dst(1), src(1, 1), op(gfx12=0x1d)),
   ("v_pk_maximum_f16", True, dst(1), src(1, 1), op(gfx12=0x1e)),
   ("v_fma_mix_f32",    True, dst(1), src(1, 1, 1), op(gfx9=0x20)), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA
   ("v_fma_mixlo_f16",  True, dst(1), src(1, 1, 1), op(gfx9=0x21)), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA
   ("v_fma_mixhi_f16",  True, dst(1), src(1, 1, 1), op(gfx9=0x22)), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA
   ("v_dot2_i32_i16",      False, dst(1), src(1, 1, 1), op(gfx9=0x26, gfx10=0x14, gfx11=-1)),
   ("v_dot2_u32_u16",      False, dst(1), src(1, 1, 1), op(gfx9=0x27, gfx10=0x15, gfx11=-1)),
   ("v_dot4_i32_iu8",      False, dst(1), src(1, 1, 1), op(gfx11=0x16)),
   ("v_dot4_i32_i8",       False, dst(1), src(1, 1, 1), op(gfx9=0x28, gfx10=0x16, gfx11=-1)),
   ("v_dot4_u32_u8",       False, dst(1), src(1, 1, 1), op(gfx9=0x29, gfx10=0x17)),
   ("v_dot8_i32_iu4",      False, dst(1), src(1, 1, 1), op(gfx11=0x18)),
   ("v_dot8_u32_u4",       False, dst(1), src(1, 1, 1), op(gfx9=0x2b, gfx10=0x19)),
   ("v_dot2_f32_f16",      False, dst(1), src(1, 1, 1), op(gfx9=0x23, gfx10=0x13)),
   ("v_dot2_f32_bf16",     False, dst(1), src(1, 1, 1), op(gfx11=0x1a)),
   ("v_dot4_f32_fp8_bf8",  False, dst(1), src(1, 1, 1), op(gfx12=0x24)),
   ("v_dot4_f32_bf8_fp8",  False, dst(1), src(1, 1, 1), op(gfx12=0x25)),
   ("v_dot4_f32_fp8_fp8",  False, dst(1), src(1, 1, 1), op(gfx12=0x26)),
   ("v_dot4_f32_bf8_bf8",  False, dst(1), src(1, 1, 1), op(gfx12=0x27)),
   ("v_wmma_f32_16x16x16_f16",       False, dst(), src(), op(gfx11=0x40), InstrClass.WMMA),
   ("v_wmma_f32_16x16x16_bf16",      False, dst(), src(), op(gfx11=0x41), InstrClass.WMMA),
   ("v_wmma_f16_16x16x16_f16",       False, dst(), src(), op(gfx11=0x42), InstrClass.WMMA),
   ("v_wmma_bf16_16x16x16_bf16",     False, dst(), src(), op(gfx11=0x43), InstrClass.WMMA),
   ("v_wmma_i32_16x16x16_iu8",       False, dst(), src(), op(gfx11=0x44), InstrClass.WMMA),
   ("v_wmma_i32_16x16x16_iu4",       False, dst(), src(), op(gfx11=0x45), InstrClass.WMMA),
   ("v_wmma_f32_16x16x16_fp8_fp8",   False, dst(), src(), op(gfx12=0x46), InstrClass.WMMA),
   ("v_wmma_f32_16x16x16_fp8_bf8",   False, dst(), src(), op(gfx12=0x47), InstrClass.WMMA),
   ("v_wmma_f32_16x16x16_bf8_fp8",   False, dst(), src(), op(gfx12=0x48), InstrClass.WMMA),
   ("v_wmma_f32_16x16x16_bf8_bf8",   False, dst(), src(), op(gfx12=0x49), InstrClass.WMMA),
   ("v_wmma_i32_16x16x32_iu4",       False, dst(), src(), op(gfx12=0x4a), InstrClass.WMMA),
   ("v_swmmac_f32_16x16x32_f16",     False, dst(), src(), op(gfx12=0x50), InstrClass.WMMA),
   ("v_swmmac_f32_16x16x32_bf16",    False, dst(), src(), op(gfx12=0x51), InstrClass.WMMA),
   ("v_swmmac_f16_16x16x32_f16",     False, dst(), src(), op(gfx12=0x52), InstrClass.WMMA),
   ("v_swmmac_bf16_16x16x32_bf16",   False, dst(), src(), op(gfx12=0x53), InstrClass.WMMA),
   ("v_swmmac_i32_16x16x32_iu8",     False, dst(), src(), op(gfx12=0x54), InstrClass.WMMA),
   ("v_swmmac_i32_16x16x32_iu4",     False, dst(), src(), op(gfx12=0x55), InstrClass.WMMA),
   ("v_swmmac_i32_16x16x64_iu4",     False, dst(), src(), op(gfx12=0x56), InstrClass.WMMA),
   ("v_swmmac_f32_16x16x32_fp8_fp8", False, dst(), src(), op(gfx12=0x57), InstrClass.WMMA),
   ("v_swmmac_f32_16x16x32_fp8_bf8", False, dst(), src(), op(gfx12=0x58), InstrClass.WMMA),
   ("v_swmmac_f32_16x16x32_bf8_fp8", False, dst(), src(), op(gfx12=0x59), InstrClass.WMMA),
   ("v_swmmac_f32_16x16x32_bf8_bf8", False, dst(), src(), op(gfx12=0x5a), InstrClass.WMMA),
}
# Register every packed instruction under Format.VOP3P.
for (name, modifiers, defs, ops, num, cls) in default_class(VOPP, InstrClass.Valu32):
   insn(name, num, Format.VOP3P, cls, modifiers, modifiers, definitions = defs, operands = ops)
1227
1228
# VINTRP (GFX6 - GFX10.3) instructions:
# Each entry is (name, definitions, operands, opcode); every entry takes M0 as
# an operand and has gfx11=-1, matching the generation range stated above.
# All of them are registered as InstrClass.Valu32 with insn()'s default
# modifier flags.
VINTRP = {
   ("v_interp_p1_f32",  dst(1), src(1, M0), op(0x00, gfx11=-1)),
   ("v_interp_p2_f32",  dst(1), src(1, M0, 1), op(0x01, gfx11=-1)),
   ("v_interp_mov_f32", dst(1), src(1, M0), op(0x02, gfx11=-1)),
}
for (name, defs, ops, num) in VINTRP:
   insn(name, num, Format.VINTRP, InstrClass.Valu32, definitions = defs, operands = ops)
1237
1238
# VINTERP (GFX11+) instructions:
# Each entry is just (name, opcode). All of them share the same shape, so the
# loop registers each one with in_mod=True, out_mod=True, a single dst(1)
# definition and three src(1, 1, 1) operands.
VINTERP = {
   ("v_interp_p10_f32_inreg",         op(gfx11=0x00)),
   ("v_interp_p2_f32_inreg",          op(gfx11=0x01)),
   ("v_interp_p10_f16_f32_inreg",     op(gfx11=0x02)),
   ("v_interp_p2_f16_f32_inreg",      op(gfx11=0x03)),
   ("v_interp_p10_rtz_f16_f32_inreg", op(gfx11=0x04)),
   ("v_interp_p2_rtz_f16_f32_inreg",  op(gfx11=0x05)),
}
for (name, num) in VINTERP:
   insn(name, num, Format.VINTERP_INREG, InstrClass.Valu32, True, True, definitions = dst(1), operands = src(1, 1, 1))
1250
1251
1252# VOP3 instructions: 3 inputs, 1 output
1253# VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
1254VOP3 = {
1255   ("v_mad_legacy_f32",        True, True, dst(1), src(1, 1, 1), op(0x140, gfx8=0x1c0, gfx10=0x140, gfx11=-1)), # GFX6-GFX10
1256   ("v_mad_f32",               True, True, dst(1), src(1, 1, 1), op(0x141, gfx8=0x1c1, gfx10=0x141, gfx11=-1)),
1257   ("v_mad_i32_i24",           False, False, dst(1), src(1, 1, 1), op(0x142, gfx8=0x1c2, gfx10=0x142, gfx11=0x20a)),
1258   ("v_mad_u32_u24",           False, False, dst(1), src(1, 1, 1), op(0x143, gfx8=0x1c3, gfx10=0x143, gfx11=0x20b)),
1259   ("v_cubeid_f32",            True, True, dst(1), src(1, 1, 1), op(0x144, gfx8=0x1c4, gfx10=0x144, gfx11=0x20c)),
1260   ("v_cubesc_f32",            True, True, dst(1), src(1, 1, 1), op(0x145, gfx8=0x1c5, gfx10=0x145, gfx11=0x20d)),
1261   ("v_cubetc_f32",            True, True, dst(1), src(1, 1, 1), op(0x146, gfx8=0x1c6, gfx10=0x146, gfx11=0x20e)),
1262   ("v_cubema_f32",            True, True, dst(1), src(1, 1, 1), op(0x147, gfx8=0x1c7, gfx10=0x147, gfx11=0x20f)),
1263   ("v_bfe_u32",               False, False, dst(1), src(1, 1, 1), op(0x148, gfx8=0x1c8, gfx10=0x148, gfx11=0x210)),
1264   ("v_bfe_i32",               False, False, dst(1), src(1, 1, 1), op(0x149, gfx8=0x1c9, gfx10=0x149, gfx11=0x211)),
1265   ("v_bfi_b32",               False, False, dst(1), src(1, 1, 1), op(0x14a, gfx8=0x1ca, gfx10=0x14a, gfx11=0x212)),
1266   ("v_fma_f32",               True, True, dst(1), src(1, 1, 1), op(0x14b, gfx8=0x1cb, gfx10=0x14b, gfx11=0x213), InstrClass.ValuFma),
1267   ("v_fma_f64",               True, True, dst(2), src(2, 2, 2), op(0x14c, gfx8=0x1cc, gfx10=0x14c, gfx11=0x214), InstrClass.ValuDouble),
1268   ("v_lerp_u8",               False, False, dst(1), src(1, 1, 1), op(0x14d, gfx8=0x1cd, gfx10=0x14d, gfx11=0x215)),
1269   ("v_alignbit_b32",          False, False, dst(1), src(1, 1, 1), op(0x14e, gfx8=0x1ce, gfx10=0x14e, gfx11=0x216)),
1270   ("v_alignbyte_b32",         False, False, dst(1), src(1, 1, 1), op(0x14f, gfx8=0x1cf, gfx10=0x14f, gfx11=0x217)),
1271   ("v_mullit_f32",            True, True, dst(1), src(1, 1, 1), op(0x150, gfx8=-1, gfx10=0x150, gfx11=0x218)),
1272   ("v_min3_f32",              True, True, dst(1), src(1, 1, 1), op(0x151, gfx8=0x1d0, gfx10=0x151, gfx11=0x219, gfx12=0x229)), # called v_min3_num_f32 in GFX12
1273   ("v_min3_i32",              False, False, dst(1), src(1, 1, 1), op(0x152, gfx8=0x1d1, gfx10=0x152, gfx11=0x21a)),
1274   ("v_min3_u32",              False, False, dst(1), src(1, 1, 1), op(0x153, gfx8=0x1d2, gfx10=0x153, gfx11=0x21b)),
1275   ("v_max3_f32",              True, True, dst(1), src(1, 1, 1), op(0x154, gfx8=0x1d3, gfx10=0x154, gfx11=0x21c, gfx12=0x22a)), # called v_max3_num_f32 in GFX12
1276   ("v_max3_i32",              False, False, dst(1), src(1, 1, 1), op(0x155, gfx8=0x1d4, gfx10=0x155, gfx11=0x21d)),
1277   ("v_max3_u32",              False, False, dst(1), src(1, 1, 1), op(0x156, gfx8=0x1d5, gfx10=0x156, gfx11=0x21e)),
1278   ("v_med3_f32",              True, True, dst(1), src(1, 1, 1), op(0x157, gfx8=0x1d6, gfx10=0x157, gfx11=0x21f, gfx12=0x231)), # called v_med3_num_f32 in GFX12
1279   ("v_med3_i32",              False, False, dst(1), src(1, 1, 1), op(0x158, gfx8=0x1d7, gfx10=0x158, gfx11=0x220)),
1280   ("v_med3_u32",              False, False, dst(1), src(1, 1, 1), op(0x159, gfx8=0x1d8, gfx10=0x159, gfx11=0x221)),
1281   ("v_sad_u8",                False, False, dst(1), src(1, 1, 1), op(0x15a, gfx8=0x1d9, gfx10=0x15a, gfx11=0x222)),
1282   ("v_sad_hi_u8",             False, False, dst(1), src(1, 1, 1), op(0x15b, gfx8=0x1da, gfx10=0x15b, gfx11=0x223)),
1283   ("v_sad_u16",               False, False, dst(1), src(1, 1, 1), op(0x15c, gfx8=0x1db, gfx10=0x15c, gfx11=0x224)),
1284   ("v_sad_u32",               False, False, dst(1), src(1, 1, 1), op(0x15d, gfx8=0x1dc, gfx10=0x15d, gfx11=0x225)),
1285   ("v_cvt_pk_u8_f32",         True, False, dst(1), src(1, 1, 1), op(0x15e, gfx8=0x1dd, gfx10=0x15e, gfx11=0x226)),
1286   ("p_v_cvt_pk_u8_f32",       True, False, dst(1), src(1), op(-1)),
1287   ("v_div_fixup_f32",         True, True, dst(1), src(1, 1, 1), op(0x15f, gfx8=0x1de, gfx10=0x15f, gfx11=0x227)),
1288   ("v_div_fixup_f64",         True, True, dst(2), src(2, 2, 2), op(0x160, gfx8=0x1df, gfx10=0x160, gfx11=0x228)),
1289   ("v_lshl_b64",              False, False, dst(2), src(2, 1), op(0x161, gfx8=-1), InstrClass.Valu64),
1290   ("v_lshr_b64",              False, False, dst(2), src(2, 1), op(0x162, gfx8=-1), InstrClass.Valu64),
1291   ("v_ashr_i64",              False, False, dst(2), src(2, 1), op(0x163, gfx8=-1), InstrClass.Valu64),
1292   ("v_add_f64_e64",           True, True, dst(2), src(2, 2), op(0x164, gfx8=0x280, gfx10=0x164, gfx11=0x327, gfx12=0x102), InstrClass.ValuDoubleAdd), # GFX12 is VOP2 opcode + 0x100
1293   ("v_mul_f64_e64",           True, True, dst(2), src(2, 2), op(0x165, gfx8=0x281, gfx10=0x165, gfx11=0x328, gfx12=0x106), InstrClass.ValuDouble), # GFX12 is VOP2 opcode + 0x100
1294   ("v_min_f64_e64",           True, True, dst(2), src(2, 2), op(0x166, gfx8=0x282, gfx10=0x166, gfx11=0x329, gfx12=0x10d), InstrClass.ValuDouble), # GFX12 is VOP2 opcode + 0x100
1295   ("v_max_f64_e64",           True, True, dst(2), src(2, 2), op(0x167, gfx8=0x283, gfx10=0x167, gfx11=0x32a, gfx12=0x10e), InstrClass.ValuDouble), # GFX12 is VOP2 opcode + 0x100
1296   ("v_ldexp_f64",             False, True, dst(2), src(2, 1), op(0x168, gfx8=0x284, gfx10=0x168, gfx11=0x32b), InstrClass.ValuDouble), # src1 can take input modifiers
1297   ("v_mul_lo_u32",            False, False, dst(1), src(1, 1), op(0x169, gfx8=0x285, gfx10=0x169, gfx11=0x32c), InstrClass.ValuQuarterRate32),
1298   ("v_mul_hi_u32",            False, False, dst(1), src(1, 1), op(0x16a, gfx8=0x286, gfx10=0x16a, gfx11=0x32d), InstrClass.ValuQuarterRate32),
1299   ("v_mul_lo_i32",            False, False, dst(1), src(1, 1), op(0x16b, gfx8=0x285, gfx10=0x16b, gfx11=0x32c), InstrClass.ValuQuarterRate32), # identical to v_mul_lo_u32
1300   ("v_mul_hi_i32",            False, False, dst(1), src(1, 1), op(0x16c, gfx8=0x287, gfx10=0x16c, gfx11=0x32e), InstrClass.ValuQuarterRate32),
1301   ("v_div_scale_f32",         True, True, dst(1, VCC), src(1, 1, 1), op(0x16d, gfx8=0x1e0, gfx10=0x16d, gfx11=0x2fc)),
1302   ("v_div_scale_f64",         True, True, dst(2, VCC), src(2, 2, 2), op(0x16e, gfx8=0x1e1, gfx10=0x16e, gfx11=0x2fd), InstrClass.ValuDouble),
1303   ("v_div_fmas_f32",          True, True, dst(1), src(1, 1, 1, VCC), op(0x16f, gfx8=0x1e2, gfx10=0x16f, gfx11=0x237)),
1304   ("v_div_fmas_f64",          True, True, dst(2), src(2, 2, 2, VCC), op(0x170, gfx8=0x1e3, gfx10=0x170, gfx11=0x238), InstrClass.ValuDouble),
1305   ("v_msad_u8",               False, False, dst(1), src(1, 1, 1), op(0x171, gfx8=0x1e4, gfx10=0x171, gfx11=0x239)),
1306   ("v_qsad_pk_u16_u8",        False, False, dst(2), src(2, 1, 2), op(0x172, gfx8=0x1e5, gfx10=0x172, gfx11=0x23a)),
1307   ("v_mqsad_pk_u16_u8",       False, False, dst(2), src(2, 1, 2), op(0x173, gfx8=0x1e6, gfx10=0x173, gfx11=0x23b)),
1308   ("v_trig_preop_f64",        False, False, dst(2), src(2, 2), op(0x174, gfx8=0x292, gfx10=0x174, gfx11=0x32f), InstrClass.ValuDouble),
1309   ("v_mqsad_u32_u8",          False, False, dst(4), src(2, 1, 4), op(gfx7=0x175, gfx8=0x1e7, gfx10=0x175, gfx11=0x23d), InstrClass.ValuQuarterRate32),
1310   ("v_mad_u64_u32",           False, False, dst(2, VCC), src(1, 1, 2), op(gfx7=0x176, gfx8=0x1e8, gfx10=0x176, gfx11=0x2fe), InstrClass.Valu64), # called v_mad_co_u64_u32 in GFX12
1311   ("v_mad_i64_i32",           False, False, dst(2, VCC), src(1, 1, 2), op(gfx7=0x177, gfx8=0x1e9, gfx10=0x177, gfx11=0x2ff), InstrClass.Valu64), # called v_mad_co_i64_i32 in GFX12
1312   ("v_mad_legacy_f16",        True, True, dst(1), src(1, 1, 1), op(gfx8=0x1ea, gfx10=-1)),
1313   ("v_mad_legacy_u16",        False, False, dst(1), src(1, 1, 1), op(gfx8=0x1eb, gfx10=-1)),
1314   ("v_mad_legacy_i16",        False, False, dst(1), src(1, 1, 1), op(gfx8=0x1ec, gfx10=-1)),
1315   ("v_perm_b32",              False, False, dst(1), src(1, 1, 1), op(gfx8=0x1ed, gfx10=0x344, gfx11=0x244)),
1316   ("v_fma_legacy_f16",        True, True, dst(1), src(1, 1, 1), op(gfx8=0x1ee, gfx10=-1), InstrClass.ValuFma),
1317   ("v_div_fixup_legacy_f16",  True, True, dst(1), src(1, 1, 1), op(gfx8=0x1ef, gfx10=-1)),
1318   ("v_cvt_pkaccum_u8_f32",    True, False, dst(1), src(1, 1, 1), op(0x12c, gfx8=0x1f0, gfx10=-1)),
1319   ("v_mad_u32_u16",           False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f1, gfx10=0x373, gfx11=0x259)),
1320   ("v_mad_i32_i16",           False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f2, gfx10=0x375, gfx11=0x25a)),
1321   ("v_xad_u32",               False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f3, gfx10=0x345, gfx11=0x245)),
1322   ("v_min3_f16",              True, True, dst(1), src(1, 1, 1), op(gfx9=0x1f4, gfx10=0x351, gfx11=0x249, gfx12=0x22b)), # called v_min3_num_f16 in GFX12
1323   ("v_min3_i16",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f5, gfx10=0x352, gfx11=0x24a)),
1324   ("v_min3_u16",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f6, gfx10=0x353, gfx11=0x24b)),
1325   ("v_max3_f16",              True, True, dst(1), src(1, 1, 1), op(gfx9=0x1f7, gfx10=0x354, gfx11=0x24c, gfx12=0x22c)), # called v_max3_num_f16 in GFX12
1326   ("v_max3_i16",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f8, gfx10=0x355, gfx11=0x24d)),
1327   ("v_max3_u16",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1f9, gfx10=0x356, gfx11=0x24e)),
1328   ("v_med3_f16",              True, True, dst(1), src(1, 1, 1), op(gfx9=0x1fa, gfx10=0x357, gfx11=0x24f, gfx12=0x232)), # called v_med3_num_f16 in GFX12
1329   ("v_med3_i16",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1fb, gfx10=0x358, gfx11=0x250)),
1330   ("v_med3_u16",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1fc, gfx10=0x359, gfx11=0x251)),
1331   ("v_lshl_add_u32",          False, False, dst(1), src(1, 1, 1), op(gfx9=0x1fd, gfx10=0x346, gfx11=0x246)),
1332   ("v_add_lshl_u32",          False, False, dst(1), src(1, 1, 1), op(gfx9=0x1fe, gfx10=0x347, gfx11=0x247)),
1333   ("v_add3_u32",              False, False, dst(1), src(1, 1, 1), op(gfx9=0x1ff, gfx10=0x36d, gfx11=0x255)),
1334   ("v_lshl_or_b32",           False, False, dst(1), src(1, 1, 1), op(gfx9=0x200, gfx10=0x36f, gfx11=0x256)),
1335   ("v_and_or_b32",            False, False, dst(1), src(1, 1, 1), op(gfx9=0x201, gfx10=0x371, gfx11=0x257)),
1336   ("v_or3_b32",               False, False, dst(1), src(1, 1, 1), op(gfx9=0x202, gfx10=0x372, gfx11=0x258)),
1337   ("v_mad_f16",               True, True, dst(1), src(1, 1, 1), op(gfx9=0x203, gfx10=-1)),
1338   ("v_mad_u16",               False, False, dst(1), src(1, 1, 1), op(gfx9=0x204, gfx10=0x340, gfx11=0x241)),
1339   ("v_mad_i16",               False, False, dst(1), src(1, 1, 1), op(gfx9=0x205, gfx10=0x35e, gfx11=0x253)),
1340   ("v_fma_f16",               True, True, dst(1), src(1, 1, 1), op(gfx9=0x206, gfx10=0x34b, gfx11=0x248)),
1341   ("v_div_fixup_f16",         True, True, dst(1), src(1, 1, 1), op(gfx9=0x207, gfx10=0x35f, gfx11=0x254)),
1342   ("v_interp_p1ll_f16",       True, True, dst(1), src(1, M0), op(gfx8=0x274, gfx10=0x342, gfx11=-1)),
1343   ("v_interp_p1lv_f16",       True, True, dst(1), src(1, M0, 1), op(gfx8=0x275, gfx10=0x343, gfx11=-1)),
1344   ("v_interp_p2_legacy_f16",  True, True, dst(1), src(1, M0, 1), op(gfx8=0x276, gfx10=-1)),
1345   ("v_interp_p2_f16",         True, True, dst(1), src(1, M0, 1), op(gfx9=0x277, gfx10=0x35a, gfx11=-1)),
1346   ("v_interp_p2_hi_f16",      True, True, dst(1), src(1, M0, 1), op(gfx9=0x277, gfx10=0x35a, gfx11=-1)),
1347   ("v_ldexp_f32",             False, True, dst(1), src(1, 1), op(0x12b, gfx8=0x288, gfx10=0x362, gfx11=0x31c)),
1348   ("v_readlane_b32_e64",      False, False, dst(1), src(1, 1), op(gfx8=0x289, gfx10=0x360)),
1349   ("v_writelane_b32_e64",     False, False, dst(1), src(1, 1, 1), op(gfx8=0x28a, gfx10=0x361)),
1350   ("v_bcnt_u32_b32",          False, False, dst(1), src(1, 1), op(0x122, gfx8=0x28b, gfx10=0x364, gfx11=0x31e)),
1351   ("v_mbcnt_lo_u32_b32",      False, False, dst(1), src(1, 1), op(0x123, gfx8=0x28c, gfx10=0x365, gfx11=0x31f)),
1352   ("v_mbcnt_hi_u32_b32_e64",  False, False, dst(1), src(1, 1), op(gfx8=0x28d, gfx10=0x366, gfx11=0x320)),
1353   ("v_lshlrev_b64_e64",       False, False, dst(2), src(1, 2), op(gfx8=0x28f, gfx10=0x2ff, gfx11=0x33c, gfx12=0x11f), InstrClass.Valu64), # GFX12 is VOP2 opcode + 0x100
1354   ("v_lshrrev_b64",           False, False, dst(2), src(1, 2), op(gfx8=0x290, gfx10=0x300, gfx11=0x33d), InstrClass.Valu64),
1355   ("v_ashrrev_i64",           False, False, dst(2), src(1, 2), op(gfx8=0x291, gfx10=0x301, gfx11=0x33e), InstrClass.Valu64),
1356   ("v_bfm_b32",               False, False, dst(1), src(1, 1), op(0x11e, gfx8=0x293, gfx10=0x363, gfx11=0x31d)),
1357   ("v_cvt_pknorm_i16_f32",    True, False, dst(1), src(1, 1), op(0x12d, gfx8=0x294, gfx10=0x368, gfx11=0x321)),
1358   ("v_cvt_pknorm_u16_f32",    True, False, dst(1), src(1, 1), op(0x12e, gfx8=0x295, gfx10=0x369, gfx11=0x322)),
1359   ("v_cvt_pkrtz_f16_f32_e64", True, False, dst(1), src(1, 1), op(gfx8=0x296, gfx10=-1)),
1360   ("v_cvt_pk_u16_u32",        False, False, dst(1), src(1, 1), op(0x130, gfx8=0x297, gfx10=0x36a, gfx11=0x323)),
1361   ("v_cvt_pk_i16_i32",        False, False, dst(1), src(1, 1), op(0x131, gfx8=0x298, gfx10=0x36b, gfx11=0x324)),
1362   ("v_cvt_pknorm_i16_f16",    True, False, dst(1), src(1, 1), op(gfx9=0x299, gfx10=0x312)), #v_cvt_pk_norm_i16_f32 in GFX11
1363   ("v_cvt_pknorm_u16_f16",    True, False, dst(1), src(1, 1), op(gfx9=0x29a, gfx10=0x313)), #v_cvt_pk_norm_u16_f32 in GFX11
1364   ("v_add_i32",               False, False, dst(1), src(1, 1), op(gfx9=0x29c, gfx10=0x37f, gfx11=0x326)),
1365   ("v_sub_i32",               False, False, dst(1), src(1, 1), op(gfx9=0x29d, gfx10=0x376, gfx11=0x325)),
1366   ("v_add_i16",               False, False, dst(1), src(1, 1), op(gfx9=0x29e, gfx10=0x30d)),
1367   ("v_sub_i16",               False, False, dst(1), src(1, 1), op(gfx9=0x29f, gfx10=0x30e)),
1368   ("v_pack_b32_f16",          True, False, dst(1), src(1, 1), op(gfx9=0x2a0, gfx10=0x311)),
1369   ("v_xor3_b32",              False, False, dst(1), src(1, 1, 1), op(gfx10=0x178, gfx11=0x240)),
1370   ("v_permlane16_b32",        False, False, dst(1), src(1, 1, 1), op(gfx10=0x377, gfx11=0x25b)),
1371   ("v_permlanex16_b32",       False, False, dst(1), src(1, 1, 1), op(gfx10=0x378, gfx11=0x25c)),
1372   ("v_add_co_u32_e64",        False, False, dst(1, VCC), src(1, 1), op(gfx10=0x30f, gfx11=0x300)),
1373   ("v_sub_co_u32_e64",        False, False, dst(1, VCC), src(1, 1), op(gfx10=0x310, gfx11=0x301)),
1374   ("v_subrev_co_u32_e64",     False, False, dst(1, VCC), src(1, 1), op(gfx10=0x319, gfx11=0x302)),
1375   ("v_add_u16_e64",           False, False, dst(1), src(1, 1), op(gfx10=0x303)),
1376   ("v_sub_u16_e64",           False, False, dst(1), src(1, 1), op(gfx10=0x304)),
1377   ("v_mul_lo_u16_e64",        False, False, dst(1), src(1, 1), op(gfx10=0x305)),
1378   ("v_max_u16_e64",           False, False, dst(1), src(1, 1), op(gfx10=0x309)),
1379   ("v_max_i16_e64",           False, False, dst(1), src(1, 1), op(gfx10=0x30a)),
1380   ("v_min_u16_e64",           False, False, dst(1), src(1, 1), op(gfx10=0x30b)),
1381   ("v_min_i16_e64",           False, False, dst(1), src(1, 1), op(gfx10=0x30c)),
1382   ("v_lshrrev_b16_e64",       False, False, dst(1), src(1, 1), op(gfx10=0x307, gfx11=0x339)),
1383   ("v_ashrrev_i16_e64",       False, False, dst(1), src(1, 1), op(gfx10=0x308, gfx11=0x33a)),
1384   ("v_lshlrev_b16_e64",       False, False, dst(1), src(1, 1), op(gfx10=0x314, gfx11=0x338)),
1385   ("v_fma_legacy_f32",        True, True, dst(1), src(1, 1, 1), op(gfx10=0x140, gfx11=0x209), InstrClass.ValuFma), #GFX10.3+, v_fma_dx9_zero_f32 in GFX11
1386   ("v_maxmin_f32",            True, True, dst(1), src(1, 1, 1), op(gfx11=0x25e, gfx12=0x269)), # called v_maxmin_num_f32 in GFX12
1387   ("v_minmax_f32",            True, True, dst(1), src(1, 1, 1), op(gfx11=0x25f, gfx12=0x268)), # called v_minmax_num_f32 in GFX12
1388   ("v_maxmin_f16",            True, True, dst(1), src(1, 1, 1), op(gfx11=0x260, gfx12=0x26b)), # called v_maxmin_num_f16 in GFX12
1389   ("v_minmax_f16",            True, True, dst(1), src(1, 1, 1), op(gfx11=0x261, gfx12=0x26a)), # called v_minmax_num_f16 in GFX12
1390   ("v_maxmin_u32",            False, False, dst(1), src(1, 1, 1), op(gfx11=0x262)),
1391   ("v_minmax_u32",            False, False, dst(1), src(1, 1, 1), op(gfx11=0x263)),
1392   ("v_maxmin_i32",            False, False, dst(1), src(1, 1, 1), op(gfx11=0x264)),
1393   ("v_minmax_i32",            False, False, dst(1), src(1, 1, 1), op(gfx11=0x265)),
1394   ("v_dot2_f16_f16",          False, False, dst(1), src(1, 1, 1), op(gfx11=0x266)),
1395   ("v_dot2_bf16_bf16",        False, False, dst(1), src(1, 1, 1), op(gfx11=0x267)),
1396   ("v_cvt_pk_i16_f32",        True, False, dst(1), src(1, 1), op(gfx11=0x306)),
1397   ("v_cvt_pk_u16_f32",        True, False, dst(1), src(1, 1), op(gfx11=0x307)),
1398   ("v_and_b16",               False, False, dst(1), src(1, 1), op(gfx11=0x362)),
1399   ("v_or_b16",                False, False, dst(1), src(1, 1), op(gfx11=0x363)),
1400   ("v_xor_b16",               False, False, dst(1), src(1, 1), op(gfx11=0x364)),
1401   ("v_cndmask_b16",           True, False, dst(1), src(1, 1, VCC), op(gfx11=0x25d)),
1402   ("v_minimum3_f32",          True, True, dst(1), src(1, 1, 1), op(gfx12=0x22d)),
1403   ("v_maximum3_f32",          True, True, dst(1), src(1, 1, 1), op(gfx12=0x22e)),
1404   ("v_minimum3_f16",          True, True, dst(1), src(1, 1, 1), op(gfx12=0x22f)),
1405   ("v_maximum3_f16",          True, True, dst(1), src(1, 1, 1), op(gfx12=0x230)),
1406   ("v_minimummaximum_f32",    True, True, dst(1), src(1, 1, 1), op(gfx12=0x26c)),
1407   ("v_maximumminimum_f32",    True, True, dst(1), src(1, 1, 1), op(gfx12=0x26d)),
1408   ("v_minimummaximum_f16",    True, True, dst(1), src(1, 1, 1), op(gfx12=0x26e)),
1409   ("v_maximumminimum_f16",    True, True, dst(1), src(1, 1, 1), op(gfx12=0x26f)),
1410   ("v_s_exp_f32",             True, True, dst(1), src(1), op(gfx12=0x280), InstrClass.ValuPseudoScalarTrans),
1411   ("v_s_exp_f16",             True, True, dst(1), src(1), op(gfx12=0x281), InstrClass.ValuPseudoScalarTrans),
1412   ("v_s_log_f32",             True, True, dst(1), src(1), op(gfx12=0x282), InstrClass.ValuPseudoScalarTrans),
1413   ("v_s_log_f16",             True, True, dst(1), src(1), op(gfx12=0x283), InstrClass.ValuPseudoScalarTrans),
1414   ("v_s_rcp_f32",             True, True, dst(1), src(1), op(gfx12=0x284), InstrClass.ValuPseudoScalarTrans),
1415   ("v_s_rcp_f16",             True, True, dst(1), src(1), op(gfx12=0x285), InstrClass.ValuPseudoScalarTrans),
1416   ("v_s_rsq_f32",             True, True, dst(1), src(1), op(gfx12=0x286), InstrClass.ValuPseudoScalarTrans),
1417   ("v_s_rsq_f16",             True, True, dst(1), src(1), op(gfx12=0x287), InstrClass.ValuPseudoScalarTrans),
1418   ("v_s_sqrt_f32",            True, True, dst(1), src(1), op(gfx12=0x288), InstrClass.ValuPseudoScalarTrans),
1419   ("v_s_sqrt_f16",            True, True, dst(1), src(1), op(gfx12=0x289), InstrClass.ValuPseudoScalarTrans),
1420   ("v_minimum_f64",           True, True, dst(1), src(1, 1), op(gfx12=0x341)),
1421   ("v_maximum_f64",           True, True, dst(1), src(1, 1), op(gfx12=0x342)),
1422   ("v_minimum_f32",           True, True, dst(1), src(1, 1), op(gfx12=0x365)),
1423   ("v_maximum_f32",           True, True, dst(1), src(1, 1), op(gfx12=0x366)),
1424   ("v_minimum_f16",           True, True, dst(1), src(1, 1), op(gfx12=0x367)),
1425   ("v_maximum_f16",           True, True, dst(1), src(1, 1), op(gfx12=0x368)),
1426   ("v_permlane16_var_b32",    False, False, dst(1), src(1, 1), op(gfx12=0x30f)),
1427   ("v_permlanex16_var_b32",   False, False, dst(1), src(1, 1), op(gfx12=0x310)),
1428   ("v_cvt_pk_fp8_f32",        True, False, dst(1), src(1, 1), op(gfx12=0x369)),
1429   ("v_cvt_pk_bf8_f32",        True, False, dst(1), src(1, 1), op(gfx12=0x36a)),
1430   ("v_cvt_sr_fp8_f32",        True, False, dst(1), src(1, 1), op(gfx12=0x36b)),
1431   ("v_cvt_sr_bf8_f32",        True, False, dst(1), src(1, 1), op(gfx12=0x36c)),
1432}
# Register every VOP3 table row. Rows are unpacked as
# (name, input-modifier flag, output-modifier flag, definitions, operands,
# opcodes, class).
# NOTE(review): default_class() presumably fills in InstrClass.Valu32 for rows
# that omit an explicit class - confirm against its definition.
for name, in_mod, out_mod, defs, ops, num, cls in default_class(VOP3, InstrClass.Valu32):
   insn(name, num, Format.VOP3, cls, in_mod, out_mod, definitions = defs, operands = ops)
1435
1436
# VOPD instructions: each entry is a (mnemonic, op(...)) pair.
# Every entry in this table supplies only a gfx11 opcode.
# This is a set literal: entries are unordered and an exact duplicate tuple
# would silently collapse into one.
VOPD = {
   ("v_dual_fmac_f32",         op(gfx11=0x00)),
   ("v_dual_fmaak_f32",        op(gfx11=0x01)),
   ("v_dual_fmamk_f32",        op(gfx11=0x02)),
   ("v_dual_mul_f32",          op(gfx11=0x03)),
   ("v_dual_add_f32",          op(gfx11=0x04)),
   ("v_dual_sub_f32",          op(gfx11=0x05)),
   ("v_dual_subrev_f32",       op(gfx11=0x06)),
   ("v_dual_mul_dx9_zero_f32", op(gfx11=0x07)),
   ("v_dual_mov_b32",          op(gfx11=0x08)),
   ("v_dual_cndmask_b32",      op(gfx11=0x09)),
   ("v_dual_max_f32",          op(gfx11=0x0a)),
   ("v_dual_min_f32",          op(gfx11=0x0b)),
   ("v_dual_dot2acc_f32_f16",  op(gfx11=0x0c)),
   ("v_dual_dot2acc_f32_bf16", op(gfx11=0x0d)),
   ("v_dual_add_nc_u32",       op(gfx11=0x10)),
   ("v_dual_lshlrev_b32",      op(gfx11=0x11)),
   ("v_dual_and_b32",          op(gfx11=0x12)),
}
# Register every VOPD table entry with Format.VOPD and class InstrClass.Valu32.
for name, num in VOPD:
   insn(name, num, format = Format.VOPD, cls = InstrClass.Valu32)
1458
1459
# DS instructions: 3 inputs (1 addr, 2 data), 1 output
# Each entry is a (mnemonic, op(...)) pair; trailing comments give the name
# used by newer ISA documentation.
# NOTE(review): the positional op() argument is presumably the opcode for the
# oldest generation and -1 presumably marks the instruction as unavailable from
# that generation on - confirm against op().
DS = {
   ("ds_add_u32",              op(0x00)),
   ("ds_sub_u32",              op(0x01)),
   ("ds_rsub_u32",             op(0x02)),
   ("ds_inc_u32",              op(0x03)),
   ("ds_dec_u32",              op(0x04)),
   ("ds_min_i32",              op(0x05)),
   ("ds_max_i32",              op(0x06)),
   ("ds_min_u32",              op(0x07)),
   ("ds_max_u32",              op(0x08)),
   ("ds_and_b32",              op(0x09)),
   ("ds_or_b32",               op(0x0a)),
   ("ds_xor_b32",              op(0x0b)),
   ("ds_mskor_b32",            op(0x0c)),
   ("ds_write_b32",            op(0x0d)), #ds_store_b32 in GFX11
   ("ds_write2_b32",           op(0x0e)), #ds_store_2addr_b32 in GFX11
   ("ds_write2st64_b32",       op(0x0f)), #ds_store_2addr_stride64_b32 in GFX11
   ("ds_cmpst_b32",            op(0x10)), #ds_cmpstore_b32 in GFX11
   ("ds_cmpst_f32",            op(0x11, gfx12=-1)), #ds_cmpstore_f32 in GFX11
   ("ds_min_f32",              op(0x12)), #ds_min_num_f32 in GFX12
   ("ds_max_f32",              op(0x13)), #ds_max_num_f32 in GFX12
   ("ds_nop",                  op(gfx7=0x14)),
   ("ds_add_f32",              op(gfx8=0x15)),
   ("ds_write_addtid_b32",     op(gfx8=0x1d, gfx10=0xb0)), #ds_store_addtid_b32 in GFX11
   ("ds_write_b8",             op(0x1e)), #ds_store_b8 in GFX11
   ("ds_write_b16",            op(0x1f)), #ds_store_b16 in GFX11
   ("ds_add_rtn_u32",          op(0x20)),
   ("ds_sub_rtn_u32",          op(0x21)),
   ("ds_rsub_rtn_u32",         op(0x22)),
   ("ds_inc_rtn_u32",          op(0x23)),
   ("ds_dec_rtn_u32",          op(0x24)),
   ("ds_min_rtn_i32",          op(0x25)),
   ("ds_max_rtn_i32",          op(0x26)),
   ("ds_min_rtn_u32",          op(0x27)),
   ("ds_max_rtn_u32",          op(0x28)),
   ("ds_and_rtn_b32",          op(0x29)),
   ("ds_or_rtn_b32",           op(0x2a)),
   ("ds_xor_rtn_b32",          op(0x2b)),
   ("ds_mskor_rtn_b32",        op(0x2c)),
   ("ds_wrxchg_rtn_b32",       op(0x2d)), #ds_storexchg_rtn_b32 in GFX11
   ("ds_wrxchg2_rtn_b32",      op(0x2e)), #ds_storexchg_2addr_rtn_b32 in GFX11
   ("ds_wrxchg2st64_rtn_b32",  op(0x2f)), #ds_storexchg_2addr_stride64_rtn_b32 in GFX11
   ("ds_cmpst_rtn_b32",        op(0x30)), #ds_cmpstore_rtn_b32 in GFX11
   ("ds_cmpst_rtn_f32",        op(0x31, gfx12=-1)), #ds_cmpstore_rtn_f32 in GFX11
   ("ds_min_rtn_f32",          op(0x32)), #ds_min_num_rtn_f32 in GFX12
   ("ds_max_rtn_f32",          op(0x33)), #ds_max_num_rtn_f32 in GFX12
   ("ds_wrap_rtn_b32",         op(gfx7=0x34, gfx12=-1)),
   ("ds_add_rtn_f32",          op(gfx8=0x35, gfx10=0x55, gfx11=0x79)),
   ("ds_read_b32",             op(0x36)), #ds_load_b32 in GFX11
   ("ds_read2_b32",            op(0x37)), #ds_load_2addr_b32 in GFX11
   ("ds_read2st64_b32",        op(0x38)), #ds_load_2addr_stride64_b32 in GFX11
   ("ds_read_i8",              op(0x39)), #ds_load_i8 in GFX11
   ("ds_read_u8",              op(0x3a)), #ds_load_u8 in GFX11
   ("ds_read_i16",             op(0x3b)), #ds_load_i16 in GFX11
   ("ds_read_u16",             op(0x3c)), #ds_load_u16 in GFX11
   ("ds_swizzle_b32",          op(0x35, gfx8=0x3d, gfx10=0x35)), #data1 & offset, no addr/data2
   ("ds_permute_b32",          op(gfx8=0x3e, gfx10=0xb2)),
   ("ds_bpermute_b32",         op(gfx8=0x3f, gfx10=0xb3)),
   ("ds_add_u64",              op(0x40)),
   ("ds_sub_u64",              op(0x41)),
   ("ds_rsub_u64",             op(0x42)),
   ("ds_inc_u64",              op(0x43)),
   ("ds_dec_u64",              op(0x44)),
   ("ds_min_i64",              op(0x45)),
   ("ds_max_i64",              op(0x46)),
   ("ds_min_u64",              op(0x47)),
   ("ds_max_u64",              op(0x48)),
   ("ds_and_b64",              op(0x49)),
   ("ds_or_b64",               op(0x4a)),
   ("ds_xor_b64",              op(0x4b)),
   ("ds_mskor_b64",            op(0x4c)),
   ("ds_write_b64",            op(0x4d)), #ds_store_b64 in GFX11
   ("ds_write2_b64",           op(0x4e)), #ds_store_2addr_b64 in GFX11
   ("ds_write2st64_b64",       op(0x4f)), #ds_store_2addr_stride64_b64 in GFX11
   ("ds_cmpst_b64",            op(0x50)), #ds_cmpstore_b64 in GFX11
   ("ds_cmpst_f64",            op(0x51, gfx12=-1)), #ds_cmpstore_f64 in GFX11
   ("ds_min_f64",              op(0x52)), #ds_min_num_f64 in GFX12
   ("ds_max_f64",              op(0x53)), #ds_max_num_f64 in GFX12
   ("ds_write_b8_d16_hi",      op(gfx9=0x54, gfx10=0xa0)), #ds_store_b8_d16_hi in GFX11
   ("ds_write_b16_d16_hi",     op(gfx9=0x55, gfx10=0xa1)), #ds_store_b16_d16_hi in GFX11
   ("ds_read_u8_d16",          op(gfx9=0x56, gfx10=0xa2)), #ds_load_u8_d16 in GFX11
   ("ds_read_u8_d16_hi",       op(gfx9=0x57, gfx10=0xa3)), #ds_load_u8_d16_hi in GFX11
   ("ds_read_i8_d16",          op(gfx9=0x58, gfx10=0xa4)), #ds_load_i8_d16 in GFX11
   ("ds_read_i8_d16_hi",       op(gfx9=0x59, gfx10=0xa5)), #ds_load_i8_d16_hi in GFX11
   ("ds_read_u16_d16",         op(gfx9=0x5a, gfx10=0xa6)), #ds_load_u16_d16 in GFX11
   ("ds_read_u16_d16_hi",      op(gfx9=0x5b, gfx10=0xa7)), #ds_load_u16_d16_hi in GFX11
   ("ds_add_rtn_u64",          op(0x60)),
   ("ds_sub_rtn_u64",          op(0x61)),
   ("ds_rsub_rtn_u64",         op(0x62)),
   ("ds_inc_rtn_u64",          op(0x63)),
   ("ds_dec_rtn_u64",          op(0x64)),
   ("ds_min_rtn_i64",          op(0x65)),
   ("ds_max_rtn_i64",          op(0x66)),
   ("ds_min_rtn_u64",          op(0x67)),
   ("ds_max_rtn_u64",          op(0x68)),
   ("ds_and_rtn_b64",          op(0x69)),
   ("ds_or_rtn_b64",           op(0x6a)),
   ("ds_xor_rtn_b64",          op(0x6b)),
   ("ds_mskor_rtn_b64",        op(0x6c)),
   ("ds_wrxchg_rtn_b64",       op(0x6d)), #ds_storexchg_rtn_b64 in GFX11
   ("ds_wrxchg2_rtn_b64",      op(0x6e)), #ds_storexchg_2addr_rtn_b64 in GFX11
   ("ds_wrxchg2st64_rtn_b64",  op(0x6f)), #ds_storexchg_2addr_stride64_rtn_b64 in GFX11
   ("ds_cmpst_rtn_b64",        op(0x70)), #ds_cmpstore_rtn_b64 in GFX11
   ("ds_cmpst_rtn_f64",        op(0x71, gfx12=-1)), #ds_cmpstore_rtn_f64 in GFX11
   ("ds_min_rtn_f64",          op(0x72)), #ds_min_num_rtn_f64 in GFX12
   ("ds_max_rtn_f64",          op(0x73)), #ds_max_num_rtn_f64 in GFX12
   ("ds_read_b64",             op(0x76)), #ds_load_b64 in GFX11
   ("ds_read2_b64",            op(0x77)), #ds_load_2addr_b64 in GFX11
   ("ds_read2st64_b64",        op(0x78)), #ds_load_2addr_stride64_b64 in GFX11
   ("ds_condxchg32_rtn_b64",   op(gfx7=0x7e)),
   ("ds_add_src2_u32",         op(0x80, gfx11=-1)),
   ("ds_sub_src2_u32",         op(0x81, gfx11=-1)),
   ("ds_rsub_src2_u32",        op(0x82, gfx11=-1)),
   ("ds_inc_src2_u32",         op(0x83, gfx11=-1)),
   ("ds_dec_src2_u32",         op(0x84, gfx11=-1)),
   ("ds_min_src2_i32",         op(0x85, gfx11=-1)),
   ("ds_max_src2_i32",         op(0x86, gfx11=-1)),
   ("ds_min_src2_u32",         op(0x87, gfx11=-1)),
   ("ds_max_src2_u32",         op(0x88, gfx11=-1)),
   ("ds_and_src2_b32",         op(0x89, gfx11=-1)),
   ("ds_or_src2_b32",          op(0x8a, gfx11=-1)),
   ("ds_xor_src2_b32",         op(0x8b, gfx11=-1)),
   ("ds_write_src2_b32",       op(0x8d, gfx11=-1)),
   ("ds_min_src2_f32",         op(0x92, gfx11=-1)),
   ("ds_max_src2_f32",         op(0x93, gfx11=-1)),
   ("ds_add_src2_f32",         op(gfx8=0x95, gfx11=-1)),
   ("ds_gws_sema_release_all", op(gfx7=0x18, gfx8=0x98, gfx10=0x18, gfx12=-1)),
   ("ds_gws_init",             op(0x19, gfx8=0x99, gfx10=0x19, gfx12=-1)),
   ("ds_gws_sema_v",           op(0x1a, gfx8=0x9a, gfx10=0x1a, gfx12=-1)),
   ("ds_gws_sema_br",          op(0x1b, gfx8=0x9b, gfx10=0x1b, gfx12=-1)),
   ("ds_gws_sema_p",           op(0x1c, gfx8=0x9c, gfx10=0x1c, gfx12=-1)),
   ("ds_gws_barrier",          op(0x1d, gfx8=0x9d, gfx10=0x1d, gfx12=-1)),
   ("ds_read_addtid_b32",      op(gfx8=0xb6, gfx10=0xb1)), #ds_load_addtid_b32 in GFX11
   ("ds_consume",              op(0x3d, gfx8=0xbd, gfx10=0x3d)),
   ("ds_append",               op(0x3e, gfx8=0xbe, gfx10=0x3e)),
   ("ds_ordered_count",        op(0x3f, gfx8=0xbf, gfx10=0x3f, gfx12=-1)),
   ("ds_add_src2_u64",         op(0xc0, gfx11=-1)),
   ("ds_sub_src2_u64",         op(0xc1, gfx11=-1)),
   ("ds_rsub_src2_u64",        op(0xc2, gfx11=-1)),
   ("ds_inc_src2_u64",         op(0xc3, gfx11=-1)),
   ("ds_dec_src2_u64",         op(0xc4, gfx11=-1)),
   ("ds_min_src2_i64",         op(0xc5, gfx11=-1)),
   ("ds_max_src2_i64",         op(0xc6, gfx11=-1)),
   ("ds_min_src2_u64",         op(0xc7, gfx11=-1)),
   ("ds_max_src2_u64",         op(0xc8, gfx11=-1)),
   ("ds_and_src2_b64",         op(0xc9, gfx11=-1)),
   ("ds_or_src2_b64",          op(0xca, gfx11=-1)),
   ("ds_xor_src2_b64",         op(0xcb, gfx11=-1)),
   ("ds_write_src2_b64",       op(0xcd, gfx11=-1)),
   ("ds_min_src2_f64",         op(0xd2, gfx11=-1)),
   ("ds_max_src2_f64",         op(0xd3, gfx11=-1)),
   ("ds_write_b96",            op(gfx7=0xde)), #ds_store_b96 in GFX11
   ("ds_write_b128",           op(gfx7=0xdf)), #ds_store_b128 in GFX11
   ("ds_condxchg32_rtn_b128",  op(gfx7=0xfd, gfx9=-1)),
   ("ds_read_b96",             op(gfx7=0xfe)), #ds_load_b96 in GFX11
   ("ds_read_b128",            op(gfx7=0xff)), #ds_load_b128 in GFX11
   ("ds_add_gs_reg_rtn",       op(gfx11=0x7a, gfx12=-1)),
   ("ds_sub_gs_reg_rtn",       op(gfx11=0x7b, gfx12=-1)),
   ("ds_cond_sub_u32",         op(gfx12=0x98)),
   ("ds_sub_clamp_u32",        op(gfx12=0x99)),
   ("ds_cond_sub_rtn",         op(gfx12=0xa8)),
   ("ds_sub_clamp_rtn_u32",    op(gfx12=0xa9)),
   ("ds_pk_add_f16",           op(gfx12=0x9a)),
   ("ds_pk_add_rtn_f16",       op(gfx12=0xaa)),
   ("ds_pk_add_bf16",          op(gfx12=0x9b)),
   ("ds_pk_add_rtn_bf16",      op(gfx12=0xab)),
}
# Register every DS table entry with Format.DS and class InstrClass.DS.
for name, num in DS:
   insn(name, num, Format.DS, InstrClass.DS)
1630
1631
# LDSDIR instructions:
# Each entry is a (mnemonic, op(...)) pair; both supply only a gfx11 opcode.
LDSDIR = {
   ("lds_param_load",  op(gfx11=0x00)), #called ds_param_load in GFX12?
   ("lds_direct_load", op(gfx11=0x01)), #called ds_direct_load in GFX12?
}
# Register every LDSDIR table entry with Format.LDSDIR and class InstrClass.DS.
for name, num in LDSDIR:
   insn(name, num, Format.LDSDIR, InstrClass.DS)
1639
# MUBUF instructions:
# Each entry is a (mnemonic, op(...)) pair. The registration loop after this
# table derives is_atomic from the substring "atomic" in the mnemonic, so
# renaming an entry can change that flag.
# NOTE(review): the positional op() argument is presumably the opcode for the
# oldest generation and -1 presumably marks the instruction as unavailable from
# that generation on - confirm against op().
MUBUF = {
   ("buffer_load_format_x",         op(0x00)),
   ("buffer_load_format_xy",        op(0x01)),
   ("buffer_load_format_xyz",       op(0x02)),
   ("buffer_load_format_xyzw",      op(0x03)),
   ("buffer_store_format_x",        op(0x04)),
   ("buffer_store_format_xy",       op(0x05)),
   ("buffer_store_format_xyz",      op(0x06)),
   ("buffer_store_format_xyzw",     op(0x07)),
   ("buffer_load_format_d16_x",     op(gfx8=0x08, gfx10=0x80, gfx11=0x08)),
   ("buffer_load_format_d16_xy",    op(gfx8=0x09, gfx10=0x81, gfx11=0x09)),
   ("buffer_load_format_d16_xyz",   op(gfx8=0x0a, gfx10=0x82, gfx11=0x0a)),
   ("buffer_load_format_d16_xyzw",  op(gfx8=0x0b, gfx10=0x83, gfx11=0x0b)),
   ("buffer_store_format_d16_x",    op(gfx8=0x0c, gfx10=0x84, gfx11=0x0c)),
   ("buffer_store_format_d16_xy",   op(gfx8=0x0d, gfx10=0x85, gfx11=0x0d)),
   ("buffer_store_format_d16_xyz",  op(gfx8=0x0e, gfx10=0x86, gfx11=0x0e)),
   ("buffer_store_format_d16_xyzw", op(gfx8=0x0f, gfx10=0x87, gfx11=0x0f)),
   ("buffer_load_ubyte",            op(0x08, gfx8=0x10, gfx10=0x08, gfx11=0x10)),
   ("buffer_load_sbyte",            op(0x09, gfx8=0x11, gfx10=0x09, gfx11=0x11)),
   ("buffer_load_ushort",           op(0x0a, gfx8=0x12, gfx10=0x0a, gfx11=0x12)),
   ("buffer_load_sshort",           op(0x0b, gfx8=0x13, gfx10=0x0b, gfx11=0x13)),
   ("buffer_load_dword",            op(0x0c, gfx8=0x14, gfx10=0x0c, gfx11=0x14)),
   ("buffer_load_dwordx2",          op(0x0d, gfx8=0x15, gfx10=0x0d, gfx11=0x15)),
   ("buffer_load_dwordx3",          op(gfx7=0x0f, gfx8=0x16, gfx10=0x0f, gfx11=0x16)),
   ("buffer_load_dwordx4",          op(0x0e, gfx8=0x17, gfx10=0x0e, gfx11=0x17)),
   ("buffer_store_byte",            op(0x18)),
   ("buffer_store_byte_d16_hi",     op(gfx9=0x19, gfx11=0x24)),
   ("buffer_store_short",           op(0x1a, gfx11=0x19)),
   ("buffer_store_short_d16_hi",    op(gfx9=0x1b, gfx11=0x25)),
   ("buffer_store_dword",           op(0x1c, gfx11=0x1a)),
   ("buffer_store_dwordx2",         op(0x1d, gfx11=0x1b)),
   ("buffer_store_dwordx3",         op(gfx7=0x1f, gfx8=0x1e, gfx10=0x1f, gfx11=0x1c)),
   ("buffer_store_dwordx4",         op(0x1e, gfx8=0x1f, gfx10=0x1e, gfx11=0x1d)),
   ("buffer_load_ubyte_d16",        op(gfx9=0x20, gfx11=0x1e)),
   ("buffer_load_ubyte_d16_hi",     op(gfx9=0x21)),
   ("buffer_load_sbyte_d16",        op(gfx9=0x22, gfx11=0x1f)),
   ("buffer_load_sbyte_d16_hi",     op(gfx9=0x23, gfx11=0x22)),
   ("buffer_load_short_d16",        op(gfx9=0x24, gfx11=0x20)),
   ("buffer_load_short_d16_hi",     op(gfx9=0x25, gfx11=0x23)),
   ("buffer_load_format_d16_hi_x",  op(gfx9=0x26)),
   ("buffer_store_format_d16_hi_x", op(gfx9=0x27)),
   ("buffer_store_lds_dword",       op(gfx8=0x3d, gfx10=-1)),
   ("buffer_wbinvl1",               op(0x71, gfx8=0x3e, gfx10=-1)),
   ("buffer_wbinvl1_vol",           op(0x70, gfx8=0x3f, gfx10=-1)),
   ("buffer_atomic_swap",           op(0x30, gfx8=0x40, gfx10=0x30, gfx11=0x33)),
   ("buffer_atomic_cmpswap",        op(0x31, gfx8=0x41, gfx10=0x31, gfx11=0x34)),
   ("buffer_atomic_add",            op(0x32, gfx8=0x42, gfx10=0x32, gfx11=0x35)),
   ("buffer_atomic_sub",            op(0x33, gfx8=0x43, gfx10=0x33, gfx11=0x36)),
   ("buffer_atomic_rsub",           op(0x34, gfx7=-1)),
   ("buffer_atomic_smin",           op(0x35, gfx8=0x44, gfx10=0x35, gfx11=0x38)),
   ("buffer_atomic_umin",           op(0x36, gfx8=0x45, gfx10=0x36, gfx11=0x39)),
   ("buffer_atomic_smax",           op(0x37, gfx8=0x46, gfx10=0x37, gfx11=0x3a)),
   ("buffer_atomic_umax",           op(0x38, gfx8=0x47, gfx10=0x38, gfx11=0x3b)),
   ("buffer_atomic_and",            op(0x39, gfx8=0x48, gfx10=0x39, gfx11=0x3c)),
   ("buffer_atomic_or",             op(0x3a, gfx8=0x49, gfx10=0x3a, gfx11=0x3d)),
   ("buffer_atomic_xor",            op(0x3b, gfx8=0x4a, gfx10=0x3b, gfx11=0x3e)),
   ("buffer_atomic_inc",            op(0x3c, gfx8=0x4b, gfx10=0x3c, gfx11=0x3f)),
   ("buffer_atomic_dec",            op(0x3d, gfx8=0x4c, gfx10=0x3d, gfx11=0x40)),
   ("buffer_atomic_fcmpswap",       op(0x3e, gfx8=-1, gfx10=0x3e, gfx11=0x50, gfx12=-1)),
   ("buffer_atomic_fmin",           op(0x3f, gfx8=-1, gfx10=0x3f, gfx11=0x51)),
   ("buffer_atomic_fmax",           op(0x40, gfx8=-1, gfx10=0x40, gfx11=0x52)),
   ("buffer_atomic_swap_x2",        op(0x50, gfx8=0x60, gfx10=0x50, gfx11=0x41)),
   ("buffer_atomic_cmpswap_x2",     op(0x51, gfx8=0x61, gfx10=0x51, gfx11=0x42)),
   ("buffer_atomic_add_x2",         op(0x52, gfx8=0x62, gfx10=0x52, gfx11=0x43)),
   ("buffer_atomic_sub_x2",         op(0x53, gfx8=0x63, gfx10=0x53, gfx11=0x44)),
   ("buffer_atomic_rsub_x2",        op(0x54, gfx7=-1)),
   ("buffer_atomic_smin_x2",        op(0x55, gfx8=0x64, gfx10=0x55, gfx11=0x45)),
   ("buffer_atomic_umin_x2",        op(0x56, gfx8=0x65, gfx10=0x56, gfx11=0x46)),
   ("buffer_atomic_smax_x2",        op(0x57, gfx8=0x66, gfx10=0x57, gfx11=0x47)),
   ("buffer_atomic_umax_x2",        op(0x58, gfx8=0x67, gfx10=0x58, gfx11=0x48)),
   ("buffer_atomic_and_x2",         op(0x59, gfx8=0x68, gfx10=0x59, gfx11=0x49)),
   ("buffer_atomic_or_x2",          op(0x5a, gfx8=0x69, gfx10=0x5a, gfx11=0x4a)),
   ("buffer_atomic_xor_x2",         op(0x5b, gfx8=0x6a, gfx10=0x5b, gfx11=0x4b)),
   ("buffer_atomic_inc_x2",         op(0x5c, gfx8=0x6b, gfx10=0x5c, gfx11=0x4c)),
   ("buffer_atomic_dec_x2",         op(0x5d, gfx8=0x6c, gfx10=0x5d, gfx11=0x4d)),
   ("buffer_atomic_fcmpswap_x2",    op(0x5e, gfx8=-1, gfx10=0x5e, gfx11=-1)),
   ("buffer_atomic_fmin_x2",        op(0x5f, gfx8=-1, gfx10=0x5f, gfx11=-1)),
   ("buffer_atomic_fmax_x2",        op(0x60, gfx8=-1, gfx10=0x60, gfx11=-1)),
   ("buffer_gl0_inv",               op(gfx10=0x71, gfx11=0x2b, gfx12=-1)),
   ("buffer_gl1_inv",               op(gfx10=0x72, gfx11=0x2c, gfx12=-1)),
   ("buffer_atomic_csub",           op(gfx10=0x34, gfx11=0x37)), #GFX10.3+. seems glc must be set. buffer_atomic_csub_u32 in GFX11
   ("buffer_load_lds_b32",          op(gfx11=0x31, gfx12=-1)),
   ("buffer_load_lds_format_x",     op(gfx11=0x32, gfx12=-1)),
   ("buffer_load_lds_i8",           op(gfx11=0x2e, gfx12=-1)),
   ("buffer_load_lds_i16",          op(gfx11=0x30, gfx12=-1)),
   ("buffer_load_lds_u8",           op(gfx11=0x2d, gfx12=-1)),
   ("buffer_load_lds_u16",          op(gfx11=0x2f, gfx12=-1)),
   ("buffer_atomic_add_f32",        op(gfx11=0x56)),
   ("buffer_atomic_pk_add_f16",     op(gfx12=0x59)),
   ("buffer_atomic_pk_add_bf16",    op(gfx12=0x5a)),
}
# Register every MUBUF table entry with Format.MUBUF and class InstrClass.VMem;
# an entry is flagged atomic when its mnemonic contains "atomic".
for name, num in MUBUF:
   insn(name, num, Format.MUBUF, InstrClass.VMem, is_atomic = "atomic" in name)
1734
# MTBUF instructions:
# Each entry is a (mnemonic, op(...)) pair. The d16 variants supply an opcode
# only from gfx8 on.
MTBUF = {
   ("tbuffer_load_format_x",         op(0x00)),
   ("tbuffer_load_format_xy",        op(0x01)),
   ("tbuffer_load_format_xyz",       op(0x02)),
   ("tbuffer_load_format_xyzw",      op(0x03)),
   ("tbuffer_store_format_x",        op(0x04)),
   ("tbuffer_store_format_xy",       op(0x05)),
   ("tbuffer_store_format_xyz",      op(0x06)),
   ("tbuffer_store_format_xyzw",     op(0x07)),
   ("tbuffer_load_format_d16_x",     op(gfx8=0x08)),
   ("tbuffer_load_format_d16_xy",    op(gfx8=0x09)),
   ("tbuffer_load_format_d16_xyz",   op(gfx8=0x0a)),
   ("tbuffer_load_format_d16_xyzw",  op(gfx8=0x0b)),
   ("tbuffer_store_format_d16_x",    op(gfx8=0x0c)),
   ("tbuffer_store_format_d16_xy",   op(gfx8=0x0d)),
   ("tbuffer_store_format_d16_xyz",  op(gfx8=0x0e)),
   ("tbuffer_store_format_d16_xyzw", op(gfx8=0x0f)),
}
# Register every MTBUF table entry with Format.MTBUF and class InstrClass.VMem.
for name, num in MTBUF:
   insn(name, num, Format.MTBUF, InstrClass.VMem)
1755
1756
# MIMG instructions:
# Each entry is a (mnemonic, op(...)) pair. The registration loop after this
# table derives is_atomic from the substring "atomic" in the mnemonic, so
# renaming an entry can change that flag.
# NOTE(review): the positional op() argument is presumably the opcode for the
# oldest generation and -1 presumably marks the instruction as unavailable from
# that generation on - confirm against op().
MIMG = {
   ("image_load",                op(0x00)),
   ("image_load_mip",            op(0x01)),
   ("image_load_pck",            op(0x02)),
   ("image_load_pck_sgn",        op(0x03)),
   ("image_load_mip_pck",        op(0x04)),
   ("image_load_mip_pck_sgn",    op(0x05)),
   ("image_store",               op(0x08, gfx11=0x06)),
   ("image_store_mip",           op(0x09, gfx11=0x07)),
   ("image_store_pck",           op(0x0a, gfx11=0x08)),
   ("image_store_mip_pck",       op(0x0b, gfx11=0x09)),
   ("image_get_resinfo",         op(0x0e, gfx11=0x17)),
   ("image_get_lod",             op(0x60, gfx11=0x38)),
   ("image_msaa_load",           op(gfx10=0x80, gfx11=0x18)), #GFX10.3+
   ("image_atomic_swap",         op(0x0f, gfx8=0x10, gfx10=0x0f, gfx11=0x0a)),
   ("image_atomic_cmpswap",      op(0x10, gfx8=0x11, gfx10=0x10, gfx11=0x0b)),
   ("image_atomic_add",          op(0x11, gfx8=0x12, gfx10=0x11, gfx11=0x0c)),
   ("image_atomic_sub",          op(0x12, gfx8=0x13, gfx10=0x12, gfx11=0x0d)),
   ("image_atomic_rsub",         op(0x13, gfx7=-1)),
   ("image_atomic_smin",         op(0x14, gfx11=0x0e)),
   ("image_atomic_umin",         op(0x15, gfx11=0x0f)),
   ("image_atomic_smax",         op(0x16, gfx11=0x10)),
   ("image_atomic_umax",         op(0x17, gfx11=0x11)),
   ("image_atomic_and",          op(0x18, gfx11=0x12)),
   ("image_atomic_or",           op(0x19, gfx11=0x13)),
   ("image_atomic_xor",          op(0x1a, gfx11=0x14)),
   ("image_atomic_inc",          op(0x1b, gfx11=0x15)),
   ("image_atomic_dec",          op(0x1c, gfx11=0x16)),
   ("image_atomic_fcmpswap",     op(0x1d, gfx8=-1, gfx10=0x1d, gfx11=-1)),
   ("image_atomic_fmin",         op(0x1e, gfx8=-1, gfx10=0x1e, gfx11=-1, gfx12=0x84)), #image_atomic_min_num_flt in GFX12
   ("image_atomic_fmax",         op(0x1f, gfx8=-1, gfx10=0x1f, gfx11=-1, gfx12=0x85)), #image_atomic_max_num_flt in GFX12
   ("image_atomic_pk_add_f16",   op(gfx12=0x86)),
   ("image_atomic_pk_add_bf16",  op(gfx12=0x87)),
   ("image_atomic_add_flt",      op(gfx12=0x83)),
   ("image_sample",              op(0x20, gfx11=0x1b)),
   ("image_sample_cl",           op(0x21, gfx11=0x40)),
   ("image_sample_d",            op(0x22, gfx11=0x1c)),
   ("image_sample_d_cl",         op(0x23, gfx11=0x41)),
   ("image_sample_l",            op(0x24, gfx11=0x1d)),
   ("image_sample_b",            op(0x25, gfx11=0x1e)),
   ("image_sample_b_cl",         op(0x26, gfx11=0x42)),
   ("image_sample_lz",           op(0x27, gfx11=0x1f)),
   ("image_sample_c",            op(0x28, gfx11=0x20)),
   ("image_sample_c_cl",         op(0x29, gfx11=0x43)),
   ("image_sample_c_d",          op(0x2a, gfx11=0x21)),
   ("image_sample_c_d_cl",       op(0x2b, gfx11=0x44)),
   ("image_sample_c_l",          op(0x2c, gfx11=0x22)),
   ("image_sample_c_b",          op(0x2d, gfx11=0x23)),
   ("image_sample_c_b_cl",       op(0x2e, gfx11=0x45)),
   ("image_sample_c_lz",         op(0x2f, gfx11=0x24)),
   ("image_sample_o",            op(0x30, gfx11=0x25)),
   ("image_sample_cl_o",         op(0x31, gfx11=0x46)),
   ("image_sample_d_o",          op(0x32, gfx11=0x26)),
   ("image_sample_d_cl_o",       op(0x33, gfx11=0x47)),
   ("image_sample_l_o",          op(0x34, gfx11=0x27)),
   ("image_sample_b_o",          op(0x35, gfx11=0x28)),
   ("image_sample_b_cl_o",       op(0x36, gfx11=0x48)),
   ("image_sample_lz_o",         op(0x37, gfx11=0x29)),
   ("image_sample_c_o",          op(0x38, gfx11=0x2a)),
   ("image_sample_c_cl_o",       op(0x39, gfx11=0x49)),
   ("image_sample_c_d_o",        op(0x3a, gfx11=0x2b)),
   ("image_sample_c_d_cl_o",     op(0x3b, gfx11=0x4a)),
   ("image_sample_c_l_o",        op(0x3c, gfx11=0x2c)),
   ("image_sample_c_b_o",        op(0x3d, gfx11=0x2d)),
   ("image_sample_c_b_cl_o",     op(0x3e, gfx11=0x4b)),
   ("image_sample_c_lz_o",       op(0x3f, gfx11=0x2e)),
   ("image_sample_cd",           op(0x68, gfx11=-1)),
   ("image_sample_cd_cl",        op(0x69, gfx11=-1)),
   ("image_sample_c_cd",         op(0x6a, gfx11=-1)),
   ("image_sample_c_cd_cl",      op(0x6b, gfx11=-1)),
   ("image_sample_cd_o",         op(0x6c, gfx11=-1)),
   ("image_sample_cd_cl_o",      op(0x6d, gfx11=-1)),
   ("image_sample_c_cd_o",       op(0x6e, gfx11=-1)),
   ("image_sample_c_cd_cl_o",    op(0x6f, gfx11=-1)),
   ("image_sample_d_g16",        op(gfx10=0xa2, gfx11=0x39)),
   ("image_sample_d_cl_g16",     op(gfx10=0xa3, gfx11=0x5f)),
   ("image_sample_c_d_g16",      op(gfx10=0xaa, gfx11=0x3a)),
   ("image_sample_c_d_cl_g16",   op(gfx10=0xab, gfx11=0x54)),
   ("image_sample_d_o_g16",      op(gfx10=0xb2, gfx11=0x3b)),
   ("image_sample_d_cl_o_g16",   op(gfx10=0xb3, gfx11=0x55)),
   ("image_sample_c_d_o_g16",    op(gfx10=0xba, gfx11=0x3c)),
   ("image_sample_c_d_cl_o_g16", op(gfx10=0xbb, gfx11=0x56)),
   #("image_gather4h",            op(gfx9=0x42, gfx10=0x61, gfx11=0x90)), VEGA only?
   #("image_gather4h_pck",        op(gfx9=0x4a, gfx10=-1)), VEGA only?
   #("image_gather8h_pck",        op(gfx9=0x4b, gfx10=-1)), VEGA only?
   ("image_gather4",             op(0x40, gfx11=0x2f)),
   ("image_gather4_cl",          op(0x41, gfx11=0x60)),
   ("image_gather4_l",           op(0x44, gfx11=0x30)), # from here through image_gather4_c_cl: opcodes differ from the ISA sheet
   ("image_gather4_b",           op(0x45, gfx11=0x31)),
   ("image_gather4_b_cl",        op(0x46, gfx11=0x61)),
   ("image_gather4_lz",          op(0x47, gfx11=0x32)),
   ("image_gather4_c",           op(0x48, gfx11=0x33)),
   ("image_gather4_c_cl",        op(0x49, gfx11=0x62)), # end of the range whose opcodes differ from the ISA sheet
   ("image_gather4_c_l",         op(0x4c, gfx11=0x63)),
   ("image_gather4_c_b",         op(0x4d, gfx11=0x64)),
   ("image_gather4_c_b_cl",      op(0x4e, gfx11=0x65)),
   ("image_gather4_c_lz",        op(0x4f, gfx11=0x34)),
   ("image_gather4_o",           op(0x50, gfx11=0x35)),
   ("image_gather4_cl_o",        op(0x51, gfx11=-1)),
   ("image_gather4_l_o",         op(0x54, gfx11=-1)),
   ("image_gather4_b_o",         op(0x55, gfx11=-1)),
   ("image_gather4_b_cl_o",      op(0x56, gfx11=-1)),
   ("image_gather4_lz_o",        op(0x57, gfx11=0x36)),
   ("image_gather4_c_o",         op(0x58, gfx11=-1)),
   ("image_gather4_c_cl_o",      op(0x59, gfx11=-1)),
   ("image_gather4_c_l_o",       op(0x5c, gfx11=-1)),
   ("image_gather4_c_b_o",       op(0x5d, gfx11=-1)),
   ("image_gather4_c_b_cl_o",    op(0x5e, gfx11=-1)),
   ("image_gather4_c_lz_o",      op(0x5f, gfx11=0x37)),
   ("image_bvh_intersect_ray",   op(gfx10=0xe6, gfx11=0x19)),
   ("image_bvh64_intersect_ray", op(gfx10=0xe7, gfx11=0x1a)),
}
# Register every MIMG table entry as a VMem-class instruction. An entry is
# marked atomic purely by mnemonic: any name containing "atomic" qualifies.
for mnemonic, encoding in MIMG:
   insn(mnemonic, encoding, Format.MIMG, InstrClass.VMem, is_atomic = "atomic" in mnemonic)
1871
# FLAT-format instructions, stored as (mnemonic, op(...)) pairs.
# The op() arguments carry the opcode number per GPU generation; presumably the
# positional value is the base encoding and the gfxN= keywords override it from
# that generation onwards (see op() for the exact rules). An opcode of -1 is
# skipped by the duplicate-opcode check at the end of this file.
FLAT = {
   ("flat_load_ubyte",          op(0x08, gfx8=0x10, gfx10=0x08, gfx11=0x10)),
   ("flat_load_sbyte",          op(0x09, gfx8=0x11, gfx10=0x09, gfx11=0x11)),
   ("flat_load_ushort",         op(0x0a, gfx8=0x12, gfx10=0x0a, gfx11=0x12)),
   ("flat_load_sshort",         op(0x0b, gfx8=0x13, gfx10=0x0b, gfx11=0x13)),
   ("flat_load_dword",          op(0x0c, gfx8=0x14, gfx10=0x0c, gfx11=0x14)),
   ("flat_load_dwordx2",        op(0x0d, gfx8=0x15, gfx10=0x0d, gfx11=0x15)),
   ("flat_load_dwordx3",        op(0x0f, gfx8=0x16, gfx10=0x0f, gfx11=0x16)),
   ("flat_load_dwordx4",        op(0x0e, gfx8=0x17, gfx10=0x0e, gfx11=0x17)),
   ("flat_store_byte",          op(0x18)),
   ("flat_store_byte_d16_hi",   op(gfx8=0x19, gfx11=0x24)),
   ("flat_store_short",         op(0x1a, gfx11=0x19)),
   ("flat_store_short_d16_hi",  op(gfx8=0x1b, gfx11=0x25)),
   ("flat_store_dword",         op(0x1c, gfx11=0x1a)),
   ("flat_store_dwordx2",       op(0x1d, gfx11=0x1b)),
   ("flat_store_dwordx3",       op(0x1f, gfx8=0x1e, gfx10=0x1f, gfx11=0x1c)),
   ("flat_store_dwordx4",       op(0x1e, gfx8=0x1f, gfx10=0x1e, gfx11=0x1d)),
   ("flat_load_ubyte_d16",      op(gfx8=0x20, gfx11=0x1e)),
   ("flat_load_ubyte_d16_hi",   op(gfx8=0x21)),
   ("flat_load_sbyte_d16",      op(gfx8=0x22, gfx11=0x1f)),
   ("flat_load_sbyte_d16_hi",   op(gfx8=0x23, gfx11=0x22)),
   ("flat_load_short_d16",      op(gfx8=0x24, gfx11=0x20)),
   ("flat_load_short_d16_hi",   op(gfx8=0x25, gfx11=0x23)),
   ("flat_atomic_swap",         op(0x30, gfx8=0x40, gfx10=0x30, gfx11=0x33)),
   ("flat_atomic_cmpswap",      op(0x31, gfx8=0x41, gfx10=0x31, gfx11=0x34)),
   ("flat_atomic_add",          op(0x32, gfx8=0x42, gfx10=0x32, gfx11=0x35)),
   ("flat_atomic_sub",          op(0x33, gfx8=0x43, gfx10=0x33, gfx11=0x36)),
   ("flat_atomic_smin",         op(0x35, gfx8=0x44, gfx10=0x35, gfx11=0x38)),
   ("flat_atomic_umin",         op(0x36, gfx8=0x45, gfx10=0x36, gfx11=0x39)),
   ("flat_atomic_smax",         op(0x37, gfx8=0x46, gfx10=0x37, gfx11=0x3a)),
   ("flat_atomic_umax",         op(0x38, gfx8=0x47, gfx10=0x38, gfx11=0x3b)),
   ("flat_atomic_and",          op(0x39, gfx8=0x48, gfx10=0x39, gfx11=0x3c)),
   ("flat_atomic_or",           op(0x3a, gfx8=0x49, gfx10=0x3a, gfx11=0x3d)),
   ("flat_atomic_xor",          op(0x3b, gfx8=0x4a, gfx10=0x3b, gfx11=0x3e)),
   ("flat_atomic_inc",          op(0x3c, gfx8=0x4b, gfx10=0x3c, gfx11=0x3f)),
   ("flat_atomic_dec",          op(0x3d, gfx8=0x4c, gfx10=0x3d, gfx11=0x40)),
   ("flat_atomic_fcmpswap",     op(0x3e, gfx8=-1, gfx10=0x3e, gfx11=0x50, gfx12=-1)),
   ("flat_atomic_fmin",         op(0x3f, gfx8=-1, gfx10=0x3f, gfx11=0x51)),
   ("flat_atomic_fmax",         op(0x40, gfx8=-1, gfx10=0x40, gfx11=0x52)),
   ("flat_atomic_swap_x2",      op(0x50, gfx8=0x60, gfx10=0x50, gfx11=0x41)),
   ("flat_atomic_cmpswap_x2",   op(0x51, gfx8=0x61, gfx10=0x51, gfx11=0x42)),
   ("flat_atomic_add_x2",       op(0x52, gfx8=0x62, gfx10=0x52, gfx11=0x43)),
   ("flat_atomic_sub_x2",       op(0x53, gfx8=0x63, gfx10=0x53, gfx11=0x44)),
   ("flat_atomic_smin_x2",      op(0x55, gfx8=0x64, gfx10=0x55, gfx11=0x45)),
   ("flat_atomic_umin_x2",      op(0x56, gfx8=0x65, gfx10=0x56, gfx11=0x46)),
   ("flat_atomic_smax_x2",      op(0x57, gfx8=0x66, gfx10=0x57, gfx11=0x47)),
   ("flat_atomic_umax_x2",      op(0x58, gfx8=0x67, gfx10=0x58, gfx11=0x48)),
   ("flat_atomic_and_x2",       op(0x59, gfx8=0x68, gfx10=0x59, gfx11=0x49)),
   ("flat_atomic_or_x2",        op(0x5a, gfx8=0x69, gfx10=0x5a, gfx11=0x4a)),
   ("flat_atomic_xor_x2",       op(0x5b, gfx8=0x6a, gfx10=0x5b, gfx11=0x4b)),
   ("flat_atomic_inc_x2",       op(0x5c, gfx8=0x6b, gfx10=0x5c, gfx11=0x4c)),
   ("flat_atomic_dec_x2",       op(0x5d, gfx8=0x6c, gfx10=0x5d, gfx11=0x4d)),
   ("flat_atomic_fcmpswap_x2",  op(0x5e, gfx8=-1, gfx10=0x5e, gfx11=-1)),
   ("flat_atomic_fmin_x2",      op(0x5f, gfx8=-1, gfx10=0x5f, gfx11=-1)),
   ("flat_atomic_fmax_x2",      op(0x60, gfx8=-1, gfx10=0x60, gfx11=-1)),
   ("flat_atomic_add_f32",      op(gfx11=0x56)),
   ("flat_atomic_csub_u32",     op(gfx12=0x37)),
   ("flat_atomic_cond_sub_u32", op(gfx12=0x50)),
   ("flat_atomic_pk_add_f16",   op(gfx12=0x59)),
   ("flat_atomic_pk_add_bf16",  op(gfx12=0x5a)),
}
# Register each entry as a VMem-class FLAT instruction; a mnemonic containing
# "atomic" is what marks the instruction as atomic.
for (name, num) in FLAT:
   insn(name, num, Format.FLAT, InstrClass.VMem, is_atomic = "atomic" in name) #TODO: also LDS?
1935
# GLOBAL-format instructions, stored as (mnemonic, op(...)) pairs.
# Every entry here specifies its opcodes through gfxN= keywords only;
# presumably each keyword sets the encoding from that generation onwards
# (see op() for the exact rules). An opcode of -1 is skipped by the
# duplicate-opcode check at the end of this file.
GLOBAL = {
   ("global_load_ubyte",             op(gfx8=0x10, gfx10=0x08, gfx11=0x10)),
   ("global_load_sbyte",             op(gfx8=0x11, gfx10=0x09, gfx11=0x11)),
   ("global_load_ushort",            op(gfx8=0x12, gfx10=0x0a, gfx11=0x12)),
   ("global_load_sshort",            op(gfx8=0x13, gfx10=0x0b, gfx11=0x13)),
   ("global_load_dword",             op(gfx8=0x14, gfx10=0x0c, gfx11=0x14)),
   ("global_load_dwordx2",           op(gfx8=0x15, gfx10=0x0d, gfx11=0x15)),
   ("global_load_dwordx3",           op(gfx8=0x16, gfx10=0x0f, gfx11=0x16)),
   ("global_load_dwordx4",           op(gfx8=0x17, gfx10=0x0e, gfx11=0x17)),
   ("global_store_byte",             op(gfx8=0x18)),
   ("global_store_byte_d16_hi",      op(gfx8=0x19, gfx11=0x24)),
   ("global_store_short",            op(gfx8=0x1a, gfx11=0x19)),
   ("global_store_short_d16_hi",     op(gfx8=0x1b, gfx11=0x25)),
   ("global_store_dword",            op(gfx8=0x1c, gfx11=0x1a)),
   ("global_store_dwordx2",          op(gfx8=0x1d, gfx11=0x1b)),
   ("global_store_dwordx3",          op(gfx8=0x1e, gfx10=0x1f, gfx11=0x1c)),
   ("global_store_dwordx4",          op(gfx8=0x1f, gfx10=0x1e, gfx11=0x1d)),
   ("global_load_ubyte_d16",         op(gfx8=0x20, gfx11=0x1e)),
   ("global_load_ubyte_d16_hi",      op(gfx8=0x21)),
   ("global_load_sbyte_d16",         op(gfx8=0x22, gfx11=0x1f)),
   ("global_load_sbyte_d16_hi",      op(gfx8=0x23, gfx11=0x22)),
   ("global_load_short_d16",         op(gfx8=0x24, gfx11=0x20)),
   ("global_load_short_d16_hi",      op(gfx8=0x25, gfx11=0x23)),
   ("global_atomic_swap",            op(gfx8=0x40, gfx10=0x30, gfx11=0x33)),
   ("global_atomic_cmpswap",         op(gfx8=0x41, gfx10=0x31, gfx11=0x34)),
   ("global_atomic_add",             op(gfx8=0x42, gfx10=0x32, gfx11=0x35)),
   ("global_atomic_sub",             op(gfx8=0x43, gfx10=0x33, gfx11=0x36)),
   ("global_atomic_smin",            op(gfx8=0x44, gfx10=0x35, gfx11=0x38)),
   ("global_atomic_umin",            op(gfx8=0x45, gfx10=0x36, gfx11=0x39)),
   ("global_atomic_smax",            op(gfx8=0x46, gfx10=0x37, gfx11=0x3a)),
   ("global_atomic_umax",            op(gfx8=0x47, gfx10=0x38, gfx11=0x3b)),
   ("global_atomic_and",             op(gfx8=0x48, gfx10=0x39, gfx11=0x3c)),
   ("global_atomic_or",              op(gfx8=0x49, gfx10=0x3a, gfx11=0x3d)),
   ("global_atomic_xor",             op(gfx8=0x4a, gfx10=0x3b, gfx11=0x3e)),
   ("global_atomic_inc",             op(gfx8=0x4b, gfx10=0x3c, gfx11=0x3f)),
   ("global_atomic_dec",             op(gfx8=0x4c, gfx10=0x3d, gfx11=0x40)),
   ("global_atomic_fcmpswap",        op(gfx10=0x3e, gfx11=0x50, gfx12=-1)),
   ("global_atomic_fmin",            op(gfx10=0x3f, gfx11=0x51)),
   ("global_atomic_fmax",            op(gfx10=0x40, gfx11=0x52)),
   ("global_atomic_swap_x2",         op(gfx8=0x60, gfx10=0x50, gfx11=0x41)),
   ("global_atomic_cmpswap_x2",      op(gfx8=0x61, gfx10=0x51, gfx11=0x42)),
   ("global_atomic_add_x2",          op(gfx8=0x62, gfx10=0x52, gfx11=0x43)),
   ("global_atomic_sub_x2",          op(gfx8=0x63, gfx10=0x53, gfx11=0x44)),
   ("global_atomic_smin_x2",         op(gfx8=0x64, gfx10=0x55, gfx11=0x45)),
   ("global_atomic_umin_x2",         op(gfx8=0x65, gfx10=0x56, gfx11=0x46)),
   ("global_atomic_smax_x2",         op(gfx8=0x66, gfx10=0x57, gfx11=0x47)),
   ("global_atomic_umax_x2",         op(gfx8=0x67, gfx10=0x58, gfx11=0x48)),
   ("global_atomic_and_x2",          op(gfx8=0x68, gfx10=0x59, gfx11=0x49)),
   ("global_atomic_or_x2",           op(gfx8=0x69, gfx10=0x5a, gfx11=0x4a)),
   ("global_atomic_xor_x2",          op(gfx8=0x6a, gfx10=0x5b, gfx11=0x4b)),
   ("global_atomic_inc_x2",          op(gfx8=0x6b, gfx10=0x5c, gfx11=0x4c)),
   ("global_atomic_dec_x2",          op(gfx8=0x6c, gfx10=0x5d, gfx11=0x4d)),
   ("global_atomic_fcmpswap_x2",     op(gfx10=0x5e, gfx11=-1)),
   ("global_atomic_fmin_x2",         op(gfx10=0x5f, gfx11=-1)),
   ("global_atomic_fmax_x2",         op(gfx10=0x60, gfx11=-1)),
   ("global_load_dword_addtid",      op(gfx10=0x16, gfx11=0x28)), #GFX10.3+
   ("global_store_dword_addtid",     op(gfx10=0x17, gfx11=0x29)), #GFX10.3+
   ("global_atomic_csub",            op(gfx10=0x34, gfx11=0x37)), #GFX10.3+. seems glc must be set
   ("global_atomic_add_f32",         op(gfx11=0x56)),
   ("global_atomic_cond_sub_u32",    op(gfx12=0x50)),
   ("global_load_tr_b128",           op(gfx12=0x57)),
   ("global_load_tr_b64",            op(gfx12=0x58)),
   ("global_atomic_pk_add_f16",      op(gfx12=0x59)),
   ("global_atomic_pk_add_bf16",     op(gfx12=0x5a)),
   ("global_atomic_ordered_add_b64", op(gfx12=0x73)),
   ("global_inv",                    op(gfx12=0x2b)),
   ("global_wb",                     op(gfx12=0x2c)),
   ("global_wbinv",                  op(gfx12=0x4f)),
}
# Register each entry as a VMem-class GLOBAL instruction; a mnemonic containing
# "atomic" is what marks the instruction as atomic.
for (name, num) in GLOBAL:
   insn(name, num, Format.GLOBAL, InstrClass.VMem, is_atomic = "atomic" in name)
2007
# SCRATCH-format instructions, stored as (mnemonic, op(...)) pairs.
# Only loads and stores are listed, so the registration loop below passes no
# is_atomic flag. Opcodes are given through gfxN= keywords; presumably each
# keyword sets the encoding from that generation onwards (see op()).
SCRATCH = {
   #GFX89,GFX10,GFX11
   ("scratch_load_ubyte",         op(gfx8=0x10, gfx10=0x08, gfx11=0x10)),
   ("scratch_load_sbyte",         op(gfx8=0x11, gfx10=0x09, gfx11=0x11)),
   ("scratch_load_ushort",        op(gfx8=0x12, gfx10=0x0a, gfx11=0x12)),
   ("scratch_load_sshort",        op(gfx8=0x13, gfx10=0x0b, gfx11=0x13)),
   ("scratch_load_dword",         op(gfx8=0x14, gfx10=0x0c, gfx11=0x14)),
   ("scratch_load_dwordx2",       op(gfx8=0x15, gfx10=0x0d, gfx11=0x15)),
   ("scratch_load_dwordx3",       op(gfx8=0x16, gfx10=0x0f, gfx11=0x16)),
   ("scratch_load_dwordx4",       op(gfx8=0x17, gfx10=0x0e, gfx11=0x17)),
   ("scratch_store_byte",         op(gfx8=0x18)),
   ("scratch_store_byte_d16_hi",  op(gfx8=0x19, gfx11=0x24)),
   ("scratch_store_short",        op(gfx8=0x1a, gfx11=0x19)),
   ("scratch_store_short_d16_hi", op(gfx8=0x1b, gfx11=0x25)),
   ("scratch_store_dword",        op(gfx8=0x1c, gfx11=0x1a)),
   ("scratch_store_dwordx2",      op(gfx8=0x1d, gfx11=0x1b)),
   ("scratch_store_dwordx3",      op(gfx8=0x1e, gfx10=0x1f, gfx11=0x1c)),
   ("scratch_store_dwordx4",      op(gfx8=0x1f, gfx10=0x1e, gfx11=0x1d)),
   ("scratch_load_ubyte_d16",     op(gfx8=0x20, gfx11=0x1e)),
   ("scratch_load_ubyte_d16_hi",  op(gfx8=0x21)),
   ("scratch_load_sbyte_d16",     op(gfx8=0x22, gfx11=0x1f)),
   ("scratch_load_sbyte_d16_hi",  op(gfx8=0x23, gfx11=0x22)),
   ("scratch_load_short_d16",     op(gfx8=0x24, gfx11=0x20)),
   ("scratch_load_short_d16_hi",  op(gfx8=0x25, gfx11=0x23)),
}
# Register each entry as a VMem-class SCRATCH instruction.
for (name, num) in SCRATCH:
   insn(name, num, Format.SCRATCH, InstrClass.VMem)
2035
# Sanity check: for every generation field of Opcode, no two non-pseudo
# instructions of the same format may share an opcode number. The first
# unexpected collision is reported and aborts the script with exit status 1.

# Pseudo instructions are excluded from the check.
_PSEUDO_FORMATS = (
   Format.PSEUDO,
   Format.PSEUDO_BRANCH,
   Format.PSEUDO_BARRIER,
   Format.PSEUDO_REDUCTION,
)

# Collisions that are tolerated, as (generations, pair of names).
# A generations value of None means the pair may collide on any generation.
_ALLOWED_OPCODE_COLLISIONS = (
   (('gfx8', 'gfx9', 'gfx11', 'gfx12'), frozenset(('v_mul_lo_i32', 'v_mul_lo_u32'))),
   # v_mad_legacy_f32 is replaced with v_fma_legacy_f32 on GFX10.3
   (('gfx10',), frozenset(('v_mad_legacy_f32', 'v_fma_legacy_f32'))),
   # v_mac_legacy_f32 is replaced with v_fmac_legacy_f32 on GFX10.3
   (('gfx10',), frozenset(('v_mac_legacy_f32', 'v_fmac_legacy_f32'))),
   # These are the same opcodes, but hi uses opsel
   (None, frozenset(('v_interp_p2_f16', 'v_interp_p2_hi_f16'))),
)

for gfx_ver in Opcode._fields:
   # (format, opcode number) -> name of the first instruction seen with it
   first_owner = {}
   for inst in instructions.values():
      if inst.format in _PSEUDO_FORMATS:
         continue

      number = getattr(inst.op, gfx_ver)
      if number == -1:
         continue

      slot = (inst.format, number)
      if slot not in first_owner:
         first_owner[slot] = inst.name
         continue

      # The slot is already taken: tolerate the collision only if this exact
      # pair of names is listed for the generation being checked.
      pair = {first_owner[slot], inst.name}
      if any(pair == allowed and (gens is None or gfx_ver in gens)
             for gens, allowed in _ALLOWED_OPCODE_COLLISIONS):
         continue

      print('%s and %s share the same opcode number (%s)' % (first_owner[slot], inst.name, gfx_ver))
      sys.exit(1)
2068