xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/tests/test_insert_waitcnt.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 #include "helpers.h"
7 
8 using namespace aco;
9 
10 BEGIN_TEST(insert_waitcnt.ds_ordered_count)
11    if (!setup_cs(NULL, GFX10_3))
12       return;
13 
14    Operand def0(PhysReg(256), v1);
15    Operand def1(PhysReg(257), v1);
16    Operand def2(PhysReg(258), v1);
17    Operand gds_base(PhysReg(259), v1);
18    Operand chan_counter(PhysReg(260), v1);
19    Operand m(m0, s1);
20 
21    Instruction* ds_instr;
22    //>> ds_ordered_count %0:v[0], %0:v[3], %0:m0 offset0:3072 gds storage:gds semantics:volatile
23    //! s_waitcnt lgkmcnt(0)
24    ds_instr = bld.ds(aco_opcode::ds_ordered_count, def0, gds_base, m, 3072u, 0u, true);
25    ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
26 
27    //! ds_add_rtn_u32 %0:v[1], %0:v[3], %0:v[4], %0:m0 gds storage:gds semantics:volatile,atomic,rmw
28    ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, def1, gds_base, chan_counter, m, 0u, 0u, true);
29    ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
30 
31    //! s_waitcnt lgkmcnt(0)
32    //! ds_ordered_count %0:v[2], %0:v[3], %0:m0 offset0:3840 gds storage:gds semantics:volatile
33    ds_instr = bld.ds(aco_opcode::ds_ordered_count, def2, gds_base, m, 3840u, 0u, true);
34    ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
35 
36    finish_waitcnt_test();
37 END_TEST
38 
39 BEGIN_TEST(insert_waitcnt.clause)
40    if (!setup_cs(NULL, GFX11))
41       return;
42 
43    Definition def_v4(PhysReg(260), v1);
44    Definition def_v5(PhysReg(261), v1);
45    Definition def_v6(PhysReg(262), v1);
46    Definition def_v7(PhysReg(263), v1);
47    Operand op_v0(PhysReg(256), v1);
48    Operand op_v4(PhysReg(260), v1);
49    Operand op_v5(PhysReg(261), v1);
50    Operand op_v6(PhysReg(262), v1);
51    Operand op_v7(PhysReg(263), v1);
52    Operand desc0(PhysReg(0), s4);
53 
54    //>> p_unit_test 0
55    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
56 
57    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
58    //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0
59    //! v1: %0:v[6] = buffer_load_dword %0:s[0-3], %0:v[0], 0
60    //! v1: %0:v[7] = buffer_load_dword %0:s[0-3], %0:v[0], 0
61    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
62    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v0, Operand::zero(), 0, false);
63    bld.mubuf(aco_opcode::buffer_load_dword, def_v6, desc0, op_v0, Operand::zero(), 0, false);
64    bld.mubuf(aco_opcode::buffer_load_dword, def_v7, desc0, op_v0, Operand::zero(), 0, false);
65    //! s_waitcnt vmcnt(0)
66    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[4], 0
67    //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[5], 0
68    //! v1: %0:v[6] = buffer_load_dword %0:s[0-3], %0:v[6], 0
69    //! v1: %0:v[7] = buffer_load_dword %0:s[0-3], %0:v[7], 0
70    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v4, Operand::zero(), 0, false);
71    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v5, Operand::zero(), 0, false);
72    bld.mubuf(aco_opcode::buffer_load_dword, def_v6, desc0, op_v6, Operand::zero(), 0, false);
73    bld.mubuf(aco_opcode::buffer_load_dword, def_v7, desc0, op_v7, Operand::zero(), 0, false);
74    //! s_waitcnt vmcnt(0)
75    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[4]
76    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[5]
77    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[6]
78    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[7]
79    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v4, 0, false);
80    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v5, 0, false);
81    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v6, 0, false);
82    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v7, 0, false);
83 
84    //>> p_unit_test 1
85    bld.reset(program->create_and_insert_block());
86    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
87 
88    //! s4: %0:s[4-7] = s_load_dwordx4 %0:s[0-1], 0
89    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg(4), s4), Operand(PhysReg(0), s2),
90             Operand::zero());
91    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
92    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
93    //! s_waitcnt lgkmcnt(0) vmcnt(0)
94    //! v1: %0:v[5] = buffer_load_dword %0:s[4-7], %0:v[4], 0
95    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, Operand(PhysReg(4), s4), op_v4, Operand::zero(),
96              0, false);
97 
98    //>> p_unit_test 2
99    bld.reset(program->create_and_insert_block());
100    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
101 
102    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
103    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
104    //! v_nop
105    bld.vop1(aco_opcode::v_nop);
106    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
107    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
108    //! s_waitcnt vmcnt(0)
109    //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[4], 0
110    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v4, Operand::zero(), 0, false);
111 
112    finish_waitcnt_test();
113 END_TEST
114 
115 BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.vmem)
116    if (!setup_cs(NULL, GFX10))
117       return;
118 
119    Definition def_v4(PhysReg(260), v1);
120    Operand op_v0(PhysReg(256), v1);
121    Operand desc0(PhysReg(0), s4);
122 
123    //>> BB0
124    //! /* logical preds: / linear preds: / kind: top-level, */
125    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
126    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
127 
128    //>> BB1
129    //! /* logical preds: / linear preds: / kind: */
130    //! v1: %0:v[4] = ds_read_b32 %0:v[0]
131    bld.reset(program->create_and_insert_block());
132    bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
133 
134    bld.reset(program->create_and_insert_block());
135    program->blocks[2].linear_preds.push_back(0);
136    program->blocks[2].linear_preds.push_back(1);
137    program->blocks[2].logical_preds.push_back(0);
138    program->blocks[2].logical_preds.push_back(1);
139 
140    //>> BB2
141    //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */
142    //! s_waitcnt lgkmcnt(0)
143    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
144    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
145 
146    finish_waitcnt_test();
147 END_TEST
148 
149 BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.lds)
150    if (!setup_cs(NULL, GFX10))
151       return;
152 
153    Definition def_v4(PhysReg(260), v1);
154    Operand op_v0(PhysReg(256), v1);
155    Operand desc0(PhysReg(0), s4);
156 
157    //>> BB0
158    //! /* logical preds: / linear preds: / kind: top-level, */
159    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
160    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
161 
162    //>> BB1
163    //! /* logical preds: / linear preds: / kind: */
164    //! v1: %0:v[4] = ds_read_b32 %0:v[0]
165    bld.reset(program->create_and_insert_block());
166    bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
167 
168    bld.reset(program->create_and_insert_block());
169    program->blocks[2].linear_preds.push_back(0);
170    program->blocks[2].linear_preds.push_back(1);
171    program->blocks[2].logical_preds.push_back(0);
172    program->blocks[2].logical_preds.push_back(1);
173 
174    //>> BB2
175    //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */
176    //! s_waitcnt vmcnt(0)
177    //! v1: %0:v[4] = ds_read_b32 %0:v[0]
178    bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
179 
180    finish_waitcnt_test();
181 END_TEST
182 
183 BEGIN_TEST(insert_waitcnt.waw.vmem_types)
184    for (amd_gfx_level gfx : {GFX11, GFX12}) {
185       if (!setup_cs(NULL, gfx))
186          continue;
187 
188       Definition def_v4(PhysReg(260), v1);
189       Operand op_v0(PhysReg(256), v1);
190       Operand desc_s4(PhysReg(0), s4);
191       Operand desc_s8(PhysReg(8), s8);
192 
193       //>> p_unit_test 0
194       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
195       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
196       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
197       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
198       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
199 
200       //>> p_unit_test 1
201       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
202       //~gfx11! s_waitcnt vmcnt(0)
203       //~gfx12! s_wait_loadcnt imm:0
204       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
205       bld.reset(program->create_and_insert_block());
206       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
207       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
208       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
209 
210       //>> p_unit_test 2
211       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
212       //~gfx11! s_waitcnt vmcnt(0)
213       //~gfx12! s_wait_loadcnt imm:0
214       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
215       bld.reset(program->create_and_insert_block());
216       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
217       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
218       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
219                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
220 
221       //>> p_unit_test 3
222       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
223       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
224       bld.reset(program->create_and_insert_block());
225       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
226       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
227       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
228 
229       //>> p_unit_test 4
230       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
231       //~gfx11! s_waitcnt vmcnt(0)
232       //~gfx12! s_wait_samplecnt imm:0
233       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
234       bld.reset(program->create_and_insert_block());
235       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
236       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
237       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
238 
239       //>> p_unit_test 5
240       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
241       //~gfx11! s_waitcnt vmcnt(0)
242       //~gfx12! s_wait_samplecnt imm:0
243       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
244       bld.reset(program->create_and_insert_block());
245       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
246       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
247       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
248                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
249 
250       //>> p_unit_test 6
251       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
252       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
253       bld.reset(program->create_and_insert_block());
254       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
255       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
256                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
257       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
258                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
259 
260       //>> p_unit_test 7
261       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
262       //~gfx11! s_waitcnt vmcnt(0)
263       //~gfx12! s_wait_bvhcnt imm:0
264       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
265       bld.reset(program->create_and_insert_block());
266       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
267       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
268                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
269       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
270 
271       //>> p_unit_test 8
272       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
273       //~gfx11! s_waitcnt vmcnt(0)
274       //~gfx12! s_wait_bvhcnt imm:0
275       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
276       bld.reset(program->create_and_insert_block());
277       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
278       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
279                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
280       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
281 
282       //>> BB9
283       //! /* logical preds: / linear preds: / kind: */
284       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
285       bld.reset(program->create_and_insert_block());
286       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
287 
288       //>> BB10
289       //! /* logical preds: / linear preds: / kind: */
290       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
291       bld.reset(program->create_and_insert_block());
292       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
293 
294       bld.reset(program->create_and_insert_block());
295       program->blocks[11].linear_preds.push_back(9);
296       program->blocks[11].linear_preds.push_back(10);
297       program->blocks[11].logical_preds.push_back(9);
298       program->blocks[11].logical_preds.push_back(10);
299 
300       //>> BB11
301       //! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */
302       //! p_unit_test 9
303       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
304       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
305       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
306 
307       //>> BB12
308       //! /* logical preds: / linear preds: / kind: */
309       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
310       bld.reset(program->create_and_insert_block());
311       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
312 
313       //>> BB13
314       //! /* logical preds: / linear preds: / kind: */
315       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
316       bld.reset(program->create_and_insert_block());
317       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
318 
319       bld.reset(program->create_and_insert_block());
320       program->blocks[14].linear_preds.push_back(12);
321       program->blocks[14].linear_preds.push_back(13);
322       program->blocks[14].logical_preds.push_back(12);
323       program->blocks[14].logical_preds.push_back(13);
324 
325       //>> BB14
326       //! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */
327       //! p_unit_test 10
328       //~gfx11! s_waitcnt vmcnt(0)
329       //~gfx12! s_wait_samplecnt imm:0
330       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
331       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
332       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
333 
334       finish_waitcnt_test();
335    }
336 END_TEST
337 
338 BEGIN_TEST(insert_waitcnt.vmem)
339    if (!setup_cs(NULL, GFX12))
340       return;
341 
342    Definition def_v4(PhysReg(260), v1);
343    Definition def_v5(PhysReg(261), v1);
344    Definition def_v6(PhysReg(262), v1);
345    Definition def_v7(PhysReg(263), v1);
346    Definition def_v8(PhysReg(264), v1);
347    Definition def_v9(PhysReg(265), v1);
348    Operand op_v0(PhysReg(256), v1);
349    Operand op_v4(PhysReg(260), v1);
350    Operand op_v5(PhysReg(261), v1);
351    Operand op_v6(PhysReg(262), v1);
352    Operand op_v7(PhysReg(263), v1);
353    Operand op_v8(PhysReg(264), v1);
354    Operand op_v9(PhysReg(265), v1);
355    Operand desc_s4(PhysReg(0), s4);
356    Operand desc_s8(PhysReg(8), s8);
357 
358    //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
359    //! v1: %0:v[5] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
360    //! v1: %0:v[6] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d unrm r128
361    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
362    bld.mimg(aco_opcode::image_sample, def_v5, desc_s8, desc_s4, Operand(v1), op_v0);
363    Instruction* instr =
364       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v6, desc_s4, Operand(s4), Operand(v1),
365                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)))
366          .instr;
367    instr->mimg().unrm = true;
368    instr->mimg().r128 = true;
369 
370    //! v1: %0:v[7] = image_load %0:s[8-15],  s4: undef,  v1: undef, %0:v[0] 1d
371    //! v1: %0:v[8] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
372    //! v1: %0:v[9] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d unrm r128
373    bld.mimg(aco_opcode::image_load, def_v7, desc_s8, Operand(s4), Operand(v1), op_v0, 0x1);
374    bld.mimg(aco_opcode::image_sample, def_v8, desc_s8, desc_s4, Operand(v1), op_v0);
375    instr = bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v9, desc_s4, Operand(s4),
376                     Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)))
377               .instr;
378    instr->mimg().unrm = true;
379    instr->mimg().r128 = true;
380 
381    //! s_wait_loadcnt imm:1
382    //! p_unit_test 0, %0:v[4]
383    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4);
384    //! s_wait_samplecnt imm:1
385    //! p_unit_test 1, %0:v[5]
386    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_v5);
387    //! s_wait_bvhcnt imm:1
388    //! p_unit_test 2, %0:v[6]
389    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v6);
390    //! s_wait_loadcnt imm:0
391    //! p_unit_test 3, %0:v[7]
392    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_v7);
393    //! s_wait_samplecnt imm:0
394    //! p_unit_test 4, %0:v[8]
395    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4), op_v8);
396    //! s_wait_bvhcnt imm:0
397    //! p_unit_test 5, %0:v[9]
398    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5), op_v9);
399 
400    /* Despite not using a sampler, this uses samplecnt. */
401    //! v1: %0:v[5] = image_msaa_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
402    //! s_wait_samplecnt imm:0
403    //! p_unit_test 6, %0:v[5]
404    bld.mimg(aco_opcode::image_msaa_load, def_v5, desc_s8, Operand(s4), Operand(v1), op_v0);
405    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6), op_v5);
406 
407    finish_waitcnt_test();
408 END_TEST
409 
410 BEGIN_TEST(insert_waitcnt.lds_smem)
411    for (amd_gfx_level gfx : {GFX11, GFX12}) {
412       if (!setup_cs(NULL, gfx))
413          continue;
414 
415       Definition def_v4(PhysReg(260), v1);
416       Definition def_v5(PhysReg(261), v1);
417       Definition def_s4(PhysReg(4), s1);
418       Definition def_s5(PhysReg(5), s1);
419       Operand op_s0(PhysReg(0), s1);
420       Operand op_s4(PhysReg(4), s1);
421       Operand op_s5(PhysReg(5), s1);
422       Operand op_v0(PhysReg(256), v1);
423       Operand op_v4(PhysReg(260), v1);
424       Operand op_v5(PhysReg(261), v1);
425       Operand desc_s4(PhysReg(0), s4);
426 
427       //>> v1: %0:v[4] = ds_read_b32 %0:v[0]
428       //! s1: %0:s[4] = s_buffer_load_dword %0:s[0-3], %0:s[0]
429       //! v1: %0:v[5] = ds_read_b32 %0:v[0]
430       //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0]
431       bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
432       bld.smem(aco_opcode::s_buffer_load_dword, def_s4, desc_s4, op_s0);
433       bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0);
434       bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0);
435 
436       //~gfx11! s_waitcnt lgkmcnt(1)
437       //~gfx12! s_wait_dscnt imm:1
438       //! p_unit_test 0, %0:v[4]
439       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4);
440       //~gfx11! s_waitcnt lgkmcnt(0)
441       //~gfx12! s_wait_kmcnt imm:0
442       //! p_unit_test 1, %0:s[4]
443       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s4);
444       //~gfx12! s_wait_dscnt imm:0
445       //! p_unit_test 2, %0:v[5]
446       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v5);
447       //! p_unit_test 3, %0:s[5]
448       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s5);
449 
450       finish_waitcnt_test();
451    }
452 END_TEST
453 
454 BEGIN_TEST(insert_waitcnt.sendmsg_smem)
455    for (amd_gfx_level gfx : {GFX11, GFX12}) {
456       if (!setup_cs(NULL, gfx))
457          continue;
458 
459       Definition def_s4(PhysReg(4), s1);
460       Definition def_s5(PhysReg(5), s1);
461       Definition def_s6(PhysReg(6), s1);
462       Definition def_s7(PhysReg(7), s1);
463       Operand op_s0(PhysReg(0), s1);
464       Operand op_s4(PhysReg(4), s1);
465       Operand op_s5(PhysReg(5), s1);
466       Operand op_s6(PhysReg(6), s1);
467       Operand op_s7(PhysReg(7), s1);
468       Operand desc_s4(PhysReg(0), s4);
469 
470       //>> s1: %0:s[4] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime)
471       //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0]
472       //! s1: %0:s[6] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime)
473       //! s1: %0:s[7] = s_buffer_load_dword %0:s[0-3], %0:s[0]
474       bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s4, Operand::c32(sendmsg_rtn_get_realtime));
475       bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0);
476       bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s6, Operand::c32(sendmsg_rtn_get_realtime));
477       bld.smem(aco_opcode::s_buffer_load_dword, def_s7, desc_s4, op_s0);
478 
479       //~gfx12! s_wait_kmcnt imm:1
480       //~gfx11! s_waitcnt lgkmcnt(1)
481       //! p_unit_test 0, %0:s[4]
482       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_s4);
483       //~gfx12! s_wait_kmcnt imm:0
484       //~gfx11! s_waitcnt lgkmcnt(0)
485       //! p_unit_test 1, %0:s[5]
486       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s5);
487       //! p_unit_test 2, %0:s[6]
488       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_s6);
489       //! p_unit_test 3, %0:s[7]
490       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s7);
491 
492       finish_waitcnt_test();
493    }
494 END_TEST
495 
496 BEGIN_TEST(insert_waitcnt.vmem_ds)
497    if (!setup_cs(NULL, GFX12))
498       return;
499 
500    Definition def_v4(PhysReg(260), v1);
501    Definition def_v5(PhysReg(261), v1);
502    Operand op_v0(PhysReg(256), v1);
503    Operand op_v1(PhysReg(257), v1);
504    Operand op_v4(PhysReg(260), v1);
505    Operand op_v5(PhysReg(261), v1);
506    Operand desc_s4(PhysReg(0), s4);
507 
508    program->workgroup_size = 128;
509    program->wgp_mode = true;
510 
511    //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
512    //! v1: %0:v[5] = ds_read_b32 %0:v[0]
513    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
514    bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0);
515 
516    //! s_wait_loadcnt_dscnt dscnt(0) loadcnt(0)
517    //! p_unit_test 0, %0:v[4], %0:v[5]
518    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4, op_v5);
519 
520    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[1] storage:buffer
521    //! v1: %0:v[5] = ds_write_b32 %0:v[0], %0:v[1] storage:shared
522    Instruction* instr =
523       bld.mubuf(aco_opcode::buffer_store_dword, desc_s4, op_v0, Operand::zero(), op_v1, 0, false)
524          .instr;
525    instr->mubuf().sync = memory_sync_info(storage_buffer);
526    instr = bld.ds(aco_opcode::ds_write_b32, def_v5, op_v0, op_v1).instr;
527    instr->ds().sync = memory_sync_info(storage_shared);
528 
529    //! s_wait_storecnt_dscnt dscnt(0) storecnt(0)
530    bld.barrier(aco_opcode::p_barrier,
531                memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup));
532 
533    finish_waitcnt_test();
534 END_TEST
535