/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * Copyright 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_pm4.h"
#include "ac_sqtt.h"

#include "sid.h"
#include "ac_gpu_info.h"
#include "util/u_math.h"
#include "util/os_time.h"

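/* The SQTT buffer is laid out as one struct ac_sqtt_data_info per SE,
 * followed (after aligning up to 1 << SQTT_BUFFER_ALIGN_SHIFT bytes) by one
 * trace data buffer of data->buffer_size bytes per SE. Roughly:
 *
 *   +---------------------+-----------+----------+-...-+---------------+
 *   | info[0..max_se - 1] | (padding) | data SE0 | ... | data SE(n-1)  |
 *   +---------------------+-----------+----------+-...-+---------------+
 */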
uint64_t
ac_sqtt_get_info_offset(unsigned se)
{
   return sizeof(struct ac_sqtt_data_info) * se;
}

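/* Return the offset of the trace data for the given SE, relative to the
 * start of the SQTT buffer (see the layout above).
 */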
uint64_t
ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
{
   unsigned max_se = rad_info->max_se;
   uint64_t data_offset;

   data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
   data_offset += data->buffer_size * se;

   return data_offset;
}

static uint64_t
ac_sqtt_get_info_va(uint64_t va, unsigned se)
{
   return va + ac_sqtt_get_info_offset(se);
}

static uint64_t
ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data,
                    unsigned se)
{
   return data->buffer_va + ac_sqtt_get_data_offset(rad_info, data, se);
}

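/* Initialize the lists and locks backing the RGP metadata chunks (PSO
 * correlations, loader events, code objects, clock calibration and queue
 * info/events).
 */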
void
ac_sqtt_init(struct ac_sqtt *data)
{
   list_inithead(&data->rgp_pso_correlation.record);
   simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&data->rgp_loader_events.record);
   simple_mtx_init(&data->rgp_loader_events.lock, mtx_plain);

   list_inithead(&data->rgp_code_object.record);
   simple_mtx_init(&data->rgp_code_object.lock, mtx_plain);

   list_inithead(&data->rgp_clock_calibration.record);
   simple_mtx_init(&data->rgp_clock_calibration.lock, mtx_plain);

   list_inithead(&data->rgp_queue_info.record);
   simple_mtx_init(&data->rgp_queue_info.lock, mtx_plain);

   list_inithead(&data->rgp_queue_event.record);
   simple_mtx_init(&data->rgp_queue_event.lock, mtx_plain);
}

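/* Destroy the metadata locks. All records are expected to have been consumed
 * and freed by now, as the asserts below check.
 */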
void
ac_sqtt_finish(struct ac_sqtt *data)
{
   assert(data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&data->rgp_pso_correlation.lock);

   assert(data->rgp_loader_events.record_count == 0);
   simple_mtx_destroy(&data->rgp_loader_events.lock);

   assert(data->rgp_code_object.record_count == 0);
   simple_mtx_destroy(&data->rgp_code_object.lock);

   assert(data->rgp_clock_calibration.record_count == 0);
   simple_mtx_destroy(&data->rgp_clock_calibration.lock);

   assert(data->rgp_queue_info.record_count == 0);
   simple_mtx_destroy(&data->rgp_queue_info.lock);

   assert(data->rgp_queue_event.record_count == 0);
   simple_mtx_destroy(&data->rgp_queue_event.lock);
}

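/* Return true if the captured trace is complete, ie. the hw didn't run out
 * of buffer space. On failure, the capture should be retried with a larger
 * buffer (see ac_get_expected_buffer_size()).
 */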
bool
ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
                    const struct ac_sqtt_data_info *info)
{
   if (rad_info->gfx_level >= GFX10) {
      /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
       * dropped bytes per SE via THREAD_TRACE_DROPPED_CNTR. However, this
       * doesn't seem reliable because it might still report non-zero even if
       * the SQTT buffer isn't full.
       *
       * The solution here is to compare the number of bytes written by the hw
       * (in units of 32 bytes) to the SQTT buffer size. If they are equal,
       * the buffer is full and should be resized.
       */
      return !(info->cur_offset * 32 == data->buffer_size - 32);
   }

   /* Otherwise, compare the current thread trace offset with the number
    * of written bytes.
    */
   return info->cur_offset == info->gfx9_write_counter;
}

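/* Return a suggested SQTT buffer size (in KB), based on the number of bytes
 * actually written (and, on GFX10+, dropped) by the hw.
 */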
uint32_t
ac_get_expected_buffer_size(struct radeon_info *rad_info, const struct ac_sqtt_data_info *info)
{
   if (rad_info->gfx_level >= GFX10) {
      uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / rad_info->max_se;
      return ((info->cur_offset * 32) + dropped_cntr_per_se) / 1024;
   }

   return (info->gfx9_write_counter * 32) / 1024;
}

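/* Record the correlation between an API-level pipeline hash and the internal
 * pipeline hash for the RGP PSO correlation chunk. Returns false on
 * allocation failure.
 */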
bool
ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t api_hash)
{
   struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
   struct rgp_pso_correlation_record *record;

   record = malloc(sizeof(struct rgp_pso_correlation_record));
   if (!record)
      return false;

   record->api_pso_hash = api_hash;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;
   memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));

   simple_mtx_lock(&pso_correlation->lock);
   list_addtail(&record->list, &pso_correlation->record);
   pso_correlation->record_count++;
   simple_mtx_unlock(&pso_correlation->lock);

   return true;
}

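/* Record a "code object loaded to GPU memory" event, timestamped now, for
 * the RGP loader events chunk. Returns false on allocation failure.
 */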
bool
ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
                                     uint64_t base_address)
{
   struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
   struct rgp_loader_events_record *record;

   record = malloc(sizeof(struct rgp_loader_events_record));
   if (!record)
      return false;

   record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
   record->reserved = 0;
   record->base_address = base_address & 0xffffffffffff;
   record->code_object_hash[0] = pipeline_hash;
   record->code_object_hash[1] = pipeline_hash;
   record->time_stamp = os_time_get_nano();

   simple_mtx_lock(&loader_events->lock);
   list_addtail(&record->list, &loader_events->record);
   loader_events->record_count++;
   simple_mtx_unlock(&loader_events->lock);

   return true;
}

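/* Record a pair of matching CPU/GPU timestamps, which RGP uses to correlate
 * the two clock domains. Returns false on allocation failure.
 */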
bool
ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
{
   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
   struct rgp_clock_calibration_record *record;

   record = malloc(sizeof(struct rgp_clock_calibration_record));
   if (!record)
      return false;

   record->cpu_timestamp = cpu_timestamp;
   record->gpu_timestamp = gpu_timestamp;

   simple_mtx_lock(&clock_calibration->lock);
   list_addtail(&record->list, &clock_calibration->record);
   clock_calibration->record_count++;
   simple_mtx_unlock(&clock_calibration->lock);

   return true;
}

/* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260
 * On some HW SQTT can hang if we're not in one of the profiling pstates. */
bool
ac_check_profile_state(const struct radeon_info *info)
{
   char path[128];
   char data[128];
   int n;

   if (!info->pci.valid)
      return false; /* Unknown but optimistic. */

   snprintf(path, sizeof(path),
            "/sys/bus/pci/devices/%04x:%02x:%02x.%x/power_dpm_force_performance_level",
            info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func);

   FILE *f = fopen(path, "r");
   if (!f)
      return false; /* Unknown but optimistic. */
   n = fread(data, 1, sizeof(data) - 1, f);
   fclose(f);
   data[n] = 0;
   return strstr(data, "profile") == NULL;
}

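/* Return a unique, monotonically increasing command buffer ID for the given
 * IP type, used to identify command buffers in SQTT markers.
 */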
union rgp_sqtt_marker_cb_id
ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
{
   union rgp_sqtt_marker_cb_id cb_id = {0};

   cb_id.global_cb_id.cb_index =
      p_atomic_inc_return(&data->cmdbuf_ids_per_queue[ip_type]);

   return cb_id;
}

static bool
ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return info->cu_mask[se][0] == 0;
}

static uint32_t
ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
{
   uint32_t cu_index;

   if (info->gfx_level >= GFX11) {
      /* GFX11 seems to operate on the last active CU. */
      cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
   } else {
      /* Default to the first active CU. */
      cu_index = ffs(info->cu_mask[se][0]);
   }

   return cu_index;
}

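/* Gather the per-SE trace data and the RGP metadata into sqtt_trace. Returns
 * false if any SE trace is incomplete, in which case the capture should be
 * retried with a larger buffer.
 */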
bool
ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
                  struct ac_sqtt_trace *sqtt_trace)
{
   unsigned max_se = info->max_se;
   void *ptr = data->ptr;

   memset(sqtt_trace, 0, sizeof(*sqtt_trace));

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_sqtt_get_info_offset(se);
      uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
      void *info_ptr = (uint8_t *)ptr + info_offset;
      void *data_ptr = (uint8_t *)ptr + data_offset;
      struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
      struct ac_sqtt_data_se data_se = {0};
      int active_cu = ac_sqtt_get_active_cu(info, se);

      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      if (!ac_is_sqtt_complete(info, data, trace_info))
         return false;

      data_se.data_ptr = data_ptr;
      data_se.info = *trace_info;
      data_se.shader_engine = se;

      /* RGP seems to expect units of WGP on GFX10+. */
      data_se.compute_unit = info->gfx_level >= GFX10 ? (active_cu / 2) : active_cu;

      sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
      sqtt_trace->num_traces++;
   }

   sqtt_trace->rgp_code_object = &data->rgp_code_object;
   sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
   sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
   sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
   sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
   sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;

   return true;
}

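/* Build the SQ_THREAD_TRACE_CTRL value that enables/disables SQTT on GFX10+
 * (older generations use SQ_THREAD_TRACE_MODE instead).
 */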
uint32_t
ac_sqtt_get_ctrl(const struct radeon_info *info, bool enable)
{
   uint32_t ctrl;

   if (info->gfx_level >= GFX11) {
      ctrl = S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) |
             S_0367B0_UTIL_TIMER_GFX11(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */
             S_0367B0_DRAW_EVENT_EN(1) | S_0367B0_SPI_STALL_EN(1) |
             S_0367B0_SQ_STALL_EN(1) | S_0367B0_REG_AT_HWM(2);
   } else {
      assert(info->gfx_level >= GFX10);

      ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
             S_008D1C_RT_FREQ(2) | /* 4096 clk */ S_008D1C_DRAW_EVENT_EN(1) |
             S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
             S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);

      if (info->gfx_level == GFX10_3)
         ctrl |= S_008D1C_LOWATER_OFFSET(4);

      if (info->has_sqtt_auto_flush_mode_bug)
         ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);
   }

   return ctrl;
}

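/* Return the mask of shader stages to trace. On GFX11, the hw stages that no
 * longer exist (VS, ES, LS) are excluded.
 */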
uint32_t
ac_sqtt_get_shader_mask(const struct radeon_info *info)
{
   unsigned shader_mask = 0x7f; /* all shader stages */

   if (info->gfx_level >= GFX11) {
      /* Disable unsupported hw shader stages */
      shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
   }

   return shader_mask;
}

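/* Emit the PM4 packets that program the SQTT buffer, mask and token
 * registers for every active SE, then start the trace. Pair with
 * ac_sqtt_emit_stop()/ac_sqtt_emit_wait() to stop the trace and read back
 * the results.
 */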
void
ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
                   const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   const uint32_t shifted_size = sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   const unsigned shader_mask = ac_sqtt_get_shader_mask(info);
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int active_cu = ac_sqtt_get_active_cu(info, se);

      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEx and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_0367B4_SQ_THREAD_TRACE_MASK,
                        S_0367B4_WTYPE_INCLUDE(shader_mask) | S_0367B4_SA_SEL(0) |
                        S_0367B4_WGP_SEL(active_cu / 2) | S_0367B4_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC |
                                                         V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP |
                                                         V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
                             V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                             V_0367B8_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX11(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX11(1);

         ac_pm4_set_reg(pm4, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else if (info->gfx_level >= GFX10) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_008D14_SQ_THREAD_TRACE_MASK,
                        S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
                        S_008D14_WGP_SEL(active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
                                                         V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP |
                                                         V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |=
            S_008D18_TOKEN_EXCLUDE(token_exclude) | S_008D18_BOP_EVENTS_TOKEN_INCLUDE(info->gfx_level == GFX10_3);

         ac_pm4_set_reg(pm4, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else {
         /* Order seems important for the following 4 registers. */
         ac_pm4_set_reg(pm4, R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         ac_pm4_set_reg(pm4, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(active_cu) | S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                              S_030CC8_SQ_STALL_EN(1);

         if (info->gfx_level < GFX9) {
            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         ac_pm4_set_reg(pm4, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         ac_pm4_set_reg(pm4, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                        S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         ac_pm4_set_reg(pm4, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                        S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         ac_pm4_set_reg(pm4, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         ac_pm4_set_reg(pm4, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (info->gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            ac_pm4_set_reg(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (info->gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (is_compute_queue) {
      ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
      ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}

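/* Per-generation registers copied back into struct ac_sqtt_data_info: the
 * write pointer, the status, and the (dropped) counter, in that order.
 */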
static const uint32_t gfx8_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};

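/* Emit the PM4 packets that copy the three SQTT info registers of the given
 * SE into its ac_sqtt_data_info slot in the buffer.
 */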
static void
ac_sqtt_copy_info_regs(const struct radeon_info *info, struct ac_pm4_state *pm4,
                       const struct ac_sqtt *sqtt, uint32_t se_index)
{
   const uint32_t *sqtt_info_regs = NULL;

   if (info->gfx_level >= GFX11) {
      sqtt_info_regs = gfx11_sqtt_info_regs;
   } else if (info->gfx_level >= GFX10) {
      sqtt_info_regs = gfx10_sqtt_info_regs;
   } else if (info->gfx_level == GFX9) {
      sqtt_info_regs = gfx9_sqtt_info_regs;
   } else {
      assert(info->gfx_level == GFX8);
      sqtt_info_regs = gfx8_sqtt_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t info_va = ac_sqtt_get_info_va(sqtt->buffer_va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_COPY_DATA, 4, 0));
      ac_pm4_cmd_add(pm4, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      ac_pm4_cmd_add(pm4, sqtt_info_regs[i] >> 2);
      ac_pm4_cmd_add(pm4, 0); /* unused */
      ac_pm4_cmd_add(pm4, (info_va + i * 4));
      ac_pm4_cmd_add(pm4, (info_va + i * 4) >> 32);
   }

   if (info->gfx_level == GFX11) {
      /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
       * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
       * subtract the "initial WPTR address" from SQ_THREAD_TRACE_WPTR as follows:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;

      ac_pm4_cmd_add(pm4, PKT3(PKT3_ATOMIC_MEM, 7, 0));
      ac_pm4_cmd_add(pm4, ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
      ac_pm4_cmd_add(pm4, info_va);         /* addr lo */
      ac_pm4_cmd_add(pm4, info_va >> 32);   /* addr hi */
      ac_pm4_cmd_add(pm4, init_wptr_value); /* data lo */
      ac_pm4_cmd_add(pm4, 0);               /* data hi */
      ac_pm4_cmd_add(pm4, 0);               /* compare data lo */
      ac_pm4_cmd_add(pm4, 0);               /* compare data hi */
      ac_pm4_cmd_add(pm4, 0);               /* loop interval */
   }
}

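/* Emit the event (or register write, on compute queues) that stops the
 * thread trace. The trace data isn't ready until the packets emitted by
 * ac_sqtt_emit_wait() have completed.
 */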
void
ac_sqtt_emit_stop(const struct radeon_info *info, struct ac_pm4_state *pm4,
                  bool is_compute_queue)
{
   /* Stop the thread trace with a different event based on the queue. */
   if (is_compute_queue) {
      ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
      ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
   ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
}

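/* For each active SE: wait for the trace to finish, disable the thread trace
 * mode and copy back the info registers.
 */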
void
ac_sqtt_emit_wait(const struct radeon_info *info, struct ac_pm4_state *pm4,
                  const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEi and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Make sure to wait for the trace buffer. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_FINISH_DONE); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */
      } else if (info->gfx_level >= GFX10) {
         if (!info->has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
            ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
            ac_pm4_cmd_add(pm4, 0);
            ac_pm4_cmd_add(pm4, 0); /* reference value */
            ac_pm4_cmd_add(pm4, ~C_008D20_FINISH_DONE); /* mask */
            ac_pm4_cmd_add(pm4, 4); /* poll interval */
         }

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_008D20_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_030CE8_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */
      }

      ac_sqtt_copy_info_regs(info, pm4, sqtt, se);
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}