xref: /aosp_15_r20/external/mesa3d/src/amd/common/ac_sqtt.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * Copyright 2020 Valve Corporation
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "ac_pm4.h"
9 #include "ac_sqtt.h"
10 
11 #include "sid.h"
12 #include "ac_gpu_info.h"
13 #include "util/u_math.h"
14 #include "util/os_time.h"
15 
16 #include "sid.h"
17 
18 uint64_t
ac_sqtt_get_info_offset(unsigned se)19 ac_sqtt_get_info_offset(unsigned se)
20 {
21    return sizeof(struct ac_sqtt_data_info) * se;
22 }
23 
24 uint64_t
ac_sqtt_get_data_offset(const struct radeon_info * rad_info,const struct ac_sqtt * data,unsigned se)25 ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
26 {
27    unsigned max_se = rad_info->max_se;
28    uint64_t data_offset;
29 
30    data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
31    data_offset += data->buffer_size * se;
32 
33    return data_offset;
34 }
35 
36 static uint64_t
ac_sqtt_get_info_va(uint64_t va,unsigned se)37 ac_sqtt_get_info_va(uint64_t va, unsigned se)
38 {
39    return va + ac_sqtt_get_info_offset(se);
40 }
41 
42 static uint64_t
ac_sqtt_get_data_va(const struct radeon_info * rad_info,const struct ac_sqtt * data,unsigned se)43 ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data,
44                     unsigned se)
45 {
46    return data->buffer_va + ac_sqtt_get_data_offset(rad_info, data, se);
47 }
48 
49 void
ac_sqtt_init(struct ac_sqtt * data)50 ac_sqtt_init(struct ac_sqtt *data)
51 {
52    list_inithead(&data->rgp_pso_correlation.record);
53    simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);
54 
55    list_inithead(&data->rgp_loader_events.record);
56    simple_mtx_init(&data->rgp_loader_events.lock, mtx_plain);
57 
58    list_inithead(&data->rgp_code_object.record);
59    simple_mtx_init(&data->rgp_code_object.lock, mtx_plain);
60 
61    list_inithead(&data->rgp_clock_calibration.record);
62    simple_mtx_init(&data->rgp_clock_calibration.lock, mtx_plain);
63 
64    list_inithead(&data->rgp_queue_info.record);
65    simple_mtx_init(&data->rgp_queue_info.lock, mtx_plain);
66 
67    list_inithead(&data->rgp_queue_event.record);
68    simple_mtx_init(&data->rgp_queue_event.lock, mtx_plain);
69 }
70 
71 void
ac_sqtt_finish(struct ac_sqtt * data)72 ac_sqtt_finish(struct ac_sqtt *data)
73 {
74    assert(data->rgp_pso_correlation.record_count == 0);
75    simple_mtx_destroy(&data->rgp_pso_correlation.lock);
76 
77    assert(data->rgp_loader_events.record_count == 0);
78    simple_mtx_destroy(&data->rgp_loader_events.lock);
79 
80    assert(data->rgp_code_object.record_count == 0);
81    simple_mtx_destroy(&data->rgp_code_object.lock);
82 
83    assert(data->rgp_clock_calibration.record_count == 0);
84    simple_mtx_destroy(&data->rgp_clock_calibration.lock);
85 
86    assert(data->rgp_queue_info.record_count == 0);
87    simple_mtx_destroy(&data->rgp_queue_info.lock);
88 
89    assert(data->rgp_queue_event.record_count == 0);
90    simple_mtx_destroy(&data->rgp_queue_event.lock);
91 }
92 
93 bool
ac_is_sqtt_complete(const struct radeon_info * rad_info,const struct ac_sqtt * data,const struct ac_sqtt_data_info * info)94 ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
95                     const struct ac_sqtt_data_info *info)
96 {
97    if (rad_info->gfx_level >= GFX10) {
98       /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
99        * dropped bytes per SE via THREAD_TRACE_DROPPED_CNTR. Though, this
100        * doesn't seem reliable because it might still report non-zero even if
101        * the SQTT buffer isn't full.
102        *
103        * The solution here is to compare the number of bytes written by the hw
104        * (in units of 32 bytes) to the SQTT buffer size. If it's equal, that
105        * means that the buffer is full and should be resized.
106        */
107       return !(info->cur_offset * 32 == data->buffer_size - 32);
108    }
109 
110    /* Otherwise, compare the current thread trace offset with the number
111     * of written bytes.
112     */
113    return info->cur_offset == info->gfx9_write_counter;
114 }
115 
116 uint32_t
ac_get_expected_buffer_size(struct radeon_info * rad_info,const struct ac_sqtt_data_info * info)117 ac_get_expected_buffer_size(struct radeon_info *rad_info, const struct ac_sqtt_data_info *info)
118 {
119    if (rad_info->gfx_level >= GFX10) {
120       uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / rad_info->max_se;
121       return ((info->cur_offset * 32) + dropped_cntr_per_se) / 1024;
122    }
123 
124    return (info->gfx9_write_counter * 32) / 1024;
125 }
126 
127 bool
ac_sqtt_add_pso_correlation(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t api_hash)128 ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t api_hash)
129 {
130    struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
131    struct rgp_pso_correlation_record *record;
132 
133    record = malloc(sizeof(struct rgp_pso_correlation_record));
134    if (!record)
135       return false;
136 
137    record->api_pso_hash = api_hash;
138    record->pipeline_hash[0] = pipeline_hash;
139    record->pipeline_hash[1] = pipeline_hash;
140    memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));
141 
142    simple_mtx_lock(&pso_correlation->lock);
143    list_addtail(&record->list, &pso_correlation->record);
144    pso_correlation->record_count++;
145    simple_mtx_unlock(&pso_correlation->lock);
146 
147    return true;
148 }
149 
150 bool
ac_sqtt_add_code_object_loader_event(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t base_address)151 ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
152                                      uint64_t base_address)
153 {
154    struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
155    struct rgp_loader_events_record *record;
156 
157    record = malloc(sizeof(struct rgp_loader_events_record));
158    if (!record)
159       return false;
160 
161    record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
162    record->reserved = 0;
163    record->base_address = base_address & 0xffffffffffff;
164    record->code_object_hash[0] = pipeline_hash;
165    record->code_object_hash[1] = pipeline_hash;
166    record->time_stamp = os_time_get_nano();
167 
168    simple_mtx_lock(&loader_events->lock);
169    list_addtail(&record->list, &loader_events->record);
170    loader_events->record_count++;
171    simple_mtx_unlock(&loader_events->lock);
172 
173    return true;
174 }
175 
176 bool
ac_sqtt_add_clock_calibration(struct ac_sqtt * sqtt,uint64_t cpu_timestamp,uint64_t gpu_timestamp)177 ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
178 {
179    struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
180    struct rgp_clock_calibration_record *record;
181 
182    record = malloc(sizeof(struct rgp_clock_calibration_record));
183    if (!record)
184       return false;
185 
186    record->cpu_timestamp = cpu_timestamp;
187    record->gpu_timestamp = gpu_timestamp;
188 
189    simple_mtx_lock(&clock_calibration->lock);
190    list_addtail(&record->list, &clock_calibration->record);
191    clock_calibration->record_count++;
192    simple_mtx_unlock(&clock_calibration->lock);
193 
194    return true;
195 }
196 
197 /* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260
198  * On some HW SQTT can hang if we're not in one of the profiling pstates. */
199 bool
ac_check_profile_state(const struct radeon_info * info)200 ac_check_profile_state(const struct radeon_info *info)
201 {
202    char path[128];
203    char data[128];
204    int n;
205 
206    if (!info->pci.valid)
207       return false; /* Unknown but optimistic. */
208 
209    snprintf(path, sizeof(path),
210             "/sys/bus/pci/devices/%04x:%02x:%02x.%x/power_dpm_force_performance_level",
211             info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func);
212 
213    FILE *f = fopen(path, "r");
214    if (!f)
215       return false; /* Unknown but optimistic. */
216    n = fread(data, 1, sizeof(data) - 1, f);
217    fclose(f);
218    data[n] = 0;
219    return strstr(data, "profile") == NULL;
220 }
221 
222 union rgp_sqtt_marker_cb_id
ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt * data,enum amd_ip_type ip_type)223 ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
224 {
225    union rgp_sqtt_marker_cb_id cb_id = {0};
226 
227    cb_id.global_cb_id.cb_index =
228       p_atomic_inc_return(&data->cmdbuf_ids_per_queue[ip_type]);
229 
230    return cb_id;
231 }
232 
233 static bool
ac_sqtt_se_is_disabled(const struct radeon_info * info,unsigned se)234 ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
235 {
236    /* No active CU on the SE means it is disabled. */
237    return info->cu_mask[se][0] == 0;
238 }
239 
/* Selects the CU index that SQTT targets on the given SE. */
static uint32_t
ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
{
   uint32_t cu_index;

   if (info->gfx_level >= GFX11) {
      /* GFX11 seems to operate on the last active CU (0-based index). */
      cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
   } else {
      /* Default to the first active CU.
       *
       * NOTE(review): ffs() returns a 1-based bit position while the GFX11
       * path above is 0-based; when only CU0 is active this yields 1, not 0.
       * Confirm whether this asymmetry is intentional for pre-GFX11 hw.
       */
      cu_index = ffs(info->cu_mask[se][0]);
   }

   return cu_index;
}
255 
256 bool
ac_sqtt_get_trace(struct ac_sqtt * data,const struct radeon_info * info,struct ac_sqtt_trace * sqtt_trace)257 ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
258                   struct ac_sqtt_trace *sqtt_trace)
259 {
260    unsigned max_se = info->max_se;
261    void *ptr = data->ptr;
262 
263    memset(sqtt_trace, 0, sizeof(*sqtt_trace));
264 
265    for (unsigned se = 0; se < max_se; se++) {
266       uint64_t info_offset = ac_sqtt_get_info_offset(se);
267       uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
268       void *info_ptr = (uint8_t *)ptr + info_offset;
269       void *data_ptr = (uint8_t *)ptr + data_offset;
270       struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
271       struct ac_sqtt_data_se data_se = {0};
272       int active_cu = ac_sqtt_get_active_cu(info, se);
273 
274       if (ac_sqtt_se_is_disabled(info, se))
275          continue;
276 
277       if (!ac_is_sqtt_complete(info, data, trace_info))
278          return false;
279 
280       data_se.data_ptr = data_ptr;
281       data_se.info = *trace_info;
282       data_se.shader_engine = se;
283 
284       /* RGP seems to expect units of WGP on GFX10+. */
285       data_se.compute_unit = info->gfx_level >= GFX10 ? (active_cu / 2) : active_cu;
286 
287       sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
288       sqtt_trace->num_traces++;
289    }
290 
291    sqtt_trace->rgp_code_object = &data->rgp_code_object;
292    sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
293    sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
294    sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
295    sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
296    sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;
297 
298    return true;
299 }
300 
301 uint32_t
ac_sqtt_get_ctrl(const struct radeon_info * info,bool enable)302 ac_sqtt_get_ctrl(const struct radeon_info *info, bool enable)
303 {
304 
305    uint32_t ctrl;
306 
307    if (info->gfx_level >= GFX11) {
308       ctrl = S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) |
309              S_0367B0_UTIL_TIMER_GFX11(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */
310              S_0367B0_DRAW_EVENT_EN(1) | S_0367B0_SPI_STALL_EN(1) |
311              S_0367B0_SQ_STALL_EN(1) | S_0367B0_REG_AT_HWM(2);
312    } else {
313       assert(info->gfx_level >= GFX10);
314 
315       ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
316              S_008D1C_RT_FREQ(2) | /* 4096 clk */ S_008D1C_DRAW_EVENT_EN(1) |
317              S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
318              S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);
319 
320       if (info->gfx_level == GFX10_3)
321          ctrl |= S_008D1C_LOWATER_OFFSET(4);
322 
323       if (info->has_sqtt_auto_flush_mode_bug)
324          ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);
325    }
326 
327    return ctrl;
328 }
329 
330 uint32_t
ac_sqtt_get_shader_mask(const struct radeon_info * info)331 ac_sqtt_get_shader_mask(const struct radeon_info *info)
332 {
333    unsigned shader_mask = 0x7f; /* all shader stages */
334 
335    if (info->gfx_level >= GFX11) {
336       /* Disable unsupported hw shader stages */
337       shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
338    }
339 
340    return shader_mask;
341 }
342 
/* Emits the PM4 packets that configure the SQTT buffers/filters for every
 * enabled SE and start the thread trace. The per-SE buffer layout must match
 * ac_sqtt_get_data_va(). Compute queues start the trace via a register write,
 * graphics queues via a THREAD_TRACE_START event.
 */
void
ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
                   const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   /* The hw registers take the size/VA shifted by the SQTT buffer alignment. */
   const uint32_t shifted_size = sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   const unsigned shader_mask = ac_sqtt_get_shader_mask(info);
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int active_cu = ac_sqtt_get_active_cu(info, se);

      /* Skip harvested SEs. */
      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEx and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         /* Trace the selected WGP only (first SA, first SIMD). */
         ac_pm4_set_reg(pm4, R_0367B4_SQ_THREAD_TRACE_MASK,
                        S_0367B4_WTYPE_INCLUDE(shader_mask) | S_0367B4_SA_SEL(0) |
                        S_0367B4_WGP_SEL(active_cu / 2) | S_0367B4_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC |
                                                         V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP |
                                                         V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
                             V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                             V_0367B8_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX11(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX11(1);

         ac_pm4_set_reg(pm4, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else if (info->gfx_level >= GFX10) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         /* Trace the selected WGP only (first SA, first SIMD). */
         ac_pm4_set_reg(pm4, R_008D14_SQ_THREAD_TRACE_MASK,
                        S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
                        S_008D14_WGP_SEL(active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
                                                         V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP |
                                                         V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |=
            S_008D18_TOKEN_EXCLUDE(token_exclude) | S_008D18_BOP_EVENTS_TOKEN_INCLUDE(info->gfx_level == GFX10_3);

         ac_pm4_set_reg(pm4, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else {
         /* GFX8-GFX9 path. Order seems important for the following 4 registers. */
         ac_pm4_set_reg(pm4, R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         ac_pm4_set_reg(pm4, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(active_cu) | S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                              S_030CC8_SQ_STALL_EN(1);

         if (info->gfx_level < GFX9) {
            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         ac_pm4_set_reg(pm4, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         ac_pm4_set_reg(pm4, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                        S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         ac_pm4_set_reg(pm4, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                        S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         ac_pm4_set_reg(pm4, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         ac_pm4_set_reg(pm4, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (info->gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            ac_pm4_set_reg(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (info->gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX,  S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (is_compute_queue) {
      ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
      ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }

}
489 
/* Per-gfx-level lists of the 3 SQTT status registers read back by
 * ac_sqtt_copy_info_regs(): write pointer, status, and a counter
 * (CNTR on GFX8/9, DROPPED_CNTR on GFX10/11). The order determines the
 * DWORD layout written at the per-SE info VA.
 */
static const uint32_t gfx8_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};
513 
/* Emits PM4 packets that copy the 3 SQTT info registers (WPTR, STATUS,
 * CNTR/DROPPED_CNTR) of the given SE into the ac_sqtt_data_info struct
 * stored at the start of the SQTT buffer, one DWORD each.
 */
static void
ac_sqtt_copy_info_regs(const struct radeon_info *info, struct ac_pm4_state *pm4,
                       const struct ac_sqtt *sqtt, uint32_t se_index)
{
   const uint32_t *sqtt_info_regs = NULL;

   /* Pick the register list matching the gfx level (each has 3 entries). */
   if (info->gfx_level >= GFX11) {
      sqtt_info_regs = gfx11_sqtt_info_regs;
   } else if (info->gfx_level >= GFX10) {
      sqtt_info_regs = gfx10_sqtt_info_regs;
   } else if (info->gfx_level == GFX9) {
      sqtt_info_regs = gfx9_sqtt_info_regs;
   } else {
      assert(info->gfx_level == GFX8);
      sqtt_info_regs = gfx8_sqtt_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t info_va = ac_sqtt_get_info_va(sqtt->buffer_va, se_index);

   /* Copy back the info struct one DWORD at a time (3 == array length above). */
   for (unsigned i = 0; i < 3; i++) {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_COPY_DATA, 4, 0));
      ac_pm4_cmd_add(pm4, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      ac_pm4_cmd_add(pm4, sqtt_info_regs[i] >> 2);
      ac_pm4_cmd_add(pm4, 0); /* unused */
      ac_pm4_cmd_add(pm4, (info_va + i * 4));
      ac_pm4_cmd_add(pm4, (info_va + i * 4) >> 32);
   }

   if (info->gfx_level == GFX11) {
      /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
       * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
       * subtract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;

      /* GPU-side atomic subtract applied to the WPTR DWORD copied above. */
      ac_pm4_cmd_add(pm4, PKT3(PKT3_ATOMIC_MEM, 7, 0));
      ac_pm4_cmd_add(pm4, ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
      ac_pm4_cmd_add(pm4, info_va);         /* addr lo */
      ac_pm4_cmd_add(pm4, info_va >> 32);   /* addr hi */
      ac_pm4_cmd_add(pm4, init_wptr_value); /* data lo */
      ac_pm4_cmd_add(pm4, 0);               /* data hi */
      ac_pm4_cmd_add(pm4, 0);               /* compare data lo */
      ac_pm4_cmd_add(pm4, 0);               /* compare data hi */
      ac_pm4_cmd_add(pm4, 0);               /* loop interval */
   }
}
568 
569 void
ac_sqtt_emit_stop(const struct radeon_info * info,struct ac_pm4_state * pm4,bool is_compute_queue)570 ac_sqtt_emit_stop(const struct radeon_info *info, struct ac_pm4_state *pm4,
571                   bool is_compute_queue)
572 {
573    /* Stop the thread trace with a different event based on the queue. */
574    if (is_compute_queue) {
575       ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
576    } else {
577       ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
578       ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
579    }
580 
581    ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
582    ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
583 }
584 
/* Emits the PM4 packets that, per enabled SE: wait for the trace to finish,
 * disable the thread trace mode, wait for the hw to go idle, and copy the
 * SQTT info registers back to memory. Must be emitted after
 * ac_sqtt_emit_stop().
 */
void
ac_sqtt_emit_wait(const struct radeon_info *info, struct ac_pm4_state *pm4,
                  const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      /* Skip harvested SEs. */
      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEi and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Make sure to wait for the trace buffer. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_FINISH_DONE); /* mask: only the FINISH_DONE bit */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0);              /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4);              /* poll interval */
      } else if (info->gfx_level >= GFX10) {
         if (!info->has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */
            ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
            ac_pm4_cmd_add(pm4, 0);
            ac_pm4_cmd_add(pm4, 0); /* reference value */
            ac_pm4_cmd_add(pm4, ~C_008D20_FINISH_DONE); /* mask: only the FINISH_DONE bit */
            ac_pm4_cmd_add(pm4, 4); /* poll interval */
         }

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0);              /* reference value */
         ac_pm4_cmd_add(pm4, ~C_008D20_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4);              /* poll interval */
      } else {
         /* GFX8-GFX9: disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0);              /* reference value */
         ac_pm4_cmd_add(pm4, ~C_030CE8_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4);              /* poll interval */
      }

      /* Copy WPTR/STATUS/CNTR back to the info struct for this SE. */
      ac_sqtt_copy_info_regs(info, pm4, sqtt, se);
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}
664