xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_perfetto.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 #include <stdio.h>
7 #include <stdarg.h>
8 #include <string.h>
9 
10 #include "util/hash_table.h"
11 #include "util/u_process.h"
12 #include "util/hash_table.h"
13 
14 #include "si_pipe.h"
15 #include "si_perfetto.h"
16 #include "si_tracepoints.h"
17 
18 #ifdef HAVE_PERFETTO
19 
20 #include "util/perf/u_perfetto.h"
21 #include "util/perf/u_perfetto_renderpass.h"
22 
23 #include "si_tracepoints_perfetto.h"
24 
/* Human-readable names for each queue stage row shown in the perfetto UI.
 * Indexed by enum si_ds_queue_stage, so entry order MUST match the enum.
 */
static const struct {
   const char *name;

   /* The perfetto UI requires that there is a parent-child relationship
    * within a row of elements. Which means that all children elements must
    * end within the lifespan of their parent.
    *
    * Some elements like stalls and command buffers follow that relationship,
    * but not all. This tells us in which UI row the elements should live.
    */
   enum si_ds_queue_stage draw_stage;
} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
   /* Order must match the enum! */
   {
      "queue",
      SI_DS_QUEUE_STAGE_QUEUE,
   },
   {
      "compute",
      SI_DS_QUEUE_STAGE_COMPUTE,
   },
   {
      "draw",
      SI_DS_QUEUE_STAGE_DRAW,
   }
};
52 
/* Per-sequence incremental state tracked by perfetto. `was_cleared` starts
 * true (and is reset to true by perfetto when the consumer drops interned
 * state), which tells us we must re-emit all descriptors before events. */
struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};
56 
/* Data-source traits wiring our incremental state type into perfetto. */
struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};
60 
/* The radeonsi render-stage data source; all shared behavior lives in the
 * MesaRenderpassDataSource base template. */
class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource,
                                                               SIRenderpassTraits> {
};

/* Declare + define the static members perfetto requires for a data source. */
PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
67 
68 using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
69 
sync_timestamp(SIRenderpassDataSource::TraceContext & ctx,struct si_ds_device * device)70 static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
71 {
72    uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
73    uint64_t gpu_ts;
74 
75    struct si_context *sctx = container_of(device, struct si_context, ds);
76    gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
77 
78 
79    cpu_ts = perfetto::base::GetBootTimeNs().count();
80 
81    if (cpu_ts < device->next_clock_sync_ns)
82       return;
83 
84    PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
85 
86    device->sync_gpu_ts = gpu_ts;
87    device->next_clock_sync_ns = cpu_ts + 1000000000ull;
88    MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
89       EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
90 }
91 
/* Emit the interned descriptors (graphics context + per-queue/per-stage GPU
 * specifications) that later GpuRenderStageEvents reference by iid. Called
 * whenever the incremental state was cleared, i.e. the consumer has dropped
 * all previously interned data.
 */
static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx,
                             struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   /* Restart event numbering and forget any recorded stage start times:
    * events referencing the old generation would be meaningless. */
   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      /* Mark this packet as the start of a fresh interning generation. */
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         /* Describe the API context (GL vs Vulkan) this device traces. */
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IID picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are order
                * by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(),
                        queue->name, s, si_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               /* Bare stage name, shared wording across queues. */
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   /* Force an immediate clock sync so the new generation starts with a
    * calibration point. */
   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
156 
/* Callback type used to serialize a tracepoint payload into the extra-data
 * fields of a GpuRenderStageEvent (see CREATE_DUAL_EVENT_CALLBACK below). */
typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *,
                                            const void*);
159 
begin_event(struct si_ds_queue * queue,uint64_t ts_ns,enum si_ds_queue_stage stage_id)160 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
161 {
162    PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
163    uint32_t level = queue->stages[stage_id].level;
164    /* If we haven't managed to calibrate the alignment between GPU and CPU
165     * timestamps yet, then skip this trace, otherwise perfetto won't know
166     * what to do with it.
167     */
168    if (!queue->device->sync_gpu_ts) {
169       queue->stages[stage_id].start_ns[level] = 0;
170       return;
171    }
172 
173    if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
174       return;
175 
176    queue->stages[stage_id].start_ns[level] = ts_ns;
177    queue->stages[stage_id].level++;
178 }
179 
/* Close the innermost open stage event and emit a GpuRenderStageEvent packet
 * covering [start_ns, ts_ns] on the device's GPU clock.
 *
 * app_event, when non-NULL, names an application debug marker and gets a
 * dynamically interned stage iid; otherwise the stage's fixed iid is used.
 * payload/payload_as_extra optionally serialize tracepoint arguments into
 * the event's extra-data fields.
 */
static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
                      trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   /* Unbalanced end without a matching begin (e.g. begin was dropped). */
   if (queue->stages[stage_id].level == 0)
      return;

   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   /* Skip events whose begin was dropped (start_ns == 0) or whose
    * timestamps are inverted. */
   if (!start_ns || start_ns > ts_ns)
      return;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-emit interned descriptors first if the consumer dropped them. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * have use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
                           tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
                           stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload);
      }
   });

   /* Mark the slot consumed so a stale value can't be reused. */
   stage->start_ns[level] = 0;
}
246 
247 #endif /* HAVE_PERFETTO */
248 
249 #ifdef __cplusplus
250 extern "C" {
251 #endif
252 
253 #ifdef HAVE_PERFETTO
254 
255 /*
256  * Trace callbacks, called from u_trace once the timestamps from GPU have been
257  * collected.
258  */
259 
/* Generate the begin/end u_trace callback pair for one tracepoint. The begin
 * callback records the GPU timestamp; the end callback emits the perfetto
 * event, forwarding the tracepoint payload through
 * trace_payload_as_extra_si_end_<event_name>. flush_data is the
 * si_ds_flush_data attached to the command buffer flush. */
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage)                                             \
void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,       \
                              const void *flush_data,                                             \
                              const struct trace_si_begin_##event_name *payload,                  \
                              const void *indirect_data)                                          \
{                                                                                                 \
   const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data;           \
   begin_event(flush->queue, ts_ns, stage);                                                       \
}                                                                                                 \
                                                                                                  \
void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,         \
                            const void *flush_data,                                               \
                            const struct trace_si_end_##event_name *payload,                      \
                            const void *indirect_data)                                            \
{                                                                                                 \
   const struct si_ds_flush_data *flush =  (const struct si_ds_flush_data *) flush_data;          \
   end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload,                     \
             (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name);           \
}                                                                                                 \

/* Instantiate callbacks for the draw and compute tracepoints. */
CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
282 
283 uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
284 {
285    return perfetto::base::GetBootTimeNs().count();
286 }
287 
/* Emit a vk_queue_submit perfetto event spanning from start_ts (returned by
 * si_ds_begin_submit) to now, and assign this submission its id. */
void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   /* When tracing is disabled, reset the calibration state so a later
    * enable starts from a fresh clock sync. */
   if (!u_trace_should_process(&queue->device->trace_context)) {
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-emit interned descriptors first if the consumer dropped them. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}
319 
320 #endif /* HAVE_PERFETTO */
321 
/* One-time process-wide setup: initialize perfetto and register our
 * render-stage data source under "gpu.renderstages.amd". */
static void si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}
331 
static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
/* Interned-ID counter; starts at 1 (presumably because 0 reads as "unset"
 * in perfetto interning — confirm). */
static uint64_t iid = 1;

/* Return the next unique interning ID.
 * NOTE(review): not atomic — assumes device/queue creation is serialized;
 * verify callers cannot race here. */
static uint64_t get_iid()
{
   return iid++;
}
339 
si_pps_clock_id(uint32_t gpu_id)340 static uint32_t si_pps_clock_id(uint32_t gpu_id)
341 {
342    char buf[40];
343    snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
344 
345    return _mesa_hash_string(buf) | 0x80000000;
346 }
347 
/* Driver-level init entry point; safe to call multiple times thanks to the
 * once flag. Also (re)reads the tracepoint config environment variable. */
void si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}
353 
si_ds_device_init(struct si_ds_device * device,const struct radeon_info * devinfo,uint32_t gpu_id,enum amd_ds_api api)354 void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
355                        uint32_t gpu_id, enum amd_ds_api api)
356 {
357    device->gpu_id = gpu_id;
358    device->gpu_clock_id = si_pps_clock_id(gpu_id);
359    device->info = devinfo;
360    device->iid = get_iid();
361    device->api = api;
362    list_inithead(&device->queues);
363 }
364 
/* Tear down the device's u_trace context; counterpart of si_ds_device_init
 * (the trace context itself is initialized elsewhere). */
void si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
369 
si_ds_device_init_queue(struct si_ds_device * device,struct si_ds_queue * queue,const char * fmt_name,...)370 struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device,
371                                              struct si_ds_queue *queue,
372                                              const char *fmt_name, ...)
373 {
374    va_list ap;
375    queue->device = device;
376 
377    va_start(ap, fmt_name);
378    vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
379    va_end(ap);
380 
381    for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
382       queue->stages[s].queue_iid = get_iid();
383       queue->stages[s].stage_iid = get_iid();
384    }
385 
386    list_add(&queue->link, &device->queues);
387 
388    return queue;
389 }
390 
si_ds_flush_data_init(struct si_ds_flush_data * data,struct si_ds_queue * queue,uint64_t submission_id)391 void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
392                            uint64_t submission_id)
393 {
394    memset(data, 0, sizeof(*data));
395 
396    data->queue = queue;
397    data->submission_id = submission_id;
398 
399    u_trace_init(&data->trace, &queue->device->trace_context);
400 }
401 
/* Release the u_trace recording started in si_ds_flush_data_init. */
void si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
406 
407 #ifdef __cplusplus
408 }
409 #endif
410