1 /*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 #include <stdio.h>
7 #include <stdarg.h>
8 #include <string.h>
9
10 #include "util/hash_table.h"
11 #include "util/u_process.h"
12 #include "util/hash_table.h"
13
14 #include "si_pipe.h"
15 #include "si_perfetto.h"
16 #include "si_tracepoints.h"
17
18 #ifdef HAVE_PERFETTO
19
20 #include "util/perf/u_perfetto.h"
21 #include "util/perf/u_perfetto_renderpass.h"
22
23 #include "si_tracepoints_perfetto.h"
24
25 /* Just naming stages */
26 static const struct {
27 const char *name;
28
29 /* The perfetto UI requires that there is a parent-child relationship
30 * within a row of elements. Which means that all children elements must
31 * end within the lifespan of their parent.
32 *
33 * Some elements like stalls and command buffers follow that relationship,
34 * but not all. This tells us in which UI row the elements should live.
35 */
36 enum si_ds_queue_stage draw_stage;
37 } si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
38 /* Order must match the enum! */
39 {
40 "queue",
41 SI_DS_QUEUE_STAGE_QUEUE,
42 },
43 {
44 "compute",
45 SI_DS_QUEUE_STAGE_COMPUTE,
46 },
47 {
48 "draw",
49 SI_DS_QUEUE_STAGE_DRAW,
50 }
51 };
52
/* Per-tracing-session incremental state.  Starts out "cleared" so the first
 * trace emission re-sends the interned descriptors (see send_descriptors).
 */
struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};

/* Hook our incremental state type into the perfetto data-source machinery. */
struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};

/* radeonsi render-pass data source, built on the shared Mesa helper base. */
class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource,
                                                               SIRenderpassTraits> {
};

PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);

using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
69
sync_timestamp(SIRenderpassDataSource::TraceContext & ctx,struct si_ds_device * device)70 static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
71 {
72 uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
73 uint64_t gpu_ts;
74
75 struct si_context *sctx = container_of(device, struct si_context, ds);
76 gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
77
78
79 cpu_ts = perfetto::base::GetBootTimeNs().count();
80
81 if (cpu_ts < device->next_clock_sync_ns)
82 return;
83
84 PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
85
86 device->sync_gpu_ts = gpu_ts;
87 device->next_clock_sync_ns = cpu_ts + 1000000000ull;
88 MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
89 EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
90 }
91
/* Emit the interned descriptors perfetto needs before any render-stage
 * event: one graphics-context entry (pid + API) and two GpuSpecifications
 * per queue/stage pair.  Called whenever the data source's incremental
 * state was cleared by the tracing service.
 */
static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx,
                             struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   /* Restart event ids and drop any in-flight begin timestamps: events that
    * straddle a state clear can no longer be matched with their end.
    */
   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      /* Mark this packet as the start of a fresh interning generation. */
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         /* Describe the API context these events belong to. */
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IID picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are order
                * by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(),
                        queue->name, s, si_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               /* The plain stage name (queue/compute/draw). */
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   /* Force an immediate clock re-sync so the new generation has one. */
   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
156
/* Signature of the generated trace_payload_as_extra_si_* helpers that append
 * a tracepoint's payload fields to a GpuRenderStageEvent as extra data.
 */
typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *,
                                            const void*);
159
begin_event(struct si_ds_queue * queue,uint64_t ts_ns,enum si_ds_queue_stage stage_id)160 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
161 {
162 PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
163 uint32_t level = queue->stages[stage_id].level;
164 /* If we haven't managed to calibrate the alignment between GPU and CPU
165 * timestamps yet, then skip this trace, otherwise perfetto won't know
166 * what to do with it.
167 */
168 if (!queue->device->sync_gpu_ts) {
169 queue->stages[stage_id].start_ns[level] = 0;
170 return;
171 }
172
173 if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
174 return;
175
176 queue->stages[stage_id].start_ns[level] = ts_ns;
177 queue->stages[stage_id].level++;
178 }
179
/* Close the stage event opened by begin_event() and emit a
 * GpuRenderStageEvent packet covering [start_ns, ts_ns] on the GPU clock.
 *
 * app_event: when non-NULL this is an application debug marker and a stage
 * iid is interned for the marker string; otherwise the driver's per-stage
 * iid is used.  payload/payload_as_extra optionally attach tracepoint
 * payload fields to the event.
 */
static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
                      trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   /* Unbalanced end (the begin was dropped) — nothing to close. */
   if (queue->stages[stage_id].level == 0)
      return;

   /* Pop the most recent begin timestamp for this stage. */
   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   /* Drop events whose begin was skipped or whose timestamps are inverted. */
   if (!start_ns || start_ns > ts_ns)
      return;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-emit interned descriptors after the service cleared state. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * have use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
         tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
         stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      /* Events are timestamped on the GPU clock registered in sync_timestamp. */
      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload);
      }
   });

   /* Slot consumed; clear it for the next begin at this nesting level. */
   stage->start_ns[level] = 0;
}
246
247 #endif /* HAVE_PERFETTO */
248
249 #ifdef __cplusplus
250 extern "C" {
251 #endif
252
253 #ifdef HAVE_PERFETTO
254
255 /*
256 * Trace callbacks, called from u_trace once the timestamps from GPU have been
257 * collected.
258 */
259
/* Generate the begin/end u_trace callbacks for one driver tracepoint pair:
 * si_ds_begin_<event> records the GPU timestamp as the stage start, and
 * si_ds_end_<event> closes the stage, attaching the end payload as extra
 * data via the generated trace_payload_as_extra_si_end_<event> helper.
 */
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage) \
   void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
                                 const void *flush_data, \
                                 const struct trace_si_begin_##event_name *payload, \
                                 const void *indirect_data) \
   { \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
      begin_event(flush->queue, ts_ns, stage); \
   } \
   \
   void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
                               const void *flush_data, \
                               const struct trace_si_end_##event_name *payload, \
                               const void *indirect_data) \
   { \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
      end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload, \
                (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name); \
   } \

CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
282
283 uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
284 {
285 return perfetto::base::GetBootTimeNs().count();
286 }
287
/* Emit a VulkanApiEvent/VkQueueSubmit packet for the submission started at
 * start_ts.  When tracing is not active, resets the GPU/CPU clock
 * calibration so it is redone on the next active trace.
 */
void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   if (!u_trace_should_process(&queue->device->trace_context)) {
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-emit interned descriptors after the service cleared state. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      /* The queue pointer doubles as a stable identifier for the UI. */
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}
319
320 #endif /* HAVE_PERFETTO */
321
/* One-time process-wide setup: initialize the perfetto glue, then register
 * the "gpu.renderstages.amd" data source.  Invoked through call_once() from
 * si_driver_ds_init().
 */
static void si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}
331
static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
/* Monotonically increasing interning-id allocator; starts at 1 so 0 can act
 * as "no id".  NOTE(review): not atomic — presumably device/queue creation
 * is serialized by the caller; confirm before using from multiple threads.
 */
static uint64_t iid = 1;

/* Return the next unused interning id. */
static uint64_t get_iid()
{
   return iid++;
}
339
si_pps_clock_id(uint32_t gpu_id)340 static uint32_t si_pps_clock_id(uint32_t gpu_id)
341 {
342 char buf[40];
343 snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
344
345 return _mesa_hash_string(buf) | 0x80000000;
346 }
347
/* Public entry point: perform the one-time data-source registration, then
 * refresh the GPU tracepoint configuration.  Safe to call multiple times.
 */
void si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}
353
si_ds_device_init(struct si_ds_device * device,const struct radeon_info * devinfo,uint32_t gpu_id,enum amd_ds_api api)354 void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
355 uint32_t gpu_id, enum amd_ds_api api)
356 {
357 device->gpu_id = gpu_id;
358 device->gpu_clock_id = si_pps_clock_id(gpu_id);
359 device->info = devinfo;
360 device->iid = get_iid();
361 device->api = api;
362 list_inithead(&device->queues);
363 }
364
/* Tear down the device's u_trace context.  Queues are owned by their
 * creators and are not freed here.
 */
void si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
369
si_ds_device_init_queue(struct si_ds_device * device,struct si_ds_queue * queue,const char * fmt_name,...)370 struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device,
371 struct si_ds_queue *queue,
372 const char *fmt_name, ...)
373 {
374 va_list ap;
375 queue->device = device;
376
377 va_start(ap, fmt_name);
378 vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
379 va_end(ap);
380
381 for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
382 queue->stages[s].queue_iid = get_iid();
383 queue->stages[s].stage_iid = get_iid();
384 }
385
386 list_add(&queue->link, &device->queues);
387
388 return queue;
389 }
390
si_ds_flush_data_init(struct si_ds_flush_data * data,struct si_ds_queue * queue,uint64_t submission_id)391 void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
392 uint64_t submission_id)
393 {
394 memset(data, 0, sizeof(*data));
395
396 data->queue = queue;
397 data->submission_id = submission_id;
398
399 u_trace_init(&data->trace, &queue->device->trace_context);
400 }
401
/* Release the u_trace stream created by si_ds_flush_data_init().  The
 * si_ds_flush_data storage itself is owned by the caller.
 */
void si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
406
407 #ifdef __cplusplus
408 }
409 #endif
410