/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

#include "ac_perfcounter.h"

struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};

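/* Point the following uconfig register writes at a specific shader engine and
 * block instance via GRBM_GFX_INDEX; negative values select broadcast mode. */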
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->gfx_level >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
   radeon_end();
}

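/* Restrict SQ performance counters to the shader stages given in the mask
 * (lower bits of SQ_PERFCOUNTER_CTRL). */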
void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}

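/* Program the per-counter SELECT registers of a block and clear its SPM
 * (select1) registers. Blocks without select0 registers are software ("fake")
 * counters and need no programming. */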
static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   radeon_begin(cs);

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_reg_seq(regs->select0[idx], 1);
      radeon_emit(selectors[idx] | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(regs->select1[idx], 1);
      radeon_emit(0);
   }

   radeon_end();
}

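/* Write a non-zero fence value at 'va', then reset the perfmon unit and start
 * counting. si_pc_emit_stop clears the fence from an EOP event and waits on it
 * before sampling the counters. */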
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_event_write(V_028A90_PERFCOUNTER_STOP);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   radeon_event_write(V_028A90_PERFCOUNTER_SAMPLE);

   if (!sctx->screen->info.never_send_perfcounter_stop)
      radeon_event_write(V_028A90_PERFCOUNTER_STOP);

   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
         S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}

void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);

   /* Start SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
   /* Start windowed performance counters. */
   radeon_event_write(V_028A90_PERFCOUNTER_START);
   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));

   radeon_end();
}

void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_counters,
                         bool never_send_perfcounter_stop)
{
   radeon_begin(cs);

   /* Stop windowed performance counters. */
   if (!never_send_perfcounter_stop)
      radeon_event_write(V_028A90_PERFCOUNTER_STOP);

   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(never_stop_sq_perf_counters ?
                                                        V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                        V_036020_STRM_PERFMON_STATE_STOP_COUNTING));

   radeon_end();
}

void si_pc_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_end();
}


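/* Copy the current value of each selected counter of a block to memory, one
 * 64-bit slot per counter. Blocks without real counter registers ("fake"
 * counters) get zeros written instead. */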
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2);
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

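/* Toggle clock gating of the perfmon clocks via RLC_PERFMON_CLK_CNTL (GFX8 and
 * newer); not used on GFX11. */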
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (sctx->gfx_level >= GFX11)
      return;

   radeon_begin(&sctx->gfx_cs);

   if (sctx->gfx_level >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->gfx_level >= GFX8) {
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}

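/* Allocate result space, program the counter selects of every group and start
 * the counters. */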
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
/*
                                   struct si_query_hw *hwquery,
                                   struct si_resource *buffer, uint64_t va)*/
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

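/* Stop the counters and read the results of every group back into the query
 * buffer, iterating over all SEs and instances that were selected in broadcast
 * mode. */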
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

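/* Accumulate one snapshot of results into the batch result. Counters that were
 * read from multiple SEs/instances are summed over all of their qwords. */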
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

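/* Return the query group for (block, sub_gid), creating it on demand. The SE
 * index, instance index and shader mask of a new group are derived from the
 * sub group id. */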
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

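/* Create a batch query from a list of perfcounter query types: collect the
 * selectors of each group, compute the result layout and required CS space,
 * then map every user-supplied counter to its position in the result buffer. */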
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == AC_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct ac_pc_block *block;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

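/* Without 'info', return the total number of available perfcounter queries;
 * otherwise fill in the driver query description for the counter at 'index'. */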
int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->base.num_groups;

   block = ac_lookup_group(&pc->base, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;

   if (!pc)
      return;

   ac_destroy_perfcounters(&pc->base);
   FREE(pc);
   screen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
   bool separate_se, separate_instance;

   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!screen->perfcounters)
      return;

   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   screen->perfcounters->num_instance_cs_dwords = 3;

   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
                             &screen->perfcounters->base)) {
      si_destroy_perfcounters(screen);
   }
}

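/* Allocate the ring buffer that the RLC streams SPM counter samples into. */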
static bool
si_spm_init_bo(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */

   sctx->spm.buffer_size = size;
   sctx->spm.sample_interval = 4096; /* Default to 4096 clk. */

   sctx->spm.bo = ws->buffer_create(
      ws, size, 4096,
      RADEON_DOMAIN_GTT,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
      RADEON_FLAG_GTT_WC |
      RADEON_FLAG_NO_SUBALLOC);

   return sctx->spm.bo != NULL;
}


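/* Program the counter select registers for all enabled SPM counters: first the
 * per-SE SQG counters, then every other selected block instance. */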
static void
si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;

   radeon_begin(cs);

   for (uint32_t instance = 0; instance < ARRAY_SIZE(spm->sqg); instance++) {
      uint32_t num_counters = spm->sqg[instance].num_counters;

      if (!num_counters)
         continue;

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1) |
                             S_030800_SE_INDEX(instance));

      for (uint32_t b = 0; b < num_counters; b++) {
         const struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[b];
         uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

         radeon_set_uconfig_reg_seq(reg_base + b * 4, 1);
         radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
      }
   }

   for (uint32_t b = 0; b < spm->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      for (unsigned i = 0; i < block_sel->num_instances; i++) {
         struct ac_spm_block_instance *block_instance = &block_sel->instances[i];

         radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, block_instance->grbm_gfx_index);

         for (unsigned c = 0; c < block_instance->num_counters; c++) {
            const struct ac_spm_counter_select *cntr_sel = &block_instance->counters[c];

            if (!cntr_sel->active)
               continue;

            radeon_set_uconfig_reg_seq(regs->select0[c], 1);
            radeon_emit(cntr_sel->sel0);

            radeon_set_uconfig_reg_seq(regs->select1[c], 1);
            radeon_emit(cntr_sel->sel1);
         }
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   radeon_end();
}

#define SPM_RING_BASE_ALIGN 32

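/* Configure the RLC SPM ring buffer, the per-SE and global segment sizes, and
 * upload the muxsel ram of each segment, then program the SPM counter selects. */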
void
si_emit_spm_setup(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;
   uint64_t va = sctx->screen->ws->buffer_get_virtual_address(spm->bo);
   uint64_t ring_size = spm->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm->sample_interval >= 32);

   radeon_begin(cs);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                          S_037200_PERFMON_SAMPLE_INTERVAL(spm->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
                          S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(R_03726C_RLC_SPM_ACCUM_MODE, 0);
   radeon_set_uconfig_reg(R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
   radeon_set_uconfig_reg(R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                          S_03727C_SE0_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE0]) |
                          S_03727C_SE1_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE1]) |
                          S_03727C_SE2_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE2]) |
                          S_03727C_SE3_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE3]));
   radeon_set_uconfig_reg(R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                          S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                          S_037280_GLOBAL_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL]));

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg(rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
                     S_370_WR_CONFIRM(1) |
                     S_370_ENGINE_SEL(V_370_ME) |
                     S_370_WR_ONE_ADDR(1));
         radeon_emit(rlc_muxsel_data >> 2);
         radeon_emit(0);
         radeon_emit_array(data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }
   radeon_end();

   /* Select SPM counters. */
   si_emit_spm_counters(sctx, cs);
}

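/* Initialize SPM: create the perfcounter state, build the counter/muxsel
 * configuration and allocate the output buffer. */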
bool
si_spm_init(struct si_context *sctx)
{
   const struct radeon_info *info = &sctx->screen->info;

   sctx->screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   sctx->screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(sctx->screen);
   sctx->screen->perfcounters->num_instance_cs_dwords = 3;

   struct ac_perfcounters *pc = &sctx->screen->perfcounters->base;

   if (!ac_init_perfcounters(info, false, false, pc))
      return false;

   if (!ac_init_spm(info, pc, &sctx->spm))
      return false;

   if (!si_spm_init_bo(sctx))
      return false;

   return true;
}

void
si_spm_finish(struct si_context *sctx)
{
   struct pb_buffer_lean *bo = sctx->spm.bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   ac_destroy_spm(&sctx->spm);
}