/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

#include "ac_perfcounter.h"

struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS];
};

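/* Describes where one user-requested counter lives in the result buffer.
 * Results are laid out as an array of uint64 slots; a counter that is
 * broadcast across shader engines and/or instances owns several slots
 * ('qwords' of them, 'stride' apart), which are summed at readback. */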
struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};

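/* Select which shader engine and block instance the following perf counter
 * register writes are routed to, via GRBM_GFX_INDEX. Negative se/instance
 * values mean "broadcast to all". */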
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->gfx_level >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
   radeon_end();
}

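/* Program the mask of shader stages sampled by the SQ counters (the low 7
 * bits of SQ_PERFCOUNTER_CTRL); the second register of the pair is written
 * with all-ones. */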
void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}

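/* Program the event select registers for 'count' counters of one block.
 * Blocks without select registers are "fake" counters whose values are
 * synthesized at read time instead (see si_pc_emit_read). */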
static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   radeon_begin(cs);

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_reg_seq(regs->select0[idx], 1);
      radeon_emit(selectors[idx] | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(regs->select1[idx], 1);
      radeon_emit(0);
   }

   radeon_end();
}

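/* Reset and start the CP perfmon counters. This also writes a 1 to the fence
 * dword at 'va'; si_pc_emit_stop later clears it from the bottom of the pipe
 * and waits on it, so the counters are only sampled once prior work has
 * drained. */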
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_event_write(V_028A90_PERFCOUNTER_STOP);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   radeon_event_write(V_028A90_PERFCOUNTER_SAMPLE);

   if (!sctx->screen->info.never_send_perfcounter_stop)
      radeon_event_write(V_028A90_PERFCOUNTER_STOP);

   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
      S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}

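/* Start/stop/reset helpers for the streaming perf monitor (SPM) counters,
 * which sample continuously into a ring buffer instead of being read back
 * explicitly. */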
void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);

   /* Start SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
   /* Start windowed performance counters. */
   radeon_event_write(V_028A90_PERFCOUNTER_START);
   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));

   radeon_end();
}

void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_counters,
                         bool never_send_perfcounter_stop)
{
   radeon_begin(cs);

   /* Stop windowed performance counters. */
   if (!never_send_perfcounter_stop)
      radeon_event_write(V_028A90_PERFCOUNTER_STOP);

   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(never_stop_sq_perf_counters ?
                             V_036020_STRM_PERFMON_STATE_START_COUNTING :
                             V_036020_STRM_PERFMON_STATE_STOP_COUNTING));

   radeon_end();
}

void si_pc_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_end();
}

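/* Read the current counter values of one block back into the results buffer
 * at 'va', one 64-bit COPY_DATA per counter from perf register space. For
 * fake counters (no select registers), zeros are written instead. */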
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2);
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

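/* Keep the perfmon clocks running while counters are active; clock gating
 * could presumably stall the counters otherwise. Not needed on GFX11+, and
 * there is no such register before GFX8. */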
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (sctx->gfx_level >= GFX11)
      return;

   radeon_begin(&sctx->gfx_cs);

   if (sctx->gfx_level >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->gfx_level >= GFX8) {
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}

static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

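/* Stop the counters and emit the readback: each group is read once per
 * shader engine and per instance it covers, since the counter registers are
 * selected through GRBM_GFX_INDEX. */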
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

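/* Accumulate every slot belonging to a counter into one uint64 result. Note
 * that only the low 32 bits of each 64-bit slot are summed; the values
 * sampled by si_pc_emit_read are presumably only valid as 32-bit counts. */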
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

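/* Look up (or create) the query group for (block, sub group id). For shader
 * blocks, the sub group id also encodes the shader stage selection, and all
 * counters in one batch query must agree on it. */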
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      /* A non-zero value in query->shaders ensures that the shader
       * masking is reset unless the user explicitly requests one. */
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

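/* Create one batch query from a list of perf counter query types. A rough
 * usage sketch from the state tracker's side (the types[] values are
 * hypothetical; real ones are enumerated with si_get_perfcounter_info, and
 * the result storage must hold num_queries * sizeof(result->batch[0]) bytes,
 * similar to what the HUD does):
 *
 *    unsigned types[2] = { SI_QUERY_FIRST_PERFCOUNTER,
 *                          SI_QUERY_FIRST_PERFCOUNTER + 1 };
 *    struct pipe_query *q = ctx->create_batch_query(ctx, 2, types);
 *    ctx->begin_query(ctx, q);
 *    ... draws to profile ...
 *    ctx->end_query(ctx, q);
 *    union pipe_query_result *res = CALLOC(2, sizeof(res->batch[0]));
 *    if (ctx->get_query_result(ctx, q, true, res)) {
 *       ... res->batch[0].u64 and res->batch[1].u64 hold the counts ...
 *    }
 *    ctx->destroy_query(ctx, q);
 */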
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == AC_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   if (!query->counters)
      goto error;

   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct ac_pc_block *block;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

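/* Gallium driver-query entry point: with info == NULL, return how many perf
 * counter queries exist; otherwise describe query 'index'. */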
int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->base.num_groups;

   block = ac_lookup_group(&pc->base, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;

   if (!pc)
      return;

   ac_destroy_perfcounters(&pc->base);
   FREE(pc);
   screen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
   bool separate_se, separate_instance;

   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!screen->perfcounters)
      return;

   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   screen->perfcounters->num_instance_cs_dwords = 3;

   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
                             &screen->perfcounters->base)) {
      si_destroy_perfcounters(screen);
   }
}

static bool
si_spm_init_bo(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */

   sctx->spm.buffer_size = size;
   sctx->spm.sample_interval = 4096; /* Default to 4096 clk. */

   sctx->spm.bo = ws->buffer_create(
      ws, size, 4096,
      RADEON_DOMAIN_GTT,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
         RADEON_FLAG_GTT_WC |
         RADEON_FLAG_NO_SUBALLOC);

   return sctx->spm.bo != NULL;
}

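/* Program the select registers of every counter chosen for SPM: first the
 * per-SE SQG counters, then all other selected block instances, then restore
 * register broadcasting. */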
static void
si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;

   radeon_begin(cs);

   for (uint32_t instance = 0; instance < ARRAY_SIZE(spm->sqg); instance++) {
      uint32_t num_counters = spm->sqg[instance].num_counters;

      if (!num_counters)
         continue;

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1) |
                             S_030800_SE_INDEX(instance));

      for (uint32_t b = 0; b < num_counters; b++) {
         const struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[b];
         uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

         radeon_set_uconfig_reg_seq(reg_base + b * 4, 1);
         radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
      }
   }

   for (uint32_t b = 0; b < spm->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      for (unsigned i = 0; i < block_sel->num_instances; i++) {
         struct ac_spm_block_instance *block_instance = &block_sel->instances[i];

         radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, block_instance->grbm_gfx_index);

         for (unsigned c = 0; c < block_instance->num_counters; c++) {
            const struct ac_spm_counter_select *cntr_sel = &block_instance->counters[c];

            if (!cntr_sel->active)
               continue;

            radeon_set_uconfig_reg_seq(regs->select0[c], 1);
            radeon_emit(cntr_sel->sel0);

            radeon_set_uconfig_reg_seq(regs->select1[c], 1);
            radeon_emit(cntr_sel->sel1);
         }
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   radeon_end();
}

#define SPM_RING_BASE_ALIGN 32

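/* Program the whole RLC SPM state: the ring buffer it samples into, the
 * sample interval, the per-segment sizes in muxsel lines, and the muxsel
 * RAMs that route counter outputs into ring samples. */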
void
si_emit_spm_setup(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;
   uint64_t va = sctx->screen->ws->buffer_get_virtual_address(spm->bo);
   uint64_t ring_size = spm->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm->sample_interval >= 32);

   radeon_begin(cs);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                          S_037200_PERFMON_SAMPLE_INTERVAL(spm->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
                          S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(R_03726C_RLC_SPM_ACCUM_MODE, 0);
   radeon_set_uconfig_reg(R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
   radeon_set_uconfig_reg(R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                          S_03727C_SE0_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE0]) |
                          S_03727C_SE1_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE1]) |
                          S_03727C_SE2_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE2]) |
                          S_03727C_SE3_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE3]));
   radeon_set_uconfig_reg(R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                          S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                          S_037280_GLOBAL_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL]));

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg(rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
                     S_370_WR_CONFIRM(1) |
                     S_370_ENGINE_SEL(V_370_ME) |
                     S_370_WR_ONE_ADDR(1));
         radeon_emit(rlc_muxsel_data >> 2);
         radeon_emit(0);
         radeon_emit_array(data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }
   radeon_end();

   /* Select SPM counters. */
   si_emit_spm_counters(sctx, cs);
}

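/* One-time SPM setup: build the perfcounter/SPM state and allocate the
 * sampling ring buffer. Counter programming itself happens later, in
 * si_emit_spm_setup. */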
bool
si_spm_init(struct si_context *sctx)
{
   const struct radeon_info *info = &sctx->screen->info;

   sctx->screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!sctx->screen->perfcounters)
      return false;

   sctx->screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(sctx->screen);
   sctx->screen->perfcounters->num_instance_cs_dwords = 3;

   struct ac_perfcounters *pc = &sctx->screen->perfcounters->base;

   if (!ac_init_perfcounters(info, false, false, pc))
      return false;

   if (!ac_init_spm(info, pc, &sctx->spm))
      return false;

   if (!si_spm_init_bo(sctx))
      return false;

   return true;
}

void
si_spm_finish(struct si_context *sctx)
{
   struct pb_buffer_lean *bo = sctx->spm.bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   ac_destroy_spm(&sctx->spm);
}