xref: /aosp_15_r20/external/mesa3d/src/amd/common/ac_spm.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2021 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_spm.h"
8 
9 #include "util/bitscan.h"
10 #include "util/u_memory.h"
11 #include "ac_perfcounter.h"
12 
/* SPM counters definition. */
/* GFX10+ */
/* Each descriptor is {GPU block, hw event selector ID}. The event IDs are
 * hw-specific selector values; their meanings are reflected in the variable
 * names (cache hits/misses) — see the AMD perf-counter event lists. */
static struct ac_spm_counter_descr gfx10_num_l2_hits = {TCP, 0x9};
static struct ac_spm_counter_descr gfx10_num_l2_misses = {TCP, 0x12};
static struct ac_spm_counter_descr gfx10_num_scache_hits = {SQ, 0x14f};
static struct ac_spm_counter_descr gfx10_num_scache_misses = {SQ, 0x150};
static struct ac_spm_counter_descr gfx10_num_scache_misses_dup = {SQ, 0x151};
static struct ac_spm_counter_descr gfx10_num_icache_hits = {SQ, 0x12c};
static struct ac_spm_counter_descr gfx10_num_icache_misses = {SQ, 0x12d};
static struct ac_spm_counter_descr gfx10_num_icache_misses_dup = {SQ, 0x12e};
static struct ac_spm_counter_descr gfx10_num_gl1c_hits = {GL1C, 0xe};
static struct ac_spm_counter_descr gfx10_num_gl1c_misses = {GL1C, 0x12};
static struct ac_spm_counter_descr gfx10_num_gl2c_hits = {GL2C, 0x3};
static struct ac_spm_counter_descr gfx10_num_gl2c_misses = {GL2C, 0x23};

/* Default set of SPM counters enabled on GFX10. */
static struct ac_spm_counter_create_info gfx10_spm_counters[] = {
   {&gfx10_num_l2_hits},
   {&gfx10_num_l2_misses},
   {&gfx10_num_scache_hits},
   {&gfx10_num_scache_misses},
   {&gfx10_num_scache_misses_dup},
   {&gfx10_num_icache_hits},
   {&gfx10_num_icache_misses},
   {&gfx10_num_icache_misses_dup},
   {&gfx10_num_gl1c_hits},
   {&gfx10_num_gl1c_misses},
   {&gfx10_num_gl2c_hits},
   {&gfx10_num_gl2c_misses},
};
42 
/* GFX10.3+ */
/* GL2C miss event moved selector IDs on GFX10.3; everything else is shared
 * with the GFX10 set. */
static struct ac_spm_counter_descr gfx103_num_gl2c_misses = {GL2C, 0x2b};

/* Default set of SPM counters enabled on GFX10.3. */
static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
   {&gfx10_num_l2_hits},
   {&gfx10_num_l2_misses},
   {&gfx10_num_scache_hits},
   {&gfx10_num_scache_misses},
   {&gfx10_num_scache_misses_dup},
   {&gfx10_num_icache_hits},
   {&gfx10_num_icache_misses},
   {&gfx10_num_icache_misses_dup},
   {&gfx10_num_gl1c_hits},
   {&gfx10_num_gl1c_misses},
   {&gfx10_num_gl2c_hits},
   {&gfx103_num_gl2c_misses},
};
60 
/* GFX11+ */
/* On GFX11, the scalar/instruction cache counters live in the per-WGP SQ
 * block (SQ_WGP) and the TCP miss selector changed. */
static struct ac_spm_counter_descr gfx11_num_l2_misses = {TCP, 0x11};
static struct ac_spm_counter_descr gfx11_num_scache_hits = {SQ_WGP, 0x126};
static struct ac_spm_counter_descr gfx11_num_scache_misses = {SQ_WGP, 0x127};
static struct ac_spm_counter_descr gfx11_num_scache_misses_dup = {SQ_WGP, 0x128};
static struct ac_spm_counter_descr gfx11_num_icache_hits = {SQ_WGP, 0x10e};
static struct ac_spm_counter_descr gfx11_num_icache_misses = {SQ_WGP, 0x10f};
static struct ac_spm_counter_descr gfx11_num_icache_misses_dup = {SQ_WGP, 0x110};

/* Default set of SPM counters enabled on GFX11. */
static struct ac_spm_counter_create_info gfx11_spm_counters[] = {
   {&gfx10_num_l2_hits},
   {&gfx11_num_l2_misses},
   {&gfx11_num_scache_hits},
   {&gfx11_num_scache_misses},
   {&gfx11_num_scache_misses_dup},
   {&gfx11_num_icache_hits},
   {&gfx11_num_icache_misses},
   {&gfx11_num_icache_misses_dup},
   {&gfx10_num_gl1c_hits},
   {&gfx10_num_gl1c_misses},
   {&gfx10_num_gl2c_hits},
   {&gfx103_num_gl2c_misses},
};
84 
85 static struct ac_spm_block_select *
ac_spm_get_block_select(struct ac_spm * spm,const struct ac_pc_block * block)86 ac_spm_get_block_select(struct ac_spm *spm, const struct ac_pc_block *block)
87 {
88    struct ac_spm_block_select *block_sel, *new_block_sel;
89    uint32_t num_block_sel;
90 
91    for (uint32_t i = 0; i < spm->num_block_sel; i++) {
92       if (spm->block_sel[i].b->b->b->gpu_block == block->b->b->gpu_block)
93          return &spm->block_sel[i];
94    }
95 
96    /* Allocate a new select block if it doesn't already exist. */
97    num_block_sel = spm->num_block_sel + 1;
98    block_sel = realloc(spm->block_sel, num_block_sel * sizeof(*block_sel));
99    if (!block_sel)
100       return NULL;
101 
102    spm->num_block_sel = num_block_sel;
103    spm->block_sel = block_sel;
104 
105    /* Initialize the new select block. */
106    new_block_sel = &spm->block_sel[spm->num_block_sel - 1];
107    memset(new_block_sel, 0, sizeof(*new_block_sel));
108 
109    new_block_sel->b = block;
110    new_block_sel->instances =
111       calloc(block->num_global_instances, sizeof(*new_block_sel->instances));
112    if (!new_block_sel->instances)
113       return NULL;
114    new_block_sel->num_instances = block->num_global_instances;
115 
116    for (unsigned i = 0; i < new_block_sel->num_instances; i++)
117       new_block_sel->instances[i].num_counters = block->b->b->num_spm_counters;
118 
119    return new_block_sel;
120 }
121 
/* Physical location of one counter instance on the chip, decoded from the
 * flat instance ID by ac_spm_init_instance_mapping(). */
struct ac_spm_instance_mapping {
   uint32_t se_index;         /* SE index or 0 if global */
   uint32_t sa_index;         /* SA index or 0 if global or per-SE */
   uint32_t instance_index;   /* Instance within the (SE, SA) scope. */
};
127 
128 static bool
ac_spm_init_instance_mapping(const struct radeon_info * info,const struct ac_pc_block * block,const struct ac_spm_counter_info * counter,struct ac_spm_instance_mapping * mapping)129 ac_spm_init_instance_mapping(const struct radeon_info *info,
130                              const struct ac_pc_block *block,
131                              const struct ac_spm_counter_info *counter,
132                              struct ac_spm_instance_mapping *mapping)
133 {
134    uint32_t instance_index = 0, se_index = 0, sa_index = 0;
135 
136    if (block->b->b->flags & AC_PC_BLOCK_SE) {
137       if (block->b->b->gpu_block == SQ) {
138          /* Per-SE blocks. */
139          se_index = counter->instance / block->num_instances;
140          instance_index = counter->instance % block->num_instances;
141       } else {
142          /* Per-SA blocks. */
143          assert(block->b->b->gpu_block == GL1C ||
144                 block->b->b->gpu_block == TCP ||
145                 block->b->b->gpu_block == SQ_WGP);
146          se_index = (counter->instance / block->num_instances) / info->max_sa_per_se;
147          sa_index = (counter->instance / block->num_instances) % info->max_sa_per_se;
148          instance_index = counter->instance % block->num_instances;
149       }
150    } else {
151       /* Global blocks. */
152       assert(block->b->b->gpu_block == GL2C);
153       instance_index = counter->instance;
154    }
155 
156    if (se_index >= info->num_se ||
157        sa_index >= info->max_sa_per_se ||
158        instance_index >= block->num_instances)
159       return false;
160 
161    mapping->se_index = se_index;
162    mapping->sa_index = sa_index;
163    mapping->instance_index = instance_index;
164 
165    return true;
166 }
167 
168 static void
ac_spm_init_muxsel(const struct radeon_info * info,const struct ac_pc_block * block,const struct ac_spm_instance_mapping * mapping,struct ac_spm_counter_info * counter,uint32_t spm_wire)169 ac_spm_init_muxsel(const struct radeon_info *info,
170                    const struct ac_pc_block *block,
171                    const struct ac_spm_instance_mapping *mapping,
172                    struct ac_spm_counter_info *counter,
173                    uint32_t spm_wire)
174 {
175    const uint16_t counter_idx = 2 * spm_wire + (counter->is_even ? 0 : 1);
176    union ac_spm_muxsel *muxsel = &counter->muxsel;
177 
178    if (info->gfx_level >= GFX11) {
179       muxsel->gfx11.counter = counter_idx;
180       muxsel->gfx11.block = block->b->b->spm_block_select;
181       muxsel->gfx11.shader_array = mapping->sa_index;
182       muxsel->gfx11.instance = mapping->instance_index;
183    } else {
184       muxsel->gfx10.counter = counter_idx;
185       muxsel->gfx10.block = block->b->b->spm_block_select;
186       muxsel->gfx10.shader_array = mapping->sa_index;
187       muxsel->gfx10.instance = mapping->instance_index;
188    }
189 }
190 
/**
 * Build the GRBM_GFX_INDEX register value that selects the given block
 * instance (SE/SA/instance indices, with broadcast bits for global and
 * per-SE blocks).
 */
static uint32_t
ac_spm_init_grbm_gfx_index(const struct ac_pc_block *block,
                           const struct ac_spm_instance_mapping *mapping)
{
   uint32_t instance = mapping->instance_index;
   uint32_t grbm_gfx_index = 0;

   grbm_gfx_index |= S_030800_SE_INDEX(mapping->se_index) |
                     S_030800_SH_INDEX(mapping->sa_index);

   switch (block->b->b->gpu_block) {
   case GL2C:
      /* Global blocks. */
      grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);
      break;
   case SQ:
      /* Per-SE blocks. */
      grbm_gfx_index |= S_030800_SH_BROADCAST_WRITES(1);
      break;
   default:
      /* Other blocks shouldn't broadcast. */
      break;
   }

   if (block->b->b->gpu_block == SQ_WGP) {
      /* SQ_WGP expects a packed instance encoding instead of a plain
       * instance index: a WGP number plus a flag telling whether the WGP
       * sits below the SPI. */
      union {
         struct {
            uint32_t block_index : 2; /* Block index within WGP */
            uint32_t wgp_index : 3;
            uint32_t is_below_spi : 1; /* 0: lower WGP numbers, 1: higher WGP numbers */
            uint32_t reserved : 26;
         };

         uint32_t value;
      } instance_index = {0};

      /* NOTE(review): assumes 4 WGPs above the SPI on all supported chips —
       * confirm for future parts. */
      const uint32_t num_wgp_above_spi = 4;
      const bool is_below_spi = mapping->instance_index >= num_wgp_above_spi;

      instance_index.wgp_index =
         is_below_spi ? (mapping->instance_index - num_wgp_above_spi) : mapping->instance_index;
      instance_index.is_below_spi = is_below_spi;

      instance = instance_index.value;
   }

   grbm_gfx_index |= S_030800_INSTANCE_INDEX(instance);

   return grbm_gfx_index;
}
241 
/**
 * Map a counter onto a free hardware counter-select slot of its block
 * instance, programming the select registers as a side effect.
 *
 * On success, sets counter->is_even (which 16-bit half of the wire the
 * counter uses) and writes the wire index to *spm_wire, then returns true.
 * Returns false when no free slot is available.
 */
static bool
ac_spm_map_counter(struct ac_spm *spm, struct ac_spm_block_select *block_sel,
                   struct ac_spm_counter_info *counter,
                   const struct ac_spm_instance_mapping *mapping,
                   uint32_t *spm_wire)
{
   uint32_t instance = counter->instance;

   if (block_sel->b->b->b->gpu_block == SQ_WGP) {
      /* Lazily compute the GRBM_GFX_INDEX value for this WGP instance. */
      if (!spm->sq_wgp[instance].grbm_gfx_index) {
         spm->sq_wgp[instance].grbm_gfx_index =
            ac_spm_init_grbm_gfx_index(block_sel->b, mapping);
      }

      /* Find the first unused select slot (slots [0, num_counters) are
       * already taken). */
      for (unsigned i = 0; i < ARRAY_SIZE(spm->sq_wgp[instance].counters); i++) {
         struct ac_spm_counter_select *cntr_sel = &spm->sq_wgp[instance].counters[i];

         if (i < spm->sq_wgp[instance].num_counters)
            continue;

         cntr_sel->sel0 |= S_036700_PERF_SEL(counter->event_id) |
                           S_036700_SPM_MODE(1) | /* 16-bit clamp */
                           S_036700_PERF_MODE(0);

         /* Each SQ_WGP module (GFX11+) shares one 32-bit accumulator/wire
          * per pair of selects.
          */
         cntr_sel->active |= 1 << (i % 2);
         *spm_wire = i / 2;

         /* Even select index -> even (lower) 16-bit half of the wire. */
         if (cntr_sel->active & 0x1)
            counter->is_even = true;

         spm->sq_wgp[instance].num_counters++;
         return true;
      }
   } else if (block_sel->b->b->b->gpu_block == SQ) {
      /* Find the first unused SQG select slot. */
      for (unsigned i = 0; i < ARRAY_SIZE(spm->sqg[instance].counters); i++) {
         struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[i];

         if (i < spm->sqg[instance].num_counters)
            continue;

         /* SQ doesn't support 16-bit counters. */
         cntr_sel->sel0 |= S_036700_PERF_SEL(counter->event_id) |
                           S_036700_SPM_MODE(3) | /* 32-bit clamp */
                           S_036700_PERF_MODE(0);
         cntr_sel->active |= 0x3;

         /* 32-bits counter are always even. */
         counter->is_even = true;

         /* One wire per SQ module. */
         *spm_wire = i;

         spm->sqg[instance].num_counters++;
         return true;
      }
   } else {
      /* Generic blocks. */
      struct ac_spm_block_instance *block_instance =
         &block_sel->instances[instance];

      /* Lazily compute the GRBM_GFX_INDEX value for this instance. */
      if (!block_instance->grbm_gfx_index) {
         block_instance->grbm_gfx_index =
            ac_spm_init_grbm_gfx_index(block_sel->b, mapping);
      }

      for (unsigned i = 0; i < block_instance->num_counters; i++) {
         struct ac_spm_counter_select *cntr_sel = &block_instance->counters[i];
         /* Lowest unused 16-bit sub-counter of this select (4 per select). */
         int index = ffs(~cntr_sel->active) - 1;

         switch (index) {
         case 0: /* use S_037004_PERF_SEL */
            cntr_sel->sel0 |= S_037004_PERF_SEL(counter->event_id) |
                              S_037004_CNTR_MODE(1) | /* 16-bit clamp */
                              S_037004_PERF_MODE(0); /* accum */
            break;
         case 1: /* use S_037004_PERF_SEL1 */
            cntr_sel->sel0 |= S_037004_PERF_SEL1(counter->event_id) |
                              S_037004_PERF_MODE1(0);
            break;
         case 2: /* use S_037004_PERF_SEL2 */
            cntr_sel->sel1 |= S_037008_PERF_SEL2(counter->event_id) |
                              S_037008_PERF_MODE2(0);
            break;
         case 3: /* use S_037004_PERF_SEL3 */
            cntr_sel->sel1 |= S_037008_PERF_SEL3(counter->event_id) |
                              S_037008_PERF_MODE3(0);
            break;
         default:
            /* All four sub-counters of this select are in use. */
            return false;
         }

         /* Mark this 16-bit counter as used. */
         cntr_sel->active |= 1 << index;

         /* Determine if the counter is even or odd. */
         counter->is_even = !(index % 2);

         /* Determine the SPM wire (one wire holds two 16-bit counters). */
         *spm_wire = !!(index >= 2);

         return true;
      }
   }

   return false;
}
351 
352 static bool
ac_spm_add_counter(const struct radeon_info * info,const struct ac_perfcounters * pc,struct ac_spm * spm,const struct ac_spm_counter_create_info * counter_info)353 ac_spm_add_counter(const struct radeon_info *info,
354                    const struct ac_perfcounters *pc,
355                    struct ac_spm *spm,
356                    const struct ac_spm_counter_create_info *counter_info)
357 {
358    struct ac_spm_instance_mapping instance_mapping = {0};
359    struct ac_spm_counter_info *counter;
360    struct ac_spm_block_select *block_sel;
361    struct ac_pc_block *block;
362    uint32_t spm_wire;
363 
364    /* Check if the GPU block is valid. */
365    block = ac_pc_get_block(pc, counter_info->b->gpu_block);
366    if (!block) {
367       fprintf(stderr, "ac/spm: Invalid GPU block.\n");
368       return false;
369    }
370 
371    /* Check if the number of instances is valid. */
372    if (counter_info->instance > block->num_global_instances - 1) {
373       fprintf(stderr, "ac/spm: Invalid instance ID.\n");
374       return false;
375    }
376 
377    /* Check if the event ID is valid. */
378    if (counter_info->b->event_id > block->b->selectors) {
379       fprintf(stderr, "ac/spm: Invalid event ID.\n");
380       return false;
381    }
382 
383    counter = &spm->counters[spm->num_counters];
384    spm->num_counters++;
385 
386    counter->gpu_block = counter_info->b->gpu_block;
387    counter->event_id = counter_info->b->event_id;
388    counter->instance = counter_info->instance;
389 
390    /* Get the select block used to configure the counter. */
391    block_sel = ac_spm_get_block_select(spm, block);
392    if (!block_sel)
393       return false;
394 
395    /* Initialize instance mapping for the counter. */
396    if (!ac_spm_init_instance_mapping(info, block, counter, &instance_mapping)) {
397       fprintf(stderr, "ac/spm: Failed to initialize instance mapping.\n");
398       return false;
399    }
400 
401    /* Map the counter to the select block. */
402    if (!ac_spm_map_counter(spm, block_sel, counter, &instance_mapping, &spm_wire)) {
403       fprintf(stderr, "ac/spm: No free slots available!\n");
404       return false;
405    }
406 
407    /* Determine the counter segment type. */
408    if (block->b->b->flags & AC_PC_BLOCK_SE) {
409       counter->segment_type = instance_mapping.se_index;
410    } else {
411       counter->segment_type = AC_SPM_SEGMENT_TYPE_GLOBAL;
412    }
413 
414    /* Configure the muxsel for SPM. */
415    ac_spm_init_muxsel(info, block, &instance_mapping, counter, spm_wire);
416 
417    return true;
418 }
419 
/**
 * Fill the muxsel RAM lines for one segment. Even counters are stored on
 * even lines, odd counters on odd lines (lines are interleaved, hence the
 * "+= 2" advance). "offset" is this segment's starting line within the
 * whole muxsel RAM; it is used to compute each counter's data offset in
 * the output buffer.
 */
static void
ac_spm_fill_muxsel_ram(const struct radeon_info *info,
                       struct ac_spm *spm,
                       enum ac_spm_segment_type segment_type,
                       uint32_t offset)
{
   struct ac_spm_muxsel_line *mappings = spm->muxsel_lines[segment_type];
   uint32_t even_counter_idx = 0, even_line_idx = 0;
   uint32_t odd_counter_idx = 0, odd_line_idx = 1;

   /* Add the global timestamps first. */
   if (segment_type == AC_SPM_SEGMENT_TYPE_GLOBAL) {
      if (info->gfx_level >= GFX11) {
         /* Four 16-bit timestamp slots, GFX11 muxsel encoding. */
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf840;
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf841;
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf842;
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf843;
      } else {
         /* Pre-GFX11 encoding uses the same value for all four slots. */
         for (unsigned i = 0; i < 4; i++) {
            mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf0f0;
         }
      }
   }

   for (unsigned i = 0; i < spm->num_counters; i++) {
      struct ac_spm_counter_info *counter = &spm->counters[i];

      if (counter->segment_type != segment_type)
         continue;

      if (counter->is_even) {
         /* Record where this counter's data lands in the sample buffer,
          * in units of 16-bit counters. */
         counter->offset =
            (offset + even_line_idx) * AC_SPM_NUM_COUNTER_PER_MUXSEL + even_counter_idx;

         mappings[even_line_idx].muxsel[even_counter_idx] = spm->counters[i].muxsel;
         if (++even_counter_idx == AC_SPM_NUM_COUNTER_PER_MUXSEL) {
            even_counter_idx = 0;
            even_line_idx += 2; /* skip over the interleaved odd line */
         }
      } else {
         counter->offset =
            (offset + odd_line_idx) * AC_SPM_NUM_COUNTER_PER_MUXSEL + odd_counter_idx;

         mappings[odd_line_idx].muxsel[odd_counter_idx] = spm->counters[i].muxsel;
         if (++odd_counter_idx == AC_SPM_NUM_COUNTER_PER_MUXSEL) {
            odd_counter_idx = 0;
            odd_line_idx += 2; /* skip over the interleaved even line */
         }
      }
   }
}
471 
/**
 * Initialize SPM (Streaming Performance Monitor) state.
 *
 * Selects the counter list for the chip generation, creates one counter per
 * global instance of each requested block, sizes the muxsel RAM for every
 * segment (global + per-SE) and fills it in RLC order.
 *
 * Returns false on unsupported gfx level or on allocation/mapping failure;
 * the caller is expected to clean up with ac_destroy_spm().
 */
bool ac_init_spm(const struct radeon_info *info,
                 const struct ac_perfcounters *pc,
                 struct ac_spm *spm)
{
   const struct ac_spm_counter_create_info *create_info;
   unsigned create_info_count;
   unsigned num_counters = 0;

   /* Pick the counter list matching the chip generation. */
   switch (info->gfx_level) {
   case GFX10:
      create_info_count = ARRAY_SIZE(gfx10_spm_counters);
      create_info = gfx10_spm_counters;
      break;
   case GFX10_3:
      create_info_count = ARRAY_SIZE(gfx103_spm_counters);
      create_info = gfx103_spm_counters;
      break;
   case GFX11:
   case GFX11_5:
      create_info_count = ARRAY_SIZE(gfx11_spm_counters);
      create_info = gfx11_spm_counters;
      break;
   default:
      return false; /* not implemented */
   }

   /* Count the total number of counters (one per global instance of each
    * requested block). */
   for (unsigned i = 0; i < create_info_count; i++) {
      const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block);

      if (!block)
         return false;

      num_counters += block->num_global_instances;
   }

   spm->counters = CALLOC(num_counters, sizeof(*spm->counters));
   if (!spm->counters)
      return false;

   /* Add one counter per global instance of each block. */
   for (unsigned i = 0; i < create_info_count; i++) {
      const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block);
      struct ac_spm_counter_create_info counter = create_info[i];

      for (unsigned j = 0; j < block->num_global_instances; j++) {
         counter.instance = j;

         if (!ac_spm_add_counter(info, pc, spm, &counter)) {
            fprintf(stderr, "ac/spm: Failed to add SPM counter (%d).\n", i);
            return false;
         }
      }
   }

   /* Determine the segment size and create a muxsel ram for every segment. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned num_even_counters = 0, num_odd_counters = 0;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         /* The global segment always start with a 64-bit timestamp. */
         num_even_counters += AC_SPM_GLOBAL_TIMESTAMP_COUNTERS;
      }

      /* Count the number of even/odd counters for this segment. */
      for (unsigned c = 0; c < spm->num_counters; c++) {
         struct ac_spm_counter_info *counter = &spm->counters[c];

         if (counter->segment_type != s)
            continue;

         if (counter->is_even) {
            num_even_counters++;
         } else {
            num_odd_counters++;
         }
      }

      /* Compute the number of lines. Even counters go on even lines and
       * odd counters on odd lines (interleaved), so the total is roughly
       * twice the larger of the two line counts. */
      unsigned even_lines =
         DIV_ROUND_UP(num_even_counters, AC_SPM_NUM_COUNTER_PER_MUXSEL);
      unsigned odd_lines =
         DIV_ROUND_UP(num_odd_counters, AC_SPM_NUM_COUNTER_PER_MUXSEL);
      unsigned num_lines = (even_lines > odd_lines) ? (2 * even_lines - 1) : (2 * odd_lines);

      spm->muxsel_lines[s] = CALLOC(num_lines, sizeof(*spm->muxsel_lines[s]));
      if (!spm->muxsel_lines[s])
         return false;
      spm->num_muxsel_lines[s] = num_lines;
   }

   /* Compute the maximum number of muxsel lines among all SEs. On GFX11,
    * there is only one SE segment size value and the highest value is used.
    */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_GLOBAL; s++) {
      spm->max_se_muxsel_lines =
         MAX2(spm->num_muxsel_lines[s], spm->max_se_muxsel_lines);
   }

   /* RLC uses the following order: Global, SE0, SE1, SE2, SE3, SE4, SE5. */
   ac_spm_fill_muxsel_ram(info, spm, AC_SPM_SEGMENT_TYPE_GLOBAL, 0);

   const uint32_t num_global_lines = spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL];

   if (info->gfx_level >= GFX11) {
      /* On GFX11, RLC uses one segment size for every single SE. */
      for (unsigned i = 0; i < info->num_se; i++) {
         assert(i < AC_SPM_SEGMENT_TYPE_GLOBAL);
         uint32_t offset = num_global_lines + i * spm->max_se_muxsel_lines;

         ac_spm_fill_muxsel_ram(info, spm, i, offset);
      }
   } else {
      /* Pre-GFX11: segments are packed back-to-back with their own sizes. */
      uint32_t offset = num_global_lines;

      for (unsigned i = 0; i < info->num_se; i++) {
         assert(i < AC_SPM_SEGMENT_TYPE_GLOBAL);

         ac_spm_fill_muxsel_ram(info, spm, i, offset);

         offset += spm->num_muxsel_lines[i];
      }
   }

   /* On GFX11, the data size written by the hw is in units of segment. */
   spm->ptr_granularity = info->gfx_level >= GFX11 ? 32 : 1;

   return true;
}
600 
ac_destroy_spm(struct ac_spm * spm)601 void ac_destroy_spm(struct ac_spm *spm)
602 {
603    for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
604       FREE(spm->muxsel_lines[s]);
605    }
606 
607    for (unsigned i = 0; i < spm->num_block_sel; i++) {
608       FREE(spm->block_sel[i].instances);
609    }
610 
611    FREE(spm->block_sel);
612    FREE(spm->counters);
613 }
614 
ac_spm_get_sample_size(const struct ac_spm * spm)615 static uint32_t ac_spm_get_sample_size(const struct ac_spm *spm)
616 {
617    uint32_t sample_size = 0; /* in bytes */
618 
619    for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
620       sample_size += spm->num_muxsel_lines[s] * AC_SPM_MUXSEL_LINE_SIZE * 4;
621    }
622 
623    return sample_size;
624 }
625 
/**
 * Compute the number of complete samples the hw wrote to the ring buffer,
 * based on the data-size word at the start of the buffer. Aborts if the
 * written line count is not a whole number of samples (overflow/corruption).
 */
static uint32_t ac_spm_get_num_samples(const struct ac_spm *spm)
{
   uint32_t sample_size = ac_spm_get_sample_size(spm);
   uint32_t *ptr = (uint32_t *)spm->ptr;
   uint32_t data_size, num_lines_written;
   uint32_t num_samples = 0;

   /* Get the data size (in bytes) written by the hw to the ring buffer. */
   data_size = ptr[0] * spm->ptr_granularity;

   /* Compute the number of 256 bits (16 * 16-bits counters) lines written. */
   num_lines_written = data_size / (2 * AC_SPM_NUM_COUNTER_PER_MUXSEL);

   /* Check for overflow. sample_size / 32 is the number of 32-byte lines
    * per sample. NOTE(review): assumes sample_size >= 32 (always true when
    * at least the global timestamp line exists) — verify, otherwise this
    * divides by zero. */
   if (num_lines_written % (sample_size / 32)) {
      abort();
   } else {
      num_samples = num_lines_written / (sample_size / 32);
   }

   return num_samples;
}
648 
ac_spm_get_trace(const struct ac_spm * spm,struct ac_spm_trace * trace)649 void ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace)
650 {
651    memset(trace, 0, sizeof(*trace));
652 
653    trace->ptr = spm->ptr;
654    trace->sample_interval = spm->sample_interval;
655    trace->num_counters = spm->num_counters;
656    trace->counters = spm->counters;
657    trace->sample_size_in_bytes = ac_spm_get_sample_size(spm);
658    trace->num_samples = ac_spm_get_num_samples(spm);
659 }
660