xref: /aosp_15_r20/external/mesa3d/src/amd/vulkan/radv_perfcounter.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2021 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
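
/* Performance-counter support for RADV: implements the VK_KHR_performance_query
 * entrypoints on top of the common ac_perfcounter block descriptions, and
 * provides the CP_PERFMON/SPM start/stop helpers shared with the SQTT/SPM code.
 */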

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_entrypoints.h"
#include "radv_perfcounter.h"
#include "radv_sqtt.h"
#include "sid.h"

void
radv_perfcounter_emit_shaders(struct radv_device *device, struct radeon_cmdbuf *cs, unsigned shaders)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.gfx_level >= GFX10) {
      radeon_set_uconfig_reg(cs, R_036780_SQ_PERFCOUNTER_CTRL, shaders & 0x7f);
      if (pdev->info.gfx_level >= GFX11)
         radeon_set_uconfig_reg(cs, R_036760_SQG_PERFCOUNTER_CTRL, shaders & 0x7f);
   } else {
      radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
      radeon_emit(cs, shaders & 0x7f);
      radeon_emit(cs, 0xffffffff);
   }
}

static void
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family, bool enable)
{
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
}

void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, family, true);
}

void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   radv_emit_windowed_counters(device, cs, family, false);

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(pdev->info.never_stop_sq_perf_counters
                                                           ? V_036020_STRM_PERFMON_STATE_START_COUNTING
                                                           : V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

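/* How each op combines its registers (evaluated in radv_pc_get_result()):
 *   SUM              - sum of regs[0] over all instances
 *   MAX              - max of regs[0] over all instances
 *   RATIO_DIVSCALE   - regs[0] / (regs[1] * regs[2]) * 100
 *   REVERSE_RATIO    - (regs[1] - regs[0]) / regs[1] * 100
 *   SUM_WEIGHTED_4   - sum over i in [0, 3] of regs[2i] * regs[2i+1]
 */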
enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

#define S_REG_SEL(x)   ((x)&0xFFFF)
#define G_REG_SEL(x)   ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x)    ((x)&0xFFFF)
#define G_REG_OFFSET(x)    ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x)  ((x) << 31)
#define G_REG_CONSTANT(x)  ((x) >> 31)
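
/* Two 32-bit encodings share the regs[] fields:
 *  - at description time, a register is (block << 16) | selector, built with
 *    CTR() below;
 *  - after radv_pc_init_query_pool(), every non-constant register is rewritten
 *    to (num_instances << 16) | byte-offset into the result buffer, which is
 *    what radv_pc_sum_reg()/radv_pc_max_reg() consume.
 * Constants use bit 31 plus the literal value and survive both phases.
 */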

struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};

/* Only append to this list, never insert into the middle or remove entries (renaming is fine).
 *
 * The invariant is that a UUID identifies counters with the same meaning: a UUID
 * may be shared between counters that have different implementations on different
 * GPUs, but must be unique within a GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...)                              \
   (struct radv_perfcounter_desc)                                                                                      \
   {                                                                                                                   \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}}, .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR,             \
      .name = arg_name, .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid      \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...)                                                       \
   do {                                                                                                                \
      if (descs) {                                                                                                     \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__);                          \
      }                                                                                                                \
      ++*count;                                                                                                        \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v)     (S_REG_CONSTANT(1) | (uint32_t)(v))

enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD CONSTANT(pdev->info.num_simd_per_compute_unit * pdev->info.num_cu)
#define CTR_NUM_CUS  CONSTANT(pdev->info.num_cu)

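/* Enumerates the counters exposed for this GPU. radv_init_perfcounter_descs()
 * calls this twice: once with descs == NULL to get the count, then again to
 * fill in the descriptions (ADD_PC() handles both cases).
 */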
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count, struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM", "cycles the GPU is active processing a command buffer.",
          GPU_CYCLES, GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES, SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed", SHADER_INSTRUCTIONS,
          SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders", "Number of VALU Instructions executed",
          SHADER_INSTRUCTIONS_VALU, SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders", "Number of SALU Instructions executed",
          SHADER_INSTRUCTIONS_SALU, SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders", "Number of VMEM load instructions executed",
          SHADER_INSTRUCTIONS_VMEM_LOAD, SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders", "Number of SMEM load instructions executed",
          SHADER_INSTRUCTIONS_SMEM_LOAD, SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders", "Number of VMEM store instructions executed",
          SHADER_INSTRUCTIONS_VMEM_STORE, SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders", "Number of LDS Instructions executed",
          SHADER_INSTRUCTIONS_LDS, SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders", "Number of GDS Instructions executed",
          SHADER_INSTRUCTIONS_GDS, SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY, SQ_PERF_SEL_INST_CYCLES_VALU_GFX10,
          CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY, SQ_PERF_SEL_INSTS_SALU_GFX10,
          CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

   if (pdev->info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM",
             VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103,
             CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM",
             VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103,
             CONSTANT(64), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM",
             VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101,
             CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM",
             VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101,
             CONSTANT(32), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache", L0_CACHE_HIT_RATIO,
          TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache", L1_CACHE_HIT_RATIO,
          GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache",
             L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103, GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache",
             L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101, GL2C_PERF_SEL_REQ);
   }
}

static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}

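/* Gathers the hardware registers (block + selector) referenced by the
 * requested counter indices, skipping constants, then sorts and deduplicates
 * them so that registers belonging to the same block end up adjacent.
 */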
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdev, uint32_t num_indices, const uint32_t *indices,
                           unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdev->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdev->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j]; ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdev, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances * ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdev->info.max_se : 1);
}

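/* Computes how many replay passes are needed: each block only exposes
 * num_counters hardware slots, so selecting more registers from one block than
 * it has slots forces additional passes. Relies on regs[] being sorted so that
 * registers of the same block are adjacent.
 */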
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdev, unsigned num_regs, const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdev->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed = MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}

void
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
{
   free(pool->counters);
   free(pool->pc_regs);
}

VkResult
radv_pc_init_query_pool(struct radv_physical_device *pdev, const VkQueryPoolCreateInfo *pCreateInfo,
                        struct radv_pc_query_pool *pool)
{
   const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
      vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
   VkResult result;

   if (!radv_init_perfcounter_descs(pdev))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result = radv_get_counter_registers(pdev, perf_info->counterIndexCount, perf_info->pCounterIndices,
                                       &pool->num_pc_regs, &pool->pc_regs);
   if (result != VK_SUCCESS)
      return result;

   pool->num_passes = radv_get_num_counter_passes(pdev, pool->num_pc_regs, pool->pc_regs);

   uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
   if (!pc_reg_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

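   /* Lay out the result buffer: every register gets a (begin, end) pair of
    * 64-bit samples per instance. The byte offset and instance count are
    * packed into pc_reg_offsets[] for result collection later on.
    */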
   unsigned offset = 0;
   for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
      enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
      struct ac_pc_block *ac_block = ac_pc_get_block(&pdev->ac_perfcounters, block);
      unsigned num_instances = radv_pc_get_num_instances(pdev, ac_block);

      pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
      offset += sizeof(uint64_t) * 2 * num_instances;
   }

   /* Allow 8 bytes per pass to signal completion (written as a 32-bit value). */
   pool->b.stride = offset + 8 * pool->num_passes;

   pool->num_counters = perf_info->counterIndexCount;
   pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
   if (!pool->counters) {
      free(pc_reg_offsets);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (unsigned i = 0; i < pool->num_counters; ++i) {
      pool->counters[i] = pdev->perfcounters[perf_info->pCounterIndices[i]].impl;

      for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
         uint32_t reg = pool->counters[i].regs[j];
         if (!reg || G_REG_CONSTANT(reg))
            continue;

         unsigned k;
         for (k = 0; k < pool->num_pc_regs; ++k)
            if (pool->pc_regs[k] == reg)
               break;
         pool->counters[i].regs[j] = pc_reg_offsets[k];
      }
   }

   free(pc_reg_offsets);
   return VK_SUCCESS;
}

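/* Programs GRBM_GFX_INDEX so that the following perf-counter register accesses
 * target a specific shader engine/instance; a negative se or instance selects
 * broadcast to all of them.
 */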
static void
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

static void
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, unsigned *selectors)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_perfctr_reg(gfx_level, qf, cs, regs->select0[idx],
                                     G_REG_SEL(selectors[idx]) | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
      radeon_emit(cs, 0);
   }
}

static void
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                                 uint64_t va)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   assert(regs->select0);
   for (unsigned idx = 0; idx < count; ++idx) {
      if (regs->counters)
         reg = regs->counters[idx];

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM |
                         COPY_DATA_COUNT_SEL); /* 64 bits */
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      va += sizeof(uint64_t) * 2 * radv_pc_get_num_instances(pdev, block);
      reg += reg_delta;
   }
}

static void
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, uint64_t va)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned se_end = 1;
   if (block->b->b->flags & AC_PC_BLOCK_SE)
      se_end = pdev->info.max_se;

   for (unsigned se = 0; se < se_end; ++se) {
      for (unsigned instance = 0; instance < block->num_instances; ++instance) {
         radv_emit_instance(cmd_buffer, se, instance);
         radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
         va += sizeof(uint64_t) * 2;
      }
   }
}

static void
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

   radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(cs, 0);          /* CP_COHER_CNTL */
   radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
   radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
   radeon_emit(cs, 0);          /* CP_COHER_BASE */
   radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
   radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
   radeon_emit(cs, 0);          /* GCR_CNTL */

   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);
}

static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va, bool end)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(
      cs, R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

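      /* The last dword of COND_EXEC is the number of following dwords to skip
       * when the predicate at pred_va reads zero; emit a placeholder here and
       * patch it through skip_dwords once this pass's packets are emitted, so
       * each pass's reads only execute on the matching replay.
       */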
      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdev->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdev, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t));
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);
}

void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const struct radv_physical_device *pdev = radv_device_physical(device);
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   cdw_max = radeon_check_space(device->ws, cs,
                                256 +                      /* Random one time stuff */
                                   10 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(device->ws, cmd_buffer->cs, device->perf_counter_bo);

   uint64_t perf_ctr_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   radv_emit_inhibit_clockgating(device, cs, true);
   radv_emit_spi_config_cntl(device, cs, true);
   radv_perfcounter_emit_shaders(device, cs, 0x7f);

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdev->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   cdw_max = radeon_check_space(device->ws, cs,
                                256 + /* Reserved for things that don't scale with passes/counters */
                                   5 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * 8);

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(device->ws, cmd_buffer->cs, device->perf_counter_bo);

   uint64_t perf_ctr_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                                EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1, cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(device, cs, false);
   radv_emit_inhibit_clockgating(device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

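/* For every register, the result buffer holds a (begin, end) pair of 64-bit
 * samples per instance (see radv_pc_init_query_pool()); the counter value is
 * the sum of the per-instance deltas. Constants tagged with S_REG_CONSTANT()
 * are returned as-is.
 */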
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result += data[offset + 2 * i + 1] - data[offset + 2 * i];
   }

   return result;
}

static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result = MAX2(result, data[offset + 2 * i + 1]);
   }

   return result;
}

static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) / (double)radv_pc_sum_reg(impl->regs[1], data) /
                       radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 += radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}

void
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
{
   union VkPerformanceCounterResultKHR *pc_result = out;

   for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
      pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   VK_FROM_HANDLE(radv_physical_device, pdev, physicalDevice);

   if (vk_queue_to_radv(pdev, queueFamilyIndex) != RADV_QUEUE_GENERAL) {
      *pCounterCount = 0;
      return VK_SUCCESS;
   }

   if (!radv_init_perfcounter_descs(pdev))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t counter_cnt = pdev->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdev->perfcounters;

   if (!pCounters && !pCounterDescriptions) {
      *pCounterCount = counter_cnt;
      return VK_SUCCESS;
   }

   VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS;
   counter_cnt = MIN2(counter_cnt, *pCounterCount);
   *pCounterCount = counter_cnt;

   for (uint32_t i = 0; i < counter_cnt; ++i) {
      if (pCounters) {
         pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
         pCounters[i].unit = descs[i].unit;
         pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR;

         memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid));
         strcpy((char *)&pCounters[i].uuid, "RADV");

         const uint32_t uuid = descs[i].uuid;
         memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid));
      }

      if (pCounterDescriptions) {
         pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
         pCounterDescriptions[i].flags = VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR;
         strcpy(pCounterDescriptions[i].name, descs[i].name);
         strcpy(pCounterDescriptions[i].category, descs[i].category);
         strcpy(pCounterDescriptions[i].description, descs[i].description);
      }
   }
   return result;
}

VKAPI_ATTR void VKAPI_CALL
radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
   uint32_t *pNumPasses)
{
   VK_FROM_HANDLE(radv_physical_device, pdev, physicalDevice);

   if (pPerformanceQueryCreateInfo->counterIndexCount == 0) {
      *pNumPasses = 0;
      return;
   }

   if (!radv_init_perfcounter_descs(pdev)) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to init perf counters\n");
      *pNumPasses = 1;
      return;
   }

   assert(vk_queue_to_radv(pdev, pPerformanceQueryCreateInfo->queueFamilyIndex) == RADV_QUEUE_GENERAL);

   unsigned num_regs = 0;
   uint32_t *regs = NULL;
   VkResult result = radv_get_counter_registers(pdev, pPerformanceQueryCreateInfo->counterIndexCount,
                                                pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, &regs);
   if (result != VK_SUCCESS) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to allocate memory for perf counters\n");
   }

   *pNumPasses = radv_get_num_counter_passes(pdev, num_regs, regs);
   free(regs);
}