xref: /aosp_15_r20/external/mesa3d/src/amd/common/ac_debug.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2015 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_debug.h"
8 #include "sid.h"
9 #include "sid_tables.h"
10 
11 #include "util/u_string.h"
12 
13 #include <inttypes.h>
14 
ac_find_register(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)15 const struct si_reg *ac_find_register(enum amd_gfx_level gfx_level, enum radeon_family family,
16                                       unsigned offset)
17 {
18    const struct si_reg *table;
19    unsigned table_size;
20 
21    switch (gfx_level) {
22    case GFX12:
23       table = gfx12_reg_table;
24       table_size = ARRAY_SIZE(gfx12_reg_table);
25       break;
26    case GFX11_5:
27       table = gfx115_reg_table;
28       table_size = ARRAY_SIZE(gfx115_reg_table);
29       break;
30    case GFX11:
31       table = gfx11_reg_table;
32       table_size = ARRAY_SIZE(gfx11_reg_table);
33       break;
34    case GFX10_3:
35       table = gfx103_reg_table;
36       table_size = ARRAY_SIZE(gfx103_reg_table);
37       break;
38    case GFX10:
39       table = gfx10_reg_table;
40       table_size = ARRAY_SIZE(gfx10_reg_table);
41       break;
42    case GFX9:
43       if (family == CHIP_GFX940) {
44          table = gfx940_reg_table;
45          table_size = ARRAY_SIZE(gfx940_reg_table);
46          break;
47       }
48       table = gfx9_reg_table;
49       table_size = ARRAY_SIZE(gfx9_reg_table);
50       break;
51    case GFX8:
52       if (family == CHIP_STONEY) {
53          table = gfx81_reg_table;
54          table_size = ARRAY_SIZE(gfx81_reg_table);
55          break;
56       }
57       table = gfx8_reg_table;
58       table_size = ARRAY_SIZE(gfx8_reg_table);
59       break;
60    case GFX7:
61       table = gfx7_reg_table;
62       table_size = ARRAY_SIZE(gfx7_reg_table);
63       break;
64    case GFX6:
65       table = gfx6_reg_table;
66       table_size = ARRAY_SIZE(gfx6_reg_table);
67       break;
68    default:
69       return NULL;
70    }
71 
72    for (unsigned i = 0; i < table_size; i++) {
73       const struct si_reg *reg = &table[i];
74 
75       if (reg->offset == offset)
76          return reg;
77    }
78 
79    return NULL;
80 }
81 
ac_get_register_name(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)82 const char *ac_get_register_name(enum amd_gfx_level gfx_level, enum radeon_family family,
83                                  unsigned offset)
84 {
85    const struct si_reg *reg = ac_find_register(gfx_level, family, offset);
86 
87    return reg ? sid_strings + reg->name_offset : "(no name)";
88 }
89 
/**
 * Tell whether ac_find_register() has an entry for \p offset on the given
 * gfx level / family.
 */
bool ac_register_exists(enum amd_gfx_level gfx_level, enum radeon_family family,
                        unsigned offset)
{
   const struct si_reg *reg = ac_find_register(gfx_level, family, offset);

   return reg ? true : false;
}
95 
96 /**
97  * Parse dmesg and return TRUE if a VM fault has been detected.
98  *
99  * \param gfx_level		gfx level
100  * \param old_dmesg_timestamp	previous dmesg timestamp parsed at init time
101  * \param out_addr		detected VM fault addr
102  */
ac_vm_fault_occurred(enum amd_gfx_level gfx_level,uint64_t * old_dmesg_timestamp,uint64_t * out_addr)103 bool ac_vm_fault_occurred(enum amd_gfx_level gfx_level, uint64_t *old_dmesg_timestamp,
104                          uint64_t *out_addr)
105 {
106 #ifdef _WIN32
107    return false;
108 #else
109    char line[2000];
110    unsigned sec, usec;
111    int progress = 0;
112    uint64_t dmesg_timestamp = 0;
113    bool fault = false;
114 
115    FILE *p = popen("dmesg", "r");
116    if (!p)
117       return false;
118 
119    while (fgets(line, sizeof(line), p)) {
120       char *msg, len;
121 
122       if (!line[0] || line[0] == '\n')
123          continue;
124 
125       /* Get the timestamp. */
126       if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
127          static bool hit = false;
128          if (!hit) {
129             fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line);
130             hit = true;
131          }
132          continue;
133       }
134       dmesg_timestamp = sec * 1000000ull + usec;
135 
136       /* If just updating the timestamp. */
137       if (!out_addr)
138          continue;
139 
140       /* Process messages only if the timestamp is newer. */
141       if (dmesg_timestamp <= *old_dmesg_timestamp)
142          continue;
143 
144       /* Only process the first VM fault. */
145       if (fault)
146          continue;
147 
148       /* Remove trailing \n */
149       len = strlen(line);
150       if (len && line[len - 1] == '\n')
151          line[len - 1] = 0;
152 
153       /* Get the message part. */
154       msg = strchr(line, ']');
155       if (!msg)
156          continue;
157       msg++;
158 
159       const char *header_line, *addr_line_prefix, *addr_line_format;
160 
161       if (gfx_level >= GFX9) {
162          /* Match this:
163           * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
164           * ..:   at page 0x0000000219f8f000 from 27
165           * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
166           */
167          header_line = "VMC page fault";
168          addr_line_prefix = "   at page";
169          addr_line_format = "%" PRIx64;
170       } else {
171          header_line = "GPU fault detected:";
172          addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
173          addr_line_format = "%" PRIX64;
174       }
175 
176       switch (progress) {
177       case 0:
178          if (strstr(msg, header_line))
179             progress = 1;
180          break;
181       case 1:
182          msg = strstr(msg, addr_line_prefix);
183          if (msg) {
184             msg = strstr(msg, "0x");
185             if (msg) {
186                msg += 2;
187                if (sscanf(msg, addr_line_format, out_addr) == 1)
188                   fault = true;
189             }
190          }
191          progress = 0;
192          break;
193       default:
194          progress = 0;
195       }
196    }
197    pclose(p);
198 
199    if (dmesg_timestamp > *old_dmesg_timestamp)
200       *old_dmesg_timestamp = dmesg_timestamp;
201 
202    return fault;
203 #endif
204 }
205 
206 char *
ac_get_umr_waves(const struct radeon_info * info,enum amd_ip_type ring)207 ac_get_umr_waves(const struct radeon_info *info, enum amd_ip_type ring)
208 {
209    /* TODO: Dump compute ring. */
210    if (ring != AMD_IP_GFX)
211       return NULL;
212 
213 #ifndef _WIN32
214    char *data;
215    size_t size;
216    FILE *f = open_memstream(&data, &size);
217    if (!f)
218       return NULL;
219 
220    char cmd[256];
221    sprintf(cmd, "umr --by-pci %04x:%02x:%02x.%01x -O bits,halt_waves -go 0 -wa %s -go 1 2>&1", info->pci.domain,
222            info->pci.bus, info->pci.dev, info->pci.func, info->gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");
223 
224    char line[2048];
225    FILE *p = popen(cmd, "r");
226    if (p) {
227       while (fgets(line, sizeof(line), p))
228          fputs(line, f);
229       fprintf(f, "\n");
230       pclose(p);
231    }
232 
233    fclose(f);
234 
235    return data;
236 #else
237    return NULL;
238 #endif
239 }
240 
compare_wave(const void * p1,const void * p2)241 static int compare_wave(const void *p1, const void *p2)
242 {
243    struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
244    struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
245 
246    /* Sort waves according to PC and then SE, SH, CU, etc. */
247    if (w1->pc < w2->pc)
248       return -1;
249    if (w1->pc > w2->pc)
250       return 1;
251    if (w1->se < w2->se)
252       return -1;
253    if (w1->se > w2->se)
254       return 1;
255    if (w1->sh < w2->sh)
256       return -1;
257    if (w1->sh > w2->sh)
258       return 1;
259    if (w1->cu < w2->cu)
260       return -1;
261    if (w1->cu > w2->cu)
262       return 1;
263    if (w1->simd < w2->simd)
264       return -1;
265    if (w1->simd > w2->simd)
266       return 1;
267    if (w1->wave < w2->wave)
268       return -1;
269    if (w1->wave > w2->wave)
270       return 1;
271 
272    return 0;
273 }
274 
/* Start of the line that introduces the register section of one wave. */
#define AC_UMR_REGISTERS_LINE "Main Registers"

/**
 * Try to parse one "<name>: <hex value>" entry at the current scan position.
 *
 * The entry matches only if the text at \p *_scan starts with the complete
 * \p name immediately followed by ':'. This rejects input that ends before the
 * name is complete as well as longer register names that merely start with
 * \p name (e.g. "SQ_WAVE_HW_ID" must not match "SQ_WAVE_HW_ID1").
 *
 * \param _scan  in: current position in the dump; out: on success, the
 *               position right after the parsed value. Unchanged on failure.
 * \param name   register name to look for, without the "ix" prefix.
 * \param value  receives the parsed hexadecimal value on success; unchanged
 *               on failure.
 * \return true if the entry was recognized.
 */
static bool
ac_read_umr_register(const char **_scan, const char *name, uint32_t *value)
{
   const char *scan = *_scan;
   const size_t name_len = strlen(name);

   /* strncmp() stops at the terminator of "scan", so comparing the full name
    * length is safe; a successful compare guarantees scan[name_len] is
    * readable.
    */
   if (strncmp(scan, name, name_len) != 0 || scan[name_len] != ':')
      return false;

   /* Skip the name and the colon; strtoul() skips the blank that follows. */
   scan += name_len + 1;

   /* Advance by what was actually consumed instead of a fixed digit count so
    * that a short or truncated value cannot move the cursor past the end of
    * the string.
    */
   char *value_end;
   *value = (uint32_t)strtoul(scan, &value_end, 16);
   *_scan = value_end;
   return true;
}
291 
292 /* Return wave information. "waves" should be a large enough array. */
ac_get_wave_info(enum amd_gfx_level gfx_level,const struct radeon_info * info,const char * wave_dump,struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])293 unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info *info,
294                           const char *wave_dump,
295                           struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
296 {
297 #ifdef _WIN32
298    return 0;
299 #else
300    char *dump = NULL;
301    if (!wave_dump) {
302       dump = ac_get_umr_waves(info, AMD_IP_GFX);
303       wave_dump = dump;
304    }
305 
306    unsigned num_waves = 0;
307 
308    while (true) {
309       const char *end = strchr(wave_dump, '\n');
310       if (!end)
311          break;
312 
313       if (strncmp(wave_dump, AC_UMR_REGISTERS_LINE, strlen(AC_UMR_REGISTERS_LINE))) {
314          wave_dump = end + 1;
315          continue;
316       }
317 
318       assert(num_waves < AC_MAX_WAVES_PER_CHIP);
319       struct ac_wave_info *w = &waves[num_waves];
320       memset(w, 0, sizeof(struct ac_wave_info));
321       num_waves++;
322 
323       while (true) {
324          const char *end2 = strchr(wave_dump, '\n');
325          if (!end2)
326             break;
327          if (end2 - wave_dump < 2)
328             break;
329 
330          const char *scan = wave_dump;
331          while (scan < end2) {
332             if (strncmp(scan, "ix", MIN2(strlen(scan), strlen("ix")))) {
333                scan++;
334                continue;
335             }
336 
337             scan += strlen("ix");
338 
339             bool progress = false;
340 
341             progress |= ac_read_umr_register(&scan, "SQ_WAVE_STATUS", &w->status);
342             progress |= ac_read_umr_register(&scan, "SQ_WAVE_PC_LO", &w->pc_lo);
343             progress |= ac_read_umr_register(&scan, "SQ_WAVE_PC_HI", &w->pc_hi);
344             progress |= ac_read_umr_register(&scan, "SQ_WAVE_EXEC_LO", &w->exec_lo);
345             progress |= ac_read_umr_register(&scan, "SQ_WAVE_EXEC_HI", &w->exec_hi);
346             progress |= ac_read_umr_register(&scan, "SQ_WAVE_INST_DW0", &w->inst_dw0);
347             progress |= ac_read_umr_register(&scan, "SQ_WAVE_INST_DW1", &w->inst_dw1);
348 
349             uint32_t wave;
350             if (ac_read_umr_register(&scan, "SQ_WAVE_HW_ID", &wave)) {
351                w->se = G_000050_SE_ID(wave);
352                w->sh = G_000050_SH_ID(wave);
353                w->cu = G_000050_CU_ID(wave);
354                w->simd = G_000050_SIMD_ID(wave);
355                w->wave = G_000050_WAVE_ID(wave);
356 
357                progress = true;
358             }
359 
360             if (ac_read_umr_register(&scan, "SQ_WAVE_HW_ID1", &wave)) {
361                w->se = G_00045C_SE_ID(wave);
362                w->sh = G_00045C_SA_ID(wave);
363                w->cu = G_00045C_WGP_ID(wave);
364                w->simd = G_00045C_SIMD_ID(wave);
365                w->wave = G_00045C_WAVE_ID(wave);
366 
367                progress = true;
368             }
369 
370             /* Skip registers we do not handle. */
371             if (!progress) {
372                while (scan < end2) {
373                   if (*scan == '|') {
374                      progress = true;
375                      break;
376                   }
377                   scan++;
378                }
379             }
380 
381             if (!progress)
382                break;
383          }
384 
385          wave_dump = end2 + 1;
386       }
387    }
388 
389    qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
390 
391    free(dump);
392 
393    return num_waves;
394 #endif
395 }
396 
/* List of GFXHUB clients from AMDGPU source code, indexed by client id. */
static const char *const gfx10_gfxhub_client_ids[] = {
   [0] = "CB/DB",
   [1] = "Reserved",
   [2] = "GE1",
   [3] = "GE2",
   [4] = "CPF",
   [5] = "CPC",
   [6] = "CPG",
   [7] = "RLC",
   [8] = "TCP",
   [9] = "SQC (inst)",
   [10] = "SQC (data)",
   [11] = "SQG",
   [12] = "Reserved",
   [13] = "SDMA0",
   [14] = "SDMA1",
   [15] = "GCR",
   [16] = "SDMA2",
   [17] = "SDMA3",
};
418 
419 static const char *
ac_get_gfx10_gfxhub_client(unsigned cid)420 ac_get_gfx10_gfxhub_client(unsigned cid)
421 {
422    if (cid >= ARRAY_SIZE(gfx10_gfxhub_client_ids))
423       return "UNKNOWN";
424    return gfx10_gfxhub_client_ids[cid];
425 }
426 
ac_print_gpuvm_fault_status(FILE * output,enum amd_gfx_level gfx_level,uint32_t status)427 void ac_print_gpuvm_fault_status(FILE *output, enum amd_gfx_level gfx_level,
428                                  uint32_t status)
429 {
430    if (gfx_level >= GFX10) {
431       const uint8_t cid = G_00A130_CID(status);
432 
433       fprintf(output, "GCVM_L2_PROTECTION_FAULT_STATUS: 0x%x\n", status);
434       fprintf(output, "\t CLIENT_ID: (%s) 0x%x\n", ac_get_gfx10_gfxhub_client(cid), cid);
435       fprintf(output, "\t MORE_FAULTS: %d\n", G_00A130_MORE_FAULTS(status));
436       fprintf(output, "\t WALKER_ERROR: %d\n", G_00A130_WALKER_ERROR(status));
437       fprintf(output, "\t PERMISSION_FAULTS: %d\n", G_00A130_PERMISSION_FAULTS(status));
438       fprintf(output, "\t MAPPING_ERROR: %d\n", G_00A130_MAPPING_ERROR(status));
439       fprintf(output, "\t RW: %d\n", G_00A130_RW(status));
440    } else {
441       fprintf(output, "VM_CONTEXT1_PROTECTION_FAULT_STATUS: 0x%x\n", status);
442    }
443 }
444