1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ac_debug.h"
8 #include "sid.h"
9 #include "sid_tables.h"
10
11 #include "util/u_string.h"
12
13 #include <inttypes.h>
14
ac_find_register(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)15 const struct si_reg *ac_find_register(enum amd_gfx_level gfx_level, enum radeon_family family,
16 unsigned offset)
17 {
18 const struct si_reg *table;
19 unsigned table_size;
20
21 switch (gfx_level) {
22 case GFX12:
23 table = gfx12_reg_table;
24 table_size = ARRAY_SIZE(gfx12_reg_table);
25 break;
26 case GFX11_5:
27 table = gfx115_reg_table;
28 table_size = ARRAY_SIZE(gfx115_reg_table);
29 break;
30 case GFX11:
31 table = gfx11_reg_table;
32 table_size = ARRAY_SIZE(gfx11_reg_table);
33 break;
34 case GFX10_3:
35 table = gfx103_reg_table;
36 table_size = ARRAY_SIZE(gfx103_reg_table);
37 break;
38 case GFX10:
39 table = gfx10_reg_table;
40 table_size = ARRAY_SIZE(gfx10_reg_table);
41 break;
42 case GFX9:
43 if (family == CHIP_GFX940) {
44 table = gfx940_reg_table;
45 table_size = ARRAY_SIZE(gfx940_reg_table);
46 break;
47 }
48 table = gfx9_reg_table;
49 table_size = ARRAY_SIZE(gfx9_reg_table);
50 break;
51 case GFX8:
52 if (family == CHIP_STONEY) {
53 table = gfx81_reg_table;
54 table_size = ARRAY_SIZE(gfx81_reg_table);
55 break;
56 }
57 table = gfx8_reg_table;
58 table_size = ARRAY_SIZE(gfx8_reg_table);
59 break;
60 case GFX7:
61 table = gfx7_reg_table;
62 table_size = ARRAY_SIZE(gfx7_reg_table);
63 break;
64 case GFX6:
65 table = gfx6_reg_table;
66 table_size = ARRAY_SIZE(gfx6_reg_table);
67 break;
68 default:
69 return NULL;
70 }
71
72 for (unsigned i = 0; i < table_size; i++) {
73 const struct si_reg *reg = &table[i];
74
75 if (reg->offset == offset)
76 return reg;
77 }
78
79 return NULL;
80 }
81
ac_get_register_name(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)82 const char *ac_get_register_name(enum amd_gfx_level gfx_level, enum radeon_family family,
83 unsigned offset)
84 {
85 const struct si_reg *reg = ac_find_register(gfx_level, family, offset);
86
87 return reg ? sid_strings + reg->name_offset : "(no name)";
88 }
89
/**
 * Tell whether the register table selected for the given GPU contains an
 * entry at \p offset.
 */
bool ac_register_exists(enum amd_gfx_level gfx_level, enum radeon_family family,
                        unsigned offset)
{
   const struct si_reg *found = ac_find_register(gfx_level, family, offset);

   return found ? true : false;
}
95
96 /**
97 * Parse dmesg and return TRUE if a VM fault has been detected.
98 *
99 * \param gfx_level gfx level
100 * \param old_dmesg_timestamp previous dmesg timestamp parsed at init time
101 * \param out_addr detected VM fault addr
102 */
ac_vm_fault_occurred(enum amd_gfx_level gfx_level,uint64_t * old_dmesg_timestamp,uint64_t * out_addr)103 bool ac_vm_fault_occurred(enum amd_gfx_level gfx_level, uint64_t *old_dmesg_timestamp,
104 uint64_t *out_addr)
105 {
106 #ifdef _WIN32
107 return false;
108 #else
109 char line[2000];
110 unsigned sec, usec;
111 int progress = 0;
112 uint64_t dmesg_timestamp = 0;
113 bool fault = false;
114
115 FILE *p = popen("dmesg", "r");
116 if (!p)
117 return false;
118
119 while (fgets(line, sizeof(line), p)) {
120 char *msg, len;
121
122 if (!line[0] || line[0] == '\n')
123 continue;
124
125 /* Get the timestamp. */
126 if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
127 static bool hit = false;
128 if (!hit) {
129 fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line);
130 hit = true;
131 }
132 continue;
133 }
134 dmesg_timestamp = sec * 1000000ull + usec;
135
136 /* If just updating the timestamp. */
137 if (!out_addr)
138 continue;
139
140 /* Process messages only if the timestamp is newer. */
141 if (dmesg_timestamp <= *old_dmesg_timestamp)
142 continue;
143
144 /* Only process the first VM fault. */
145 if (fault)
146 continue;
147
148 /* Remove trailing \n */
149 len = strlen(line);
150 if (len && line[len - 1] == '\n')
151 line[len - 1] = 0;
152
153 /* Get the message part. */
154 msg = strchr(line, ']');
155 if (!msg)
156 continue;
157 msg++;
158
159 const char *header_line, *addr_line_prefix, *addr_line_format;
160
161 if (gfx_level >= GFX9) {
162 /* Match this:
163 * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
164 * ..: at page 0x0000000219f8f000 from 27
165 * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
166 */
167 header_line = "VMC page fault";
168 addr_line_prefix = " at page";
169 addr_line_format = "%" PRIx64;
170 } else {
171 header_line = "GPU fault detected:";
172 addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
173 addr_line_format = "%" PRIX64;
174 }
175
176 switch (progress) {
177 case 0:
178 if (strstr(msg, header_line))
179 progress = 1;
180 break;
181 case 1:
182 msg = strstr(msg, addr_line_prefix);
183 if (msg) {
184 msg = strstr(msg, "0x");
185 if (msg) {
186 msg += 2;
187 if (sscanf(msg, addr_line_format, out_addr) == 1)
188 fault = true;
189 }
190 }
191 progress = 0;
192 break;
193 default:
194 progress = 0;
195 }
196 }
197 pclose(p);
198
199 if (dmesg_timestamp > *old_dmesg_timestamp)
200 *old_dmesg_timestamp = dmesg_timestamp;
201
202 return fault;
203 #endif
204 }
205
206 char *
ac_get_umr_waves(const struct radeon_info * info,enum amd_ip_type ring)207 ac_get_umr_waves(const struct radeon_info *info, enum amd_ip_type ring)
208 {
209 /* TODO: Dump compute ring. */
210 if (ring != AMD_IP_GFX)
211 return NULL;
212
213 #ifndef _WIN32
214 char *data;
215 size_t size;
216 FILE *f = open_memstream(&data, &size);
217 if (!f)
218 return NULL;
219
220 char cmd[256];
221 sprintf(cmd, "umr --by-pci %04x:%02x:%02x.%01x -O bits,halt_waves -go 0 -wa %s -go 1 2>&1", info->pci.domain,
222 info->pci.bus, info->pci.dev, info->pci.func, info->gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");
223
224 char line[2048];
225 FILE *p = popen(cmd, "r");
226 if (p) {
227 while (fgets(line, sizeof(line), p))
228 fputs(line, f);
229 fprintf(f, "\n");
230 pclose(p);
231 }
232
233 fclose(f);
234
235 return data;
236 #else
237 return NULL;
238 #endif
239 }
240
compare_wave(const void * p1,const void * p2)241 static int compare_wave(const void *p1, const void *p2)
242 {
243 struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
244 struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
245
246 /* Sort waves according to PC and then SE, SH, CU, etc. */
247 if (w1->pc < w2->pc)
248 return -1;
249 if (w1->pc > w2->pc)
250 return 1;
251 if (w1->se < w2->se)
252 return -1;
253 if (w1->se > w2->se)
254 return 1;
255 if (w1->sh < w2->sh)
256 return -1;
257 if (w1->sh > w2->sh)
258 return 1;
259 if (w1->cu < w2->cu)
260 return -1;
261 if (w1->cu > w2->cu)
262 return 1;
263 if (w1->simd < w2->simd)
264 return -1;
265 if (w1->simd > w2->simd)
266 return 1;
267 if (w1->wave < w2->wave)
268 return -1;
269 if (w1->wave > w2->wave)
270 return 1;
271
272 return 0;
273 }
274
/* Text at the start of the line that opens the register listing of one wave. */
#define AC_UMR_REGISTERS_LINE "Main Registers"

/**
 * Try to parse "<name>: <hex value>" at the current scan position.
 *
 * \param _scan  in/out cursor into the dump; moved just past the parsed value
 *               on success and left untouched on failure
 * \param name   exact register name expected at the cursor
 * \param value  receives the parsed value on success, untouched on failure
 * \return true if \p name was found at the cursor and a hex value followed it
 */
static bool
ac_read_umr_register(const char **_scan, const char *name, uint32_t *value)
{
   const char *scan = *_scan;
   const size_t name_len = strlen(name);

   /* strncmp() stops at the terminator of either string, so a dump that ends
    * in the middle of a name is rejected rather than treated as a match.
    */
   if (strncmp(scan, name, name_len) != 0)
      return false;

   /* Require the separator right after the name; otherwise "SQ_WAVE_HW_ID"
    * would also match "SQ_WAVE_HW_ID1".
    */
   if (scan[name_len] != ':')
      return false;
   scan += name_len + 1;

   /* strtoul() skips the blank after the colon and reports where the number
    * ends, so the cursor no longer relies on a fixed 8-digit width.
    */
   char *value_end;
   const unsigned long parsed = strtoul(scan, &value_end, 16);
   if (value_end == scan)
      return false;

   *value = (uint32_t)parsed;
   *_scan = value_end;
   return true;
}
291
292 /* Return wave information. "waves" should be a large enough array. */
ac_get_wave_info(enum amd_gfx_level gfx_level,const struct radeon_info * info,const char * wave_dump,struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])293 unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info *info,
294 const char *wave_dump,
295 struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
296 {
297 #ifdef _WIN32
298 return 0;
299 #else
300 char *dump = NULL;
301 if (!wave_dump) {
302 dump = ac_get_umr_waves(info, AMD_IP_GFX);
303 wave_dump = dump;
304 }
305
306 unsigned num_waves = 0;
307
308 while (true) {
309 const char *end = strchr(wave_dump, '\n');
310 if (!end)
311 break;
312
313 if (strncmp(wave_dump, AC_UMR_REGISTERS_LINE, strlen(AC_UMR_REGISTERS_LINE))) {
314 wave_dump = end + 1;
315 continue;
316 }
317
318 assert(num_waves < AC_MAX_WAVES_PER_CHIP);
319 struct ac_wave_info *w = &waves[num_waves];
320 memset(w, 0, sizeof(struct ac_wave_info));
321 num_waves++;
322
323 while (true) {
324 const char *end2 = strchr(wave_dump, '\n');
325 if (!end2)
326 break;
327 if (end2 - wave_dump < 2)
328 break;
329
330 const char *scan = wave_dump;
331 while (scan < end2) {
332 if (strncmp(scan, "ix", MIN2(strlen(scan), strlen("ix")))) {
333 scan++;
334 continue;
335 }
336
337 scan += strlen("ix");
338
339 bool progress = false;
340
341 progress |= ac_read_umr_register(&scan, "SQ_WAVE_STATUS", &w->status);
342 progress |= ac_read_umr_register(&scan, "SQ_WAVE_PC_LO", &w->pc_lo);
343 progress |= ac_read_umr_register(&scan, "SQ_WAVE_PC_HI", &w->pc_hi);
344 progress |= ac_read_umr_register(&scan, "SQ_WAVE_EXEC_LO", &w->exec_lo);
345 progress |= ac_read_umr_register(&scan, "SQ_WAVE_EXEC_HI", &w->exec_hi);
346 progress |= ac_read_umr_register(&scan, "SQ_WAVE_INST_DW0", &w->inst_dw0);
347 progress |= ac_read_umr_register(&scan, "SQ_WAVE_INST_DW1", &w->inst_dw1);
348
349 uint32_t wave;
350 if (ac_read_umr_register(&scan, "SQ_WAVE_HW_ID", &wave)) {
351 w->se = G_000050_SE_ID(wave);
352 w->sh = G_000050_SH_ID(wave);
353 w->cu = G_000050_CU_ID(wave);
354 w->simd = G_000050_SIMD_ID(wave);
355 w->wave = G_000050_WAVE_ID(wave);
356
357 progress = true;
358 }
359
360 if (ac_read_umr_register(&scan, "SQ_WAVE_HW_ID1", &wave)) {
361 w->se = G_00045C_SE_ID(wave);
362 w->sh = G_00045C_SA_ID(wave);
363 w->cu = G_00045C_WGP_ID(wave);
364 w->simd = G_00045C_SIMD_ID(wave);
365 w->wave = G_00045C_WAVE_ID(wave);
366
367 progress = true;
368 }
369
370 /* Skip registers we do not handle. */
371 if (!progress) {
372 while (scan < end2) {
373 if (*scan == '|') {
374 progress = true;
375 break;
376 }
377 scan++;
378 }
379 }
380
381 if (!progress)
382 break;
383 }
384
385 wave_dump = end2 + 1;
386 }
387 }
388
389 qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
390
391 free(dump);
392
393 return num_waves;
394 #endif
395 }
396
/* List of GFXHUB clients from AMDGPU source code, indexed by client ID. */
static const char *const gfx10_gfxhub_client_ids[] = {
   [0] = "CB/DB",
   [1] = "Reserved",
   [2] = "GE1",
   [3] = "GE2",
   [4] = "CPF",
   [5] = "CPC",
   [6] = "CPG",
   [7] = "RLC",
   [8] = "TCP",
   [9] = "SQC (inst)",
   [10] = "SQC (data)",
   [11] = "SQG",
   [12] = "Reserved",
   [13] = "SDMA0",
   [14] = "SDMA1",
   [15] = "GCR",
   [16] = "SDMA2",
   [17] = "SDMA3",
};

/* Map a GFXHUB client ID to its name; IDs beyond the table give "UNKNOWN". */
static const char *
ac_get_gfx10_gfxhub_client(unsigned cid)
{
   const unsigned num_clients = ARRAY_SIZE(gfx10_gfxhub_client_ids);

   return cid < num_clients ? gfx10_gfxhub_client_ids[cid] : "UNKNOWN";
}
426
ac_print_gpuvm_fault_status(FILE * output,enum amd_gfx_level gfx_level,uint32_t status)427 void ac_print_gpuvm_fault_status(FILE *output, enum amd_gfx_level gfx_level,
428 uint32_t status)
429 {
430 if (gfx_level >= GFX10) {
431 const uint8_t cid = G_00A130_CID(status);
432
433 fprintf(output, "GCVM_L2_PROTECTION_FAULT_STATUS: 0x%x\n", status);
434 fprintf(output, "\t CLIENT_ID: (%s) 0x%x\n", ac_get_gfx10_gfxhub_client(cid), cid);
435 fprintf(output, "\t MORE_FAULTS: %d\n", G_00A130_MORE_FAULTS(status));
436 fprintf(output, "\t WALKER_ERROR: %d\n", G_00A130_WALKER_ERROR(status));
437 fprintf(output, "\t PERMISSION_FAULTS: %d\n", G_00A130_PERMISSION_FAULTS(status));
438 fprintf(output, "\t MAPPING_ERROR: %d\n", G_00A130_MAPPING_ERROR(status));
439 fprintf(output, "\t RW: %d\n", G_00A130_RW(status));
440 } else {
441 fprintf(output, "VM_CONTEXT1_PROTECTION_FAULT_STATUS: 0x%x\n", status);
442 }
443 }
444