/*
 * Copyright (C) 2022-2023 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "genxml/gen_macros.h"
#include "decode.h"

#if PAN_ARCH >= 10
/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)

struct queue_ctx {
   /* Size of CSHWIF register file in 32-bit registers */
   unsigned nr_regs;

   /* CSHWIF register file */
   uint32_t *regs;

   /* Current instruction pointer (CPU pointer for convenience) */
   uint64_t *ip;

   /* Current instruction end pointer */
   uint64_t *end;

   /* Call stack. Depth=0 means root */
   struct {
      /* Link register to return to */
      uint64_t *lr;

      /* End pointer, there is a return (or exit) after */
      uint64_t *end;
   } call_stack[MAX_CALL_STACK_DEPTH];
   uint8_t call_stack_depth;

   unsigned gpu_id;
};

static uint32_t
cs_get_u32(struct queue_ctx *qctx, uint8_t reg)
{
   assert(reg < qctx->nr_regs);
   return qctx->regs[reg];
}

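/* 64-bit "d" registers are pairs of consecutive 32-bit registers, low word
 * first, so a 64-bit read is simply two 32-bit register reads.
 */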
static uint64_t
cs_get_u64(struct queue_ctx *qctx, uint8_t reg)
{
   return (((uint64_t)cs_get_u32(qctx, reg + 1)) << 32) | cs_get_u32(qctx, reg);
}

static void
pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
                      struct queue_ctx *qctx, struct MALI_CS_RUN_COMPUTE *I)
{
   const char *axes[4] = {"x_axis", "y_axis", "z_axis"};

   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_COMPUTE%s.%s #%u\n",
           I->progress_increment ? ".progress_inc" : "", axes[I->task_axis],
           I->task_increment);

   ctx->indent++;

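   /* The *_select fields pick which register pair holds each descriptor
    * pointer: resource tables start at r0, FAU state at r8, the shader
    * program descriptor at r16 and the thread storage descriptor at r24,
    * stepping by one 64-bit register pair per select increment.
    */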
   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   mali_ptr fau = cs_get_u64(qctx, reg_fau);

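   /* The FAU pointer appears to pack the 48-bit address in the low bits and
    * a FAU count in the top byte; split it before decoding.
    */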
   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
                               struct queue_ctx *qctx,
                               struct MALI_CS_RUN_COMPUTE_INDIRECT *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_COMPUTE_INDIRECT%s #%u\n",
           I->progress_increment ? ".progress_inc" : "",
           I->workgroups_per_task);

   ctx->indent++;

   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   mali_ptr fau = cs_get_u64(qctx, reg_fau);

   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_tiling(struct pandecode_context *ctx, FILE *fp,
                     struct queue_ctx *qctx, struct MALI_CS_RUN_TILING *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_TILING%s", I->progress_increment ? ".progress_inc" : "");

   fprintf(fp, "\n");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u64(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);

   unsigned reg_srt = I->srt_select * 2;
   unsigned reg_fau = 8 + I->fau_select * 2;
   unsigned reg_spd = 16 + I->spd_select * 2;
   unsigned reg_tsd = 24 + I->tsd_select;

   mali_ptr srt = cs_get_u64(qctx, reg_srt);
   mali_ptr fau = cs_get_u64(qctx, reg_fau);
   mali_ptr spd = cs_get_u64(qctx, reg_spd);
   mali_ptr tsd = cs_get_u64(qctx, reg_tsd);

   if (srt)
      GENX(pandecode_resource_tables)(ctx, srt, "Fragment resources");

   if (fau) {
      uint64_t lo = fau & BITFIELD64_MASK(48);
      uint64_t hi = fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (spd) {
      GENX(pandecode_shader)
      (ctx, spd, "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, tsd, "Fragment Local Storage @%" PRIx64 ":\n",
             tsd);

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));
   pandecode_log(ctx, "Vertex position array: %" PRIx64 "\n",
                 cs_get_u64(qctx, 48));

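   /* The low three bits of the blend descriptor pointer appear to encode the
    * blend descriptor count; mask them off before decoding.
    */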
   mali_ptr blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   pandecode_log(ctx, "Vertex bounds: %u\n", cs_get_u32(qctx, 59));
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
                   struct queue_ctx *qctx, struct MALI_CS_RUN_IDVS *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_IDVS%s%s", I->progress_increment ? ".progress_inc" : "",
           I->malloc_enable ? "" : ".no_malloc");

   if (I->draw_id_register_enable)
      fprintf(fp, " r%u", I->draw_id);

   fprintf(fp, "\n");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u64(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);

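   /* IDVS runs position, varying and fragment work from a single draw; each
    * stage has its own SRT/FAU/TSD selection. The position stage always uses
    * the first register group, while the other stages are chosen by the
    * select bits in the instruction.
    */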
   unsigned reg_position_srt = 0;
   unsigned reg_position_fau = 8;
   unsigned reg_position_tsd = 24;

   unsigned reg_vary_srt = I->varying_srt_select ? 2 : 0;
   unsigned reg_vary_fau = I->varying_fau_select ? 10 : 8;
   unsigned reg_vary_tsd = I->varying_tsd_select ? 26 : 24;

   unsigned reg_frag_srt = I->fragment_srt_select ? 4 : 0;
   unsigned reg_frag_fau = 12;
   unsigned reg_frag_tsd = I->fragment_tsd_select ? 28 : 24;

   uint64_t position_srt = cs_get_u64(qctx, reg_position_srt);
   uint64_t vary_srt = cs_get_u64(qctx, reg_vary_srt);
   uint64_t frag_srt = cs_get_u64(qctx, reg_frag_srt);

   if (position_srt)
      GENX(pandecode_resource_tables)(ctx, position_srt, "Position resources");

   if (vary_srt)
      GENX(pandecode_resource_tables)(ctx, vary_srt, "Varying resources");

   if (frag_srt)
      GENX(pandecode_resource_tables)(ctx, frag_srt, "Fragment resources");

   mali_ptr position_fau = cs_get_u64(qctx, reg_position_fau);
   mali_ptr vary_fau = cs_get_u64(qctx, reg_vary_fau);
   mali_ptr fragment_fau = cs_get_u64(qctx, reg_frag_fau);

   if (position_fau) {
      uint64_t lo = position_fau & BITFIELD64_MASK(48);
      uint64_t hi = position_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Position FAU");
   }

   if (vary_fau) {
      uint64_t lo = vary_fau & BITFIELD64_MASK(48);
      uint64_t hi = vary_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Varying FAU");
   }

   if (fragment_fau) {
      uint64_t lo = fragment_fau & BITFIELD64_MASK(48);
      uint64_t hi = fragment_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (cs_get_u64(qctx, 16)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 16), "Position shader", qctx->gpu_id);
   }

   if (tiler_flags.secondary_shader) {
      uint64_t ptr = cs_get_u64(qctx, 18);

      GENX(pandecode_shader)(ctx, ptr, "Varying shader", qctx->gpu_id);
   }

   if (cs_get_u64(qctx, 20)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 20), "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_position_tsd),
             "Position Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_position_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_vary_tsd),
             "Varying Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_vary_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_frag_tsd),
             "Fragment Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_frag_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Instance offset: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));

   if (tiler_flags.secondary_shader)
      pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));

   mali_ptr blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
                       struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
{
   static const char *tile_order[] = {
      "zorder", "horizontal", "vertical", "unknown",
      "unknown", "rev_horizontal", "rev_vertical", "unknown",
      "unknown", "unknown", "unknown", "unknown",
      "unknown", "unknown", "unknown", "unknown",
   };

   fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s%s\n",
           I->enable_tem ? ".tile_enable_map_enable" : "",
           tile_order[I->tile_order],
           I->progress_increment ? ".progress_inc" : "");

   ctx->indent++;

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

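   /* The low six bits of the FBD pointer appear to carry flag/tag bits, so
    * mask them off before decoding the framebuffer descriptor.
    */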
   /* TODO: Tile enable map */
   GENX(pandecode_fbd)
   (ctx, cs_get_u64(qctx, 40) & ~0x3full, true, qctx->gpu_id);

   ctx->indent--;
}

static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
                         struct queue_ctx *qctx,
                         struct MALI_CS_RUN_FULLSCREEN *I)
{
   fprintf(fp, "RUN_FULLSCREEN%s\n",
           I->progress_increment ? ".progress_inc" : "");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u64(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);
   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   pan_unpack(PANDECODE_PTR(ctx, cs_get_u64(qctx, I->dcd), void), DRAW, dcd);
   GENX(pandecode_dcd)(ctx, &dcd, 0, qctx->gpu_id);

   ctx->indent--;
}

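/* Print a memory operand as [d<address> + offset], omitting a zero offset. */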
static void
print_indirect(unsigned address, int16_t offset, FILE *fp)
{
   if (offset)
      fprintf(fp, "[d%u + %d]", address, offset);
   else
      fprintf(fp, "[d%u]", address);
}

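/* Print the registers selected by a bitmask as a ':'-separated tuple starting
 * at the base register, or '_' when the mask is empty.
 */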
static void
print_reg_tuple(unsigned base, uint16_t mask, FILE *fp)
{
   bool first_reg = true;

   u_foreach_bit(i, mask) {
      fprintf(fp, "%sr%u", first_reg ? "" : ":", base + i);
      first_reg = false;
   }

   if (mask == 0)
      fprintf(fp, "_");
}

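/* Condition mnemonics, indexed by the condition field encoding shared by
 * BRANCH and SYNC_WAIT* instructions (assumed to match the hardware order).
 */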
static const char *conditions_str[] = {
   "le", "gt", "eq", "ne", "lt", "ge", "always",
};

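/* Disassemble and pretty-print a single 64-bit CS instruction. In verbose
 * mode the raw instruction bytes are dumped first, and output is indented by
 * the current call-stack depth. RUN_* instructions additionally decode the
 * descriptors referenced through the shadow register file.
 */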
static void
disassemble_ceu_instr(struct pandecode_context *ctx, uint64_t dword,
                      unsigned indent, bool verbose, FILE *fp,
                      struct queue_ctx *qctx)
{
   if (verbose) {
      fprintf(fp, " ");
      for (unsigned b = 0; b < 8; ++b)
         fprintf(fp, " %02x", (uint8_t)(dword >> (8 * b)));
   }

   for (int i = 0; i < indent; ++i)
      fprintf(fp, " ");

   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)&dword;
   pan_unpack(bytes, CS_BASE, base);

   switch (base.opcode) {
   case MALI_CS_OPCODE_NOP: {
      pan_unpack(bytes, CS_NOP, I);

      if (I.ignored)
         fprintf(fp, "NOP // 0x%" PRIX64 "\n", I.ignored);
      else
         fprintf(fp, "NOP\n");
      break;
   }

   case MALI_CS_OPCODE_MOVE: {
      pan_unpack(bytes, CS_MOVE, I);

      fprintf(fp, "MOVE d%u, #0x%" PRIX64 "\n", I.destination, I.immediate);
      break;
   }

   case MALI_CS_OPCODE_MOVE32: {
      pan_unpack(bytes, CS_MOVE32, I);
      fprintf(fp, "MOVE32 r%u, #0x%X\n", I.destination, I.immediate);
      break;
   }

   case MALI_CS_OPCODE_WAIT: {
      bool first = true;
      pan_unpack(bytes, CS_WAIT, I);
      fprintf(fp, "WAIT%s ", I.progress_increment ? ".progress_inc" : "");

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE: {
      pan_unpack(bytes, CS_RUN_COMPUTE, I);
      pandecode_run_compute(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_TILING: {
      pan_unpack(bytes, CS_RUN_TILING, I);
      pandecode_run_tiling(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_IDVS: {
      pan_unpack(bytes, CS_RUN_IDVS, I);
      pandecode_run_idvs(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_FRAGMENT: {
      pan_unpack(bytes, CS_RUN_FRAGMENT, I);
      pandecode_run_fragment(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_FULLSCREEN: {
      pan_unpack(bytes, CS_RUN_FULLSCREEN, I);
      pandecode_run_fullscreen(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_FINISH_TILING: {
      pan_unpack(bytes, CS_FINISH_TILING, I);
      fprintf(fp, "FINISH_TILING%s\n",
              I.progress_increment ? ".progress_inc" : "");
      break;
   }

   case MALI_CS_OPCODE_FINISH_FRAGMENT: {
      pan_unpack(bytes, CS_FINISH_FRAGMENT, I);
      fprintf(fp, "FINISH_FRAGMENT%s d%u, d%u, #%x, #%u\n",
              I.increment_fragment_completed ? ".frag_end" : "",
              I.last_heap_chunk, I.first_heap_chunk, I.wait_mask,
              I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE32, I);

      fprintf(fp, "ADD_IMMEDIATE32 r%u, r%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE64, I);

      fprintf(fp, "ADD_IMMEDIATE64 d%u, d%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CS_OPCODE_UMIN32: {
      pan_unpack(bytes, CS_UMIN32, I);

      fprintf(fp, "UMIN32 r%u, r%u, r%u\n", I.destination, I.source_1,
              I.source_2);
      break;
   }

   case MALI_CS_OPCODE_LOAD_MULTIPLE: {
      pan_unpack(bytes, CS_LOAD_MULTIPLE, I);

      fprintf(fp, "LOAD_MULTIPLE ");
      print_reg_tuple(I.base_register, I.mask, fp);
      fprintf(fp, ", ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_STORE_MULTIPLE: {
      pan_unpack(bytes, CS_STORE_MULTIPLE, I);

      fprintf(fp, "STORE_MULTIPLE ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, ", ");
      print_reg_tuple(I.base_register, I.mask, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_BRANCH: {
      pan_unpack(bytes, CS_BRANCH, I);
      fprintf(fp, "BRANCH.%s r%u, #%d\n", conditions_str[I.condition], I.value,
              I.offset);
      break;
   }

   case MALI_CS_OPCODE_SET_SB_ENTRY: {
      pan_unpack(bytes, CS_SET_SB_ENTRY, I);
      fprintf(fp, "SET_SB_ENTRY #%u, #%u\n", I.endpoint_entry, I.other_entry);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_WAIT: {
      pan_unpack(bytes, CS_PROGRESS_WAIT, I);
      fprintf(fp, "PROGRESS_WAIT d%u, #%u\n", I.source, I.queue);
      break;
   }

   case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
      pan_unpack(bytes, CS_SET_EXCEPTION_HANDLER, I);
      fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_CALL: {
      pan_unpack(bytes, CS_CALL, I);
      fprintf(fp, "CALL d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_JUMP: {
      pan_unpack(bytes, CS_JUMP, I);
      fprintf(fp, "JUMP d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_REQ_RESOURCE: {
      pan_unpack(bytes, CS_REQ_RESOURCE, I);

      fprintf(fp, "REQ_RESOURCE");
      if (I.compute)
         fprintf(fp, ".compute");
      if (I.fragment)
         fprintf(fp, ".fragment");
      if (I.tiler)
         fprintf(fp, ".tiler");
      if (I.idvs)
         fprintf(fp, ".idvs");
      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_FLUSH_CACHE2: {
      pan_unpack(bytes, CS_FLUSH_CACHE2, I);
      static const char *mode[] = {
         "nop",
         "clean",
         "INVALID",
         "clean_invalidate",
      };

      fprintf(fp, "FLUSH_CACHE2.%s_l2.%s_lsc%s r%u, #%x, #%u\n",
              mode[I.l2_flush_mode], mode[I.lsc_flush_mode],
              I.other_invalidate ? ".invalidate_other" : ".nop_other",
              I.latest_flush_id, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_ADD32: {
      pan_unpack(bytes, CS_SYNC_ADD32, I);
      fprintf(fp, "SYNC_ADD32%s%s [d%u], r%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_SET32: {
      pan_unpack(bytes, CS_SYNC_SET32, I);
      fprintf(fp, "SYNC_SET32%s%s [d%u], r%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_WAIT32: {
      pan_unpack(bytes, CS_SYNC_WAIT32, I);
      fprintf(fp, "SYNC_WAIT32.%s%s d%u, r%u\n", conditions_str[I.condition],
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CS_OPCODE_STORE_STATE: {
      static const char *states_str[] = {
         "SYSTEM_TIMESTAMP",
         "CYCLE_COUNT",
         "DISJOINT_COUNT",
         "ERROR_STATE",
      };

      pan_unpack(bytes, CS_STORE_STATE, I);
      fprintf(fp, "STORE_STATE.%s d%u, #%i, #%x, #%u\n",
              I.state >= ARRAY_SIZE(states_str) ? "UNKNOWN_STATE"
                                                : states_str[I.state],
              I.address, I.offset, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_PROT_REGION: {
      pan_unpack(bytes, CS_PROT_REGION, I);
      fprintf(fp, "PROT_REGION #%u\n", I.size);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_STORE: {
      pan_unpack(bytes, CS_PROGRESS_STORE, I);
      fprintf(fp, "PROGRESS_STORE d%u\n", I.source);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_LOAD: {
      pan_unpack(bytes, CS_PROGRESS_LOAD, I);
      fprintf(fp, "PROGRESS_LOAD d%u\n", I.destination);
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
      pan_unpack(bytes, CS_RUN_COMPUTE_INDIRECT, I);
      pandecode_run_compute_indirect(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_ERROR_BARRIER: {
      pan_unpack(bytes, CS_ERROR_BARRIER, I);
      fprintf(fp, "ERROR_BARRIER\n");
      break;
   }

   case MALI_CS_OPCODE_HEAP_SET: {
      pan_unpack(bytes, CS_HEAP_SET, I);
      fprintf(fp, "HEAP_SET d%u\n", I.address);
      break;
   }

   case MALI_CS_OPCODE_HEAP_OPERATION: {
      pan_unpack(bytes, CS_HEAP_OPERATION, I);
      const char *counter_names[] = {"vt_start", "vt_end", NULL, "frag_end"};
      fprintf(fp, "HEAP_OPERATION.%s #%x, #%d\n", counter_names[I.operation],
              I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_TRACE_POINT: {
      pan_unpack(bytes, CS_TRACE_POINT, I);
      fprintf(fp, "TRACE_POINT r%d:r%d, #%x, #%u\n", I.base_register,
              I.base_register + I.register_count - 1, I.wait_mask,
              I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_ADD64: {
      pan_unpack(bytes, CS_SYNC_ADD64, I);
      fprintf(fp, "SYNC_ADD64%s%s [d%u], d%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_SET64: {
      pan_unpack(bytes, CS_SYNC_SET64, I);
      fprintf(fp, "SYNC_SET64%s%s [d%u], d%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_WAIT64: {
      pan_unpack(bytes, CS_SYNC_WAIT64, I);

      fprintf(fp, "SYNC_WAIT64.%s%s d%u, d%u\n", conditions_str[I.condition],
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   default: {
      fprintf(fp, "UNKNOWN_%u 0x%" PRIX64 "\n", base.opcode, base.data);
      break;
   }
   }
}

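/* Redirect execution to a sub-command-stream: the target address and length
 * are read from the register file, the buffer is mapped, and IP/end are
 * updated. Returns false on a malformed (unaligned) target.
 */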
static bool
interpret_ceu_jump(struct pandecode_context *ctx, struct queue_ctx *qctx,
                   uint64_t reg_address, uint32_t reg_length)
{
   uint32_t address_lo = qctx->regs[reg_address];
   uint32_t address_hi = qctx->regs[reg_address + 1];
   uint32_t length = qctx->regs[reg_length];

   if (length % 8) {
      fprintf(stderr, "CS call alignment error\n");
      return false;
   }

   /* Map the entire subqueue now */
   uint64_t address = ((uint64_t)address_hi << 32) | address_lo;
   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, address, length);

   qctx->ip = cs;
   qctx->end = cs + (length / 8);

   /* Skip the usual IP update */
   return true;
}

/*
 * Interpret a single instruction of the CS, updating the register file,
 * instruction pointer, and call stack. Memory access and GPU controls are
 * ignored for now.
 *
 * Returns true if execution should continue.
 */
static bool
interpret_ceu_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
{
   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)qctx->ip;
   pan_unpack(bytes, CS_BASE, base);

   assert(qctx->ip < qctx->end);

   switch (base.opcode) {
   case MALI_CS_OPCODE_MOVE: {
      pan_unpack(bytes, CS_MOVE, I);

      qctx->regs[I.destination + 0] = (uint32_t)I.immediate;
      qctx->regs[I.destination + 1] = (uint32_t)(I.immediate >> 32);
      break;
   }

   case MALI_CS_OPCODE_MOVE32: {
      pan_unpack(bytes, CS_MOVE32, I);

      qctx->regs[I.destination] = I.immediate;
      break;
   }

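   /* LOAD_MULTIPLE is interpreted so that values loaded from GPU memory land
    * in the shadow register file and are visible to later decodes.
    */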
   case MALI_CS_OPCODE_LOAD_MULTIPLE: {
      pan_unpack(bytes, CS_LOAD_MULTIPLE, I);
      mali_ptr addr =
         ((uint64_t)qctx->regs[I.address + 1] << 32) | qctx->regs[I.address];
      addr += I.offset;

      uint32_t *src =
         pandecode_fetch_gpu_mem(ctx, addr, util_last_bit(I.mask) * 4);

      for (uint32_t i = 0; i < 16; i++) {
         if (I.mask & BITFIELD_BIT(i))
            qctx->regs[I.base_register + i] = src[i];
      }
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE32, I);

      qctx->regs[I.destination] = qctx->regs[I.source] + I.immediate;
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE64, I);

      int64_t value =
         (qctx->regs[I.source] | ((int64_t)qctx->regs[I.source + 1] << 32)) +
         I.immediate;

      qctx->regs[I.destination] = value;
      qctx->regs[I.destination + 1] = value >> 32;
      break;
   }

   case MALI_CS_OPCODE_CALL: {
      pan_unpack(bytes, CS_CALL, I);

      if (qctx->call_stack_depth == MAX_CALL_STACK_DEPTH) {
         fprintf(stderr, "CS call stack overflow\n");
         return false;
      }

      assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);

      qctx->ip++;

      /* Note: tail calls are not optimized in the hardware. */
      assert(qctx->ip <= qctx->end);

      unsigned depth = qctx->call_stack_depth++;

      qctx->call_stack[depth].lr = qctx->ip;
      qctx->call_stack[depth].end = qctx->end;

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CS_OPCODE_JUMP: {
      pan_unpack(bytes, CS_JUMP, I);

      if (qctx->call_stack_depth == 0) {
         fprintf(stderr, "Cannot jump from the entrypoint\n");
         return false;
      }

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   default:
      break;
   }

   /* Update IP first to point to the next instruction, so call doesn't
    * require special handling (even for tail calls).
    */
   qctx->ip++;

   while (qctx->ip == qctx->end) {
      /* Graceful termination */
      if (qctx->call_stack_depth == 0)
         return false;

      /* Pop off the call stack */
      unsigned old_depth = --qctx->call_stack_depth;

      qctx->ip = qctx->call_stack[old_depth].lr;
      qctx->end = qctx->call_stack[old_depth].end;
   }

   return true;
}

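/* Entry point: disassemble and partially interpret a command-stream buffer,
 * starting from the caller-provided snapshot of the register file.
 */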
void
GENX(pandecode_cs)(struct pandecode_context *ctx, mali_ptr queue, uint32_t size,
                   unsigned gpu_id, uint32_t *regs)
{
   pandecode_dump_file_open(ctx);

   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, queue, size);

   /* Mali-G610 has 96 registers. Other devices not yet supported, we can make
    * this configurable later when we encounter new Malis.
    */
   struct queue_ctx qctx = {
      .nr_regs = 96,
      .regs = regs,
      .ip = cs,
      .end = cs + (size / 8),
      .gpu_id = gpu_id,

      /* If this is a kernel mode queue, we don't see the root ring buffer and
       * we must adjust the initial call stack depth accordingly.
       */
      .call_stack_depth = ctx->usermode_queue ? 0 : 1,
   };

   if (size) {
      do {
         disassemble_ceu_instr(ctx, *(qctx.ip), 1 + qctx.call_stack_depth, true,
                               ctx->dump_stream, &qctx);
      } while (interpret_ceu_instr(ctx, &qctx));
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}
#endif