/*
 * Copyright (C) 2022-2023 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "genxml/gen_macros.h"
#include "decode.h"

#if PAN_ARCH >= 10
/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)

struct queue_ctx {
   /* Size of CSHWIF register file in 32-bit registers */
   unsigned nr_regs;

   /* CSHWIF register file */
   uint32_t *regs;

   /* Current instruction pointer (CPU pointer for convenience) */
   uint64_t *ip;

   /* Current instruction end pointer */
   uint64_t *end;

   /* Call stack. Depth=0 means root */
   struct {
      /* Link register to return to */
      uint64_t *lr;

      /* End pointer; there is a return (or exit) after it */
      uint64_t *end;
   } call_stack[MAX_CALL_STACK_DEPTH];
   uint8_t call_stack_depth;

   unsigned gpu_id;
};

static uint32_t
cs_get_u32(struct queue_ctx *qctx, uint8_t reg)
{
   assert(reg < qctx->nr_regs);
   return qctx->regs[reg];
}

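/* 64-bit "d" registers are pairs of consecutive 32-bit registers, low word
 * first: e.g. with r4 = 0xCAFE0000 and r5 = 0x12, cs_get_u64(qctx, 4) yields
 * 0x12CAFE0000.
 */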
static uint64_t
cs_get_u64(struct queue_ctx *qctx, uint8_t reg)
{
   return (((uint64_t)cs_get_u32(qctx, reg + 1)) << 32) | cs_get_u32(qctx, reg);
}

static void
pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
                      struct queue_ctx *qctx, struct MALI_CS_RUN_COMPUTE *I)
{
   const char *axes[4] = {"x_axis", "y_axis", "z_axis", "unknown"};

   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_COMPUTE%s.%s #%u\n",
           I->progress_increment ? ".progress_inc" : "", axes[I->task_axis],
           I->task_increment);

   ctx->indent++;

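   /* Each *_select field picks which copy of the descriptor pointer to use;
    * the pointers occupy 64-bit register pairs, hence the scaling by 2.
    */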
   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   mali_ptr fau = cs_get_u64(qctx, reg_fau);

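   /* The decoder treats a FAU register as a 48-bit address with a count in
    * the top byte: e.g. 0x0300000010002000 is 3 FAU entries at 0x10002000.
    */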
   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
                               struct queue_ctx *qctx,
                               struct MALI_CS_RUN_COMPUTE_INDIRECT *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_COMPUTE_INDIRECT%s #%u\n",
           I->progress_increment ? ".progress_inc" : "",
           I->workgroups_per_task);

   ctx->indent++;

   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   mali_ptr fau = cs_get_u64(qctx, reg_fau);

   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_tiling(struct pandecode_context *ctx, FILE *fp,
                     struct queue_ctx *qctx, struct MALI_CS_RUN_TILING *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_TILING%s\n", I->progress_increment ? ".progress_inc" : "");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u64(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);

   unsigned reg_srt = I->srt_select * 2;
   unsigned reg_fau = 8 + I->fau_select * 2;
   unsigned reg_spd = 16 + I->spd_select * 2;
   unsigned reg_tsd = 24 + I->tsd_select * 2;

   mali_ptr srt = cs_get_u64(qctx, reg_srt);
   mali_ptr fau = cs_get_u64(qctx, reg_fau);
   mali_ptr spd = cs_get_u64(qctx, reg_spd);
   mali_ptr tsd = cs_get_u64(qctx, reg_tsd);

   if (srt)
      GENX(pandecode_resource_tables)(ctx, srt, "Fragment resources");

   if (fau) {
      uint64_t lo = fau & BITFIELD64_MASK(48);
      uint64_t hi = fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (spd) {
      GENX(pandecode_shader)
      (ctx, spd, "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, tsd, "Fragment Local Storage @%" PRIx64 ":\n",
             tsd);

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));
   pandecode_log(ctx, "Vertex position array: %" PRIx64 "\n",
                 cs_get_u64(qctx, 48));

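   /* The low three bits of the blend descriptor register are decoded as the
    * descriptor count, the rest as the descriptor array address.
    */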
   mali_ptr blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   pandecode_log(ctx, "Vertex bounds: %u\n", cs_get_u32(qctx, 59));
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
                   struct queue_ctx *qctx, struct MALI_CS_RUN_IDVS *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_IDVS%s%s", I->progress_increment ? ".progress_inc" : "",
           I->malloc_enable ? "" : ".no_malloc");

   if (I->draw_id_register_enable)
      fprintf(fp, " r%u", I->draw_id);

   fprintf(fp, "\n");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u64(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);

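   /* The position stage always uses the first copy of each descriptor; the
    * varying and fragment stages may select an alternate copy.
    */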
   unsigned reg_position_srt = 0;
   unsigned reg_position_fau = 8;
   unsigned reg_position_tsd = 24;

   unsigned reg_vary_srt = I->varying_srt_select ? 2 : 0;
   unsigned reg_vary_fau = I->varying_fau_select ? 10 : 8;
   unsigned reg_vary_tsd = I->varying_tsd_select ? 26 : 24;

   unsigned reg_frag_srt = I->fragment_srt_select ? 4 : 0;
   unsigned reg_frag_fau = 12;
   unsigned reg_frag_tsd = I->fragment_tsd_select ? 28 : 24;

   uint64_t position_srt = cs_get_u64(qctx, reg_position_srt);
   uint64_t vary_srt = cs_get_u64(qctx, reg_vary_srt);
   uint64_t frag_srt = cs_get_u64(qctx, reg_frag_srt);

   if (position_srt)
      GENX(pandecode_resource_tables)(ctx, position_srt, "Position resources");

   if (vary_srt)
      GENX(pandecode_resource_tables)(ctx, vary_srt, "Varying resources");

   if (frag_srt)
      GENX(pandecode_resource_tables)(ctx, frag_srt, "Fragment resources");

   mali_ptr position_fau = cs_get_u64(qctx, reg_position_fau);
   mali_ptr vary_fau = cs_get_u64(qctx, reg_vary_fau);
   mali_ptr fragment_fau = cs_get_u64(qctx, reg_frag_fau);

   if (position_fau) {
      uint64_t lo = position_fau & BITFIELD64_MASK(48);
      uint64_t hi = position_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Position FAU");
   }

   if (vary_fau) {
      uint64_t lo = vary_fau & BITFIELD64_MASK(48);
      uint64_t hi = vary_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Varying FAU");
   }

   if (fragment_fau) {
      uint64_t lo = fragment_fau & BITFIELD64_MASK(48);
      uint64_t hi = fragment_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (cs_get_u64(qctx, 16)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 16), "Position shader", qctx->gpu_id);
   }

   if (tiler_flags.secondary_shader) {
      uint64_t ptr = cs_get_u64(qctx, 18);

      GENX(pandecode_shader)(ctx, ptr, "Varying shader", qctx->gpu_id);
   }

   if (cs_get_u64(qctx, 20)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 20), "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_position_tsd),
             "Position Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_position_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_vary_tsd),
             "Varying Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_vary_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_frag_tsd),
             "Fragment Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_frag_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Instance offset: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));

   if (tiler_flags.secondary_shader)
      pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));

   mali_ptr blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
                       struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
{
   static const char *tile_order[] = {
      "zorder",  "horizontal",     "vertical",     "unknown",
      "unknown", "rev_horizontal", "rev_vertical", "unknown",
      "unknown", "unknown",        "unknown",      "unknown",
      "unknown", "unknown",        "unknown",      "unknown",
   };

   fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s%s\n",
           I->enable_tem ? ".tile_enable_map_enable" : "",
           tile_order[I->tile_order],
           I->progress_increment ? ".progress_inc" : "");

   ctx->indent++;

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   /* TODO: Tile enable map */
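   /* The low six bits of the FBD pointer register are masked off before the
    * framebuffer descriptor is dereferenced.
    */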
   GENX(pandecode_fbd)
   (ctx, cs_get_u64(qctx, 40) & ~0x3full, true, qctx->gpu_id);

   ctx->indent--;
}

static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
                         struct queue_ctx *qctx,
                         struct MALI_CS_RUN_FULLSCREEN *I)
{
   fprintf(fp, "RUN_FULLSCREEN%s\n",
           I->progress_increment ? ".progress_inc" : "");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u64(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);
   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   pan_unpack(PANDECODE_PTR(ctx, cs_get_u64(qctx, I->dcd), void), DRAW, dcd);
   GENX(pandecode_dcd)(ctx, &dcd, 0, qctx->gpu_id);

   ctx->indent--;
}

static void
print_indirect(unsigned address, int16_t offset, FILE *fp)
{
   if (offset)
      fprintf(fp, "[d%u + %d]", address, offset);
   else
      fprintf(fp, "[d%u]", address);
}

static void
print_reg_tuple(unsigned base, uint16_t mask, FILE *fp)
{
   bool first_reg = true;

   u_foreach_bit(i, mask) {
      fprintf(fp, "%sr%u", first_reg ? "" : ":", base + i);
      first_reg = false;
   }

   if (mask == 0)
      fprintf(fp, "_");
}

static const char *conditions_str[] = {
   "le", "gt", "eq", "ne", "lt", "ge", "always",
};

static void
disassemble_ceu_instr(struct pandecode_context *ctx, uint64_t dword,
                      unsigned indent, bool verbose, FILE *fp,
                      struct queue_ctx *qctx)
{
   if (verbose) {
      fprintf(fp, " ");
      for (unsigned b = 0; b < 8; ++b)
         fprintf(fp, " %02x", (uint8_t)(dword >> (8 * b)));
   }

   for (int i = 0; i < indent; ++i)
      fprintf(fp, "  ");

   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)&dword;
   pan_unpack(bytes, CS_BASE, base);

   switch (base.opcode) {
   case MALI_CS_OPCODE_NOP: {
      pan_unpack(bytes, CS_NOP, I);

      if (I.ignored)
         fprintf(fp, "NOP // 0x%" PRIX64 "\n", I.ignored);
      else
         fprintf(fp, "NOP\n");
      break;
   }

   case MALI_CS_OPCODE_MOVE: {
      pan_unpack(bytes, CS_MOVE, I);

      fprintf(fp, "MOVE d%u, #0x%" PRIX64 "\n", I.destination, I.immediate);
      break;
   }

   case MALI_CS_OPCODE_MOVE32: {
      pan_unpack(bytes, CS_MOVE32, I);
      fprintf(fp, "MOVE32 r%u, #0x%X\n", I.destination, I.immediate);
      break;
   }

   case MALI_CS_OPCODE_WAIT: {
      bool first = true;
      pan_unpack(bytes, CS_WAIT, I);
      fprintf(fp, "WAIT%s ", I.progress_increment ? ".progress_inc" : "");

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE: {
      pan_unpack(bytes, CS_RUN_COMPUTE, I);
      pandecode_run_compute(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_TILING: {
      pan_unpack(bytes, CS_RUN_TILING, I);
      pandecode_run_tiling(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_IDVS: {
      pan_unpack(bytes, CS_RUN_IDVS, I);
      pandecode_run_idvs(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_FRAGMENT: {
      pan_unpack(bytes, CS_RUN_FRAGMENT, I);
      pandecode_run_fragment(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_FULLSCREEN: {
      pan_unpack(bytes, CS_RUN_FULLSCREEN, I);
      pandecode_run_fullscreen(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_FINISH_TILING: {
      pan_unpack(bytes, CS_FINISH_TILING, I);
      fprintf(fp, "FINISH_TILING%s\n",
              I.progress_increment ? ".progress_inc" : "");
      break;
   }

   case MALI_CS_OPCODE_FINISH_FRAGMENT: {
      pan_unpack(bytes, CS_FINISH_FRAGMENT, I);
556       fprintf(fp, "FINISH_FRAGMENT.%s, d%u, d%u, #%x, #%u\n",
557               I.increment_fragment_completed ? ".frag_end" : "",
558               I.last_heap_chunk, I.first_heap_chunk, I.wait_mask,
559               I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE32, I);

      fprintf(fp, "ADD_IMMEDIATE32 r%u, r%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE64, I);

      fprintf(fp, "ADD_IMMEDIATE64 d%u, d%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CS_OPCODE_UMIN32: {
      pan_unpack(bytes, CS_UMIN32, I);

      fprintf(fp, "UMIN32 r%u, r%u, r%u\n", I.destination, I.source_1,
              I.source_2);
      break;
   }

   case MALI_CS_OPCODE_LOAD_MULTIPLE: {
      pan_unpack(bytes, CS_LOAD_MULTIPLE, I);

      fprintf(fp, "LOAD_MULTIPLE ");
      print_reg_tuple(I.base_register, I.mask, fp);
      fprintf(fp, ", ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_STORE_MULTIPLE: {
      pan_unpack(bytes, CS_STORE_MULTIPLE, I);

      fprintf(fp, "STORE_MULTIPLE ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, ", ");
      print_reg_tuple(I.base_register, I.mask, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_BRANCH: {
      pan_unpack(bytes, CS_BRANCH, I);
      fprintf(fp, "BRANCH.%s r%u, #%d\n", conditions_str[I.condition], I.value,
              I.offset);
      break;
   }

   case MALI_CS_OPCODE_SET_SB_ENTRY: {
      pan_unpack(bytes, CS_SET_SB_ENTRY, I);
      fprintf(fp, "SET_SB_ENTRY #%u, #%u\n", I.endpoint_entry, I.other_entry);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_WAIT: {
      pan_unpack(bytes, CS_PROGRESS_WAIT, I);
      fprintf(fp, "PROGRESS_WAIT d%u, #%u\n", I.source, I.queue);
      break;
   }

   case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
      pan_unpack(bytes, CS_SET_EXCEPTION_HANDLER, I);
      fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_CALL: {
      pan_unpack(bytes, CS_CALL, I);
      fprintf(fp, "CALL d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_JUMP: {
      pan_unpack(bytes, CS_JUMP, I);
      fprintf(fp, "JUMP d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_REQ_RESOURCE: {
      pan_unpack(bytes, CS_REQ_RESOURCE, I);

      fprintf(fp, "REQ_RESOURCE");
      if (I.compute)
         fprintf(fp, ".compute");
      if (I.fragment)
         fprintf(fp, ".fragment");
      if (I.tiler)
         fprintf(fp, ".tiler");
      if (I.idvs)
         fprintf(fp, ".idvs");
      fprintf(fp, "\n");
      break;
   }

   case MALI_CS_OPCODE_FLUSH_CACHE2: {
      pan_unpack(bytes, CS_FLUSH_CACHE2, I);
      static const char *mode[] = {
         "nop",
         "clean",
         "INVALID",
         "clean_invalidate",
      };

      fprintf(fp, "FLUSH_CACHE2.%s_l2.%s_lsc%s r%u, #%x, #%u\n",
              mode[I.l2_flush_mode], mode[I.lsc_flush_mode],
              I.other_invalidate ? ".invalidate_other" : ".nop_other",
              I.latest_flush_id, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_ADD32: {
      pan_unpack(bytes, CS_SYNC_ADD32, I);
      fprintf(fp, "SYNC_ADD32%s%s [d%u], r%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_SET32: {
      pan_unpack(bytes, CS_SYNC_SET32, I);
689       fprintf(fp, "SYNC_SET32.%s%s [d%u], r%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_WAIT32: {
      pan_unpack(bytes, CS_SYNC_WAIT32, I);
      fprintf(fp, "SYNC_WAIT32.%s%s d%u, r%u\n", conditions_str[I.condition],
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CS_OPCODE_STORE_STATE: {
      static const char *states_str[] = {
         "SYSTEM_TIMESTAMP",
         "CYCLE_COUNT",
         "DISJOINT_COUNT",
         "ERROR_STATE",
      };

      pan_unpack(bytes, CS_STORE_STATE, I);
      fprintf(fp, "STORE_STATE.%s d%u, #%i, #%x, #%u\n",
              I.state >= ARRAY_SIZE(states_str) ? "UNKNOWN_STATE"
                                                : states_str[I.state],
              I.address, I.offset, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_PROT_REGION: {
      pan_unpack(bytes, CS_PROT_REGION, I);
      fprintf(fp, "PROT_REGION #%u\n", I.size);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_STORE: {
      pan_unpack(bytes, CS_PROGRESS_STORE, I);
      fprintf(fp, "PROGRESS_STORE d%u\n", I.source);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_LOAD: {
      pan_unpack(bytes, CS_PROGRESS_LOAD, I);
      fprintf(fp, "PROGRESS_LOAD d%u\n", I.destination);
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
      pan_unpack(bytes, CS_RUN_COMPUTE_INDIRECT, I);
      pandecode_run_compute_indirect(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_ERROR_BARRIER: {
      pan_unpack(bytes, CS_ERROR_BARRIER, I);
      fprintf(fp, "ERROR_BARRIER\n");
      break;
   }

   case MALI_CS_OPCODE_HEAP_SET: {
      pan_unpack(bytes, CS_HEAP_SET, I);
      fprintf(fp, "HEAP_SET d%u\n", I.address);
      break;
   }

   case MALI_CS_OPCODE_HEAP_OPERATION: {
      pan_unpack(bytes, CS_HEAP_OPERATION, I);
      const char *counter_names[] = {"vt_start", "vt_end", NULL, "frag_end"};
      fprintf(fp, "HEAP_OPERATION.%s #%x, #%d\n", counter_names[I.operation],
              I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_TRACE_POINT: {
      pan_unpack(bytes, CS_TRACE_POINT, I);
      fprintf(fp, "TRACE_POINT r%d:r%d, #%x, #%u\n", I.base_register,
              I.base_register + I.register_count - 1, I.wait_mask,
              I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_ADD64: {
      pan_unpack(bytes, CS_SYNC_ADD64, I);
      fprintf(fp, "SYNC_ADD64%s%s [d%u], d%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_SET64: {
      pan_unpack(bytes, CS_SYNC_SET64, I);
      fprintf(fp, "SYNC_SET64%s%s [d%u], d%u, #%x, #%u\n",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_WAIT64: {
      pan_unpack(bytes, CS_SYNC_WAIT64, I);

      fprintf(fp, "SYNC_WAIT64.%s%s d%u, d%u\n", conditions_str[I.condition],
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   default: {
      fprintf(fp, "UNKNOWN_%u 0x%" PRIX64 "\n", base.opcode, base.data);
      break;
   }
   }
}

static bool
interpret_ceu_jump(struct pandecode_context *ctx, struct queue_ctx *qctx,
                   uint64_t reg_address, uint32_t reg_length)
{
   uint32_t address_lo = qctx->regs[reg_address];
   uint32_t address_hi = qctx->regs[reg_address + 1];
   uint32_t length = qctx->regs[reg_length];

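   /* Each CS instruction is 64 bits wide, so a valid target buffer must be a
    * whole number of instructions long.
    */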
   if (length % 8) {
      fprintf(stderr, "CS call alignment error\n");
      return false;
   }

   /* Map the entire subqueue now */
   uint64_t address = ((uint64_t)address_hi << 32) | address_lo;
   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, address, length);

   qctx->ip = cs;
   qctx->end = cs + (length / 8);

   /* Skip the usual IP update */
   return true;
}

/*
 * Interpret a single instruction of the CS, updating the register file,
 * instruction pointer, and call stack. Memory access and GPU controls are
 * ignored for now.
 *
 * Returns true if execution should continue.
 */
static bool
interpret_ceu_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
{
   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)qctx->ip;
   pan_unpack(bytes, CS_BASE, base);

   assert(qctx->ip < qctx->end);

   switch (base.opcode) {
   case MALI_CS_OPCODE_MOVE: {
      pan_unpack(bytes, CS_MOVE, I);

      qctx->regs[I.destination + 0] = (uint32_t)I.immediate;
      qctx->regs[I.destination + 1] = (uint32_t)(I.immediate >> 32);
      break;
   }

   case MALI_CS_OPCODE_MOVE32: {
      pan_unpack(bytes, CS_MOVE32, I);

      qctx->regs[I.destination] = I.immediate;
      break;
   }

   case MALI_CS_OPCODE_LOAD_MULTIPLE: {
      pan_unpack(bytes, CS_LOAD_MULTIPLE, I);
      mali_ptr addr =
         ((uint64_t)qctx->regs[I.address + 1] << 32) | qctx->regs[I.address];
      addr += I.offset;

      uint32_t *src =
         pandecode_fetch_gpu_mem(ctx, addr, util_last_bit(I.mask) * 4);

      for (uint32_t i = 0; i < 16; i++) {
         if (I.mask & BITFIELD_BIT(i))
            qctx->regs[I.base_register + i] = src[i];
      }
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE32, I);

      qctx->regs[I.destination] = qctx->regs[I.source] + I.immediate;
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CS_ADD_IMMEDIATE64, I);

      int64_t value =
         (qctx->regs[I.source] | ((int64_t)qctx->regs[I.source + 1] << 32)) +
         I.immediate;

      qctx->regs[I.destination] = value;
      qctx->regs[I.destination + 1] = value >> 32;
      break;
   }

   case MALI_CS_OPCODE_CALL: {
      pan_unpack(bytes, CS_CALL, I);

      if (qctx->call_stack_depth == MAX_CALL_STACK_DEPTH) {
         fprintf(stderr, "CS call stack overflow\n");
         return false;
      }

      assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);

      qctx->ip++;

      /* Note: tail calls are not optimized in the hardware. */
      assert(qctx->ip <= qctx->end);

      unsigned depth = qctx->call_stack_depth++;

      qctx->call_stack[depth].lr = qctx->ip;
      qctx->call_stack[depth].end = qctx->end;

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CS_OPCODE_JUMP: {
      pan_unpack(bytes, CS_JUMP, I);

      if (qctx->call_stack_depth == 0) {
         fprintf(stderr, "Cannot jump from the entrypoint\n");
         return false;
      }

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   default:
      break;
   }

   /* Update IP first to point to the next instruction, so call doesn't
    * require special handling (even for tail calls).
    */
   qctx->ip++;

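   /* A called sequence can end exactly at its END pointer, and so can the
    * frame that called it, so unwinding may pop several frames at once;
    * hence a loop rather than a single check.
    */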
   while (qctx->ip == qctx->end) {
      /* Graceful termination */
      if (qctx->call_stack_depth == 0)
         return false;

      /* Pop off the call stack */
      unsigned old_depth = --qctx->call_stack_depth;

      qctx->ip = qctx->call_stack[old_depth].lr;
      qctx->end = qctx->call_stack[old_depth].end;
   }

   return true;
}

void
GENX(pandecode_cs)(struct pandecode_context *ctx, mali_ptr queue, uint32_t size,
                   unsigned gpu_id, uint32_t *regs)
{
   pandecode_dump_file_open(ctx);

   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, queue, size);

   /* Mali-G610 has 96 registers. Other devices are not yet supported; we can
    * make this configurable later when we encounter new Malis.
    */
   struct queue_ctx qctx = {
      .nr_regs = 96,
      .regs = regs,
      .ip = cs,
      .end = cs + (size / 8),
      .gpu_id = gpu_id,

      /* If this is a kernel mode queue, we don't see the root ring buffer and
       * we must adjust the initial call stack depth accordingly.
       */
      .call_stack_depth = ctx->usermode_queue ? 0 : 1,
   };

   if (size) {
      do {
         disassemble_ceu_instr(ctx, *(qctx.ip), 1 + qctx.call_stack_depth, true,
                               ctx->dump_stream, &qctx);
      } while (interpret_ceu_instr(ctx, &qctx));
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}
#endif