xref: /aosp_15_r20/external/mesa3d/src/freedreno/decode/rdcompiler-utils.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Igalia S.L.
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include <assert.h>
7 #include <err.h>
8 #include <getopt.h>
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <sys/types.h>
14 
15 #include "redump.h"
16 
17 #include "util/u_math.h"
18 
19 #include "adreno_common.xml.h"
20 #include "adreno_pm4.xml.h"
21 #include "freedreno_pm4.h"
22 
23 #include "a6xx.xml.h"
24 
25 #include "ir3/ir3_assembler.h"
26 #include "ir3/ir3_compiler.h"
27 #include "ir3/ir3_shader.h"
28 
29 #include "util/list.h"
30 #include "util/vma.h"
31 
32 struct cmdstream {
33    struct list_head link;
34 
35    uint32_t *mem;
36    uint32_t total_size;
37    uint32_t cur;
38 
39    uint64_t iova;
40 };
41 
42 static uint64_t
cs_get_cur_iova(struct cmdstream * cs)43 cs_get_cur_iova(struct cmdstream *cs)
44 {
45    return cs->iova + cs->cur * sizeof(uint32_t);
46 }
47 
48 struct wrbuf {
49    struct list_head link;
50 
51    uint64_t iova;
52    uint64_t size;
53    uint64_t clear;
54    const char *name;
55 };
56 
57 struct replay_context {
58    void *mem_ctx;
59 
60    struct util_vma_heap vma;
61 
62    struct cmdstream *submit_cs;
63    struct cmdstream *state_cs;
64    struct cmdstream *shader_cs;
65 
66    struct cmdstream *shader_log;
67    struct cmdstream *cp_log;
68 
69    struct list_head cs_list;
70 
71    struct list_head wrbuf_list;
72 
73    struct ir3_compiler *compiler;
74 
75    struct hash_table_u64 *compiled_shaders;
76 
77    const char *output_name;
78 };
79 
80 static void
pkt(struct cmdstream * cs,uint32_t payload)81 pkt(struct cmdstream *cs, uint32_t payload)
82 {
83    assert(cs->cur <= cs->total_size);
84    cs->mem[cs->cur++] = payload;
85 }
86 
87 static void
pkt_qw(struct cmdstream * cs,uint64_t payload)88 pkt_qw(struct cmdstream *cs, uint64_t payload)
89 {
90    pkt(cs, payload);
91    pkt(cs, payload >> 32);
92 }
93 
94 static uint64_t
pkt_blob(struct cmdstream * cs,void * payload,uint32_t size,uint32_t alignment)95 pkt_blob(struct cmdstream *cs, void *payload, uint32_t size, uint32_t alignment)
96 {
97    cs->cur = align(cs->cur, alignment / sizeof(uint32_t));
98    uint64_t start_iova = cs_get_cur_iova(cs);
99 
100    memcpy(cs->mem + cs->cur, payload, size);
101    cs->cur += size;
102 
103    return start_iova;
104 }
105 
106 static void
pkt4(struct cmdstream * cs,uint16_t regindx,uint16_t cnt,uint32_t payload)107 pkt4(struct cmdstream *cs, uint16_t regindx, uint16_t cnt, uint32_t payload)
108 {
109    pkt(cs, pm4_pkt4_hdr(regindx, cnt));
110    pkt(cs, payload);
111 }
112 
113 static void
pkt7(struct cmdstream * cs,uint8_t opcode,uint16_t cnt)114 pkt7(struct cmdstream *cs, uint8_t opcode, uint16_t cnt)
115 {
116    pkt(cs, pm4_pkt7_hdr(opcode, cnt));
117 }
118 
/* Header written in front of every section of the output file. */
struct rd_section {
   /* Section kind; the writers in this file store RD_* values here. */
   uint32_t type;
   /* Number of payload bytes that follow the header. */
   uint32_t size;
};
123 
124 static struct cmdstream *
cs_alloc(struct replay_context * ctx,uint32_t size)125 cs_alloc(struct replay_context *ctx, uint32_t size)
126 {
127    struct cmdstream *cs = (struct cmdstream *) calloc(1, sizeof(struct cmdstream));
128    cs->mem = (uint32_t *)calloc(1, size);
129    cs->total_size = size / sizeof(uint32_t);
130    cs->cur = 0;
131    cs->iova = util_vma_heap_alloc(&ctx->vma, size, 4096);
132 
133    assert(cs->iova != 0);
134 
135    list_addtail(&cs->link, &ctx->cs_list);
136 
137    return cs;
138 }
139 
140 static void
rd_write_gpu_addr_section(FILE * out,struct cmdstream * cs,enum rd_sect_type section)141 rd_write_gpu_addr_section(FILE *out, struct cmdstream *cs, enum rd_sect_type section)
142 {
143    const uint32_t packet[] = {(uint32_t)cs->iova,
144                               (uint32_t)(cs->cur * sizeof(uint32_t)),
145                               (uint32_t)(cs->iova >> 32)};
146    struct rd_section section_address = {.type = section,
147                                         .size = sizeof(packet)};
148    fwrite(&section_address, sizeof(section_address), 1, out);
149    fwrite(packet, sizeof(packet), 1, out);
150 }
151 
152 static void
rd_write_cs_buffer(FILE * out,struct cmdstream * cs)153 rd_write_cs_buffer(FILE *out, struct cmdstream *cs)
154 {
155    if (cs->cur == 0)
156       return;
157 
158    rd_write_gpu_addr_section(out, cs, RD_GPUADDR);
159 
160    struct rd_section section_contents = {.type = RD_BUFFER_CONTENTS,
161                                          .size = uint32_t(cs->cur * sizeof(uint32_t))};
162 
163    fwrite(&section_contents, sizeof(section_contents), 1, out);
164    fwrite(cs->mem, sizeof(uint32_t), cs->cur, out);
165 }
166 
167 static void
rd_write_cs_submit(FILE * out,struct cmdstream * cs)168 rd_write_cs_submit(FILE *out, struct cmdstream *cs)
169 {
170    const uint32_t packet[] = {(uint32_t)cs->iova, cs->cur,
171                               (uint32_t)(cs->iova >> 32)};
172    struct rd_section section_cmdstream = {.type = RD_CMDSTREAM_ADDR,
173                                           .size = sizeof(packet)};
174 
175    fwrite(&section_cmdstream, sizeof(section_cmdstream), 1, out);
176    fwrite(packet, sizeof(packet), 1, out);
177 }
178 
179 static void
rd_write_wrbuffer(FILE * out,struct wrbuf * wrbuf)180 rd_write_wrbuffer(FILE *out, struct wrbuf *wrbuf)
181 {
182    uint32_t name_len = strlen(wrbuf->name) + 1;
183    struct rd_section section = {.type = RD_WRBUFFER,
184                                 .size = (uint32_t)(sizeof(uint64_t) * 3) + name_len};
185    fwrite(&section, sizeof(section), 1, out);
186    fwrite(&wrbuf->iova, sizeof(uint64_t), 1, out);
187    fwrite(&wrbuf->size, sizeof(uint64_t), 1, out);
188    fwrite(&wrbuf->clear, sizeof(uint64_t), 1, out);
189    fwrite(wrbuf->name, sizeof(char), name_len, out);
190 }
191 
/* Prints the command-line help to stderr and terminates the process with
 * exit status 2. `name` is the program name shown in the usage line.
 */
static void
print_usage(const char *name)
{
   /* clang-format off */
   fprintf(stderr, "Usage:\n\n"
           "\t%s [OPTIONS]... FILE...\n\n",
           name);
   fputs("Options:\n"
         "\t    --vastart=offset\n"
         "\t    --vasize=size\n"
         "\t-h, --help             - show this message\n",
         stderr);
   /* clang-format on */
   exit(2);
}
206 
/* Return values for long options that have no single-letter equivalent. */
#define OPT_VA_START 1000
#define OPT_VA_SIZE  1001

/* Long options accepted by replay_context_init(). getopt_long() requires
 * the table to end with an all-zero element; without it an unrecognized
 * option makes it read past the end of the array.
 */
/* clang-format off */
static const struct option opts[] = {
      { "vastart",  required_argument, 0, OPT_VA_START },
      { "vasize",   required_argument, 0, OPT_VA_SIZE },
      { "help",     no_argument,       0, 'h' },
      { 0,          0,                 0, 0 },
};
/* clang-format on */
217 
218 static void
replay_context_init(struct replay_context * ctx,struct fd_dev_id * dev_id,int argc,char ** argv)219 replay_context_init(struct replay_context *ctx, struct fd_dev_id *dev_id,
220                     int argc, char **argv)
221 {
222    uint64_t va_start = 0;
223    uint64_t va_size = 0;
224 
225    int c;
226    while ((c = getopt_long(argc, argv, "h", opts, NULL)) != -1) {
227       switch (c) {
228       case OPT_VA_START:
229          va_start = strtoull(optarg, NULL, 0);
230          break;
231       case OPT_VA_SIZE:
232          va_size = strtoull(optarg, NULL, 0);
233          break;
234       case 'h':
235       default:
236          print_usage(argv[0]);
237       }
238    }
239 
240    if (optind < argc) {
241       ctx->output_name = argv[optind];
242    } else {
243    }
244 
245    if (!va_start || !va_size || !ctx->output_name) {
246       print_usage(argv[0]);
247       exit(1);
248    }
249 
250    ctx->mem_ctx = ralloc_context(NULL);
251    list_inithead(&ctx->cs_list);
252    list_inithead(&ctx->wrbuf_list);
253 
254    util_vma_heap_init(&ctx->vma, va_start, ROUND_DOWN_TO(va_size, 4096));
255 
256    ctx->submit_cs = cs_alloc(ctx, 1024 * 1024);
257    ctx->state_cs = cs_alloc(ctx, 2 * 1024 * 1024);
258    ctx->shader_cs = cs_alloc(ctx, 8 * 1024 * 1024);
259 
260    ctx->shader_log = cs_alloc(ctx, 1024 * 1024);
261    ctx->shader_log->mem[0] = (ctx->shader_log->iova & 0xffffffff) + sizeof(uint64_t);
262    ctx->shader_log->mem[1] = ctx->shader_log->iova >> 32;
263    ctx->shader_log->cur = ctx->shader_log->total_size;
264 
265    ctx->cp_log = cs_alloc(ctx, 8 * 1024 * 1024);
266    ((uint64_t *)ctx->cp_log->mem)[0] = ctx->cp_log->iova + 2 * sizeof(uint64_t);
267    ((uint64_t *)ctx->cp_log->mem)[1] = sizeof(uint64_t);
268    ctx->cp_log->cur = ctx->cp_log->total_size;
269 
270    struct ir3_compiler_options options{
271       .disable_cache = true,
272    };
273    ctx->compiler =
274       ir3_compiler_create(NULL, dev_id, fd_dev_info_raw(dev_id), &options);
275    ctx->compiled_shaders = _mesa_hash_table_u64_create(ctx->mem_ctx);
276 }
277 
278 static void
replay_context_finish(struct replay_context * ctx)279 replay_context_finish(struct replay_context *ctx)
280 {
281    FILE *out = fopen(ctx->output_name, "w");
282    if (!out) {
283       errx(1, "Cannot open '%s' for writing\n", ctx->output_name);
284    }
285 
286    static const uint32_t gpu_id = 660;
287    struct rd_section section_gpu_id = {.type = RD_GPU_ID,
288                                        .size = 1 * sizeof(uint32_t)};
289    fwrite(&section_gpu_id, sizeof(section_gpu_id), 1, out);
290    fwrite(&gpu_id, sizeof(uint32_t), 1, out);
291 
292    rd_write_gpu_addr_section(out, ctx->shader_log, RD_SHADER_LOG_BUFFER);
293    rd_write_gpu_addr_section(out, ctx->cp_log, RD_CP_LOG_BUFFER);
294 
295    list_for_each_entry (struct cmdstream, cs, &ctx->cs_list, link) {
296       rd_write_cs_buffer(out, cs);
297    }
298    rd_write_cs_submit(out, ctx->submit_cs);
299 
300    list_for_each_entry (struct wrbuf, wrbuf, &ctx->wrbuf_list, link) {
301       rd_write_wrbuffer(out, wrbuf);
302    }
303 
304    fclose(out);
305 }
306 
307 static void
upload_shader(struct replay_context * ctx,uint64_t id,const char * source)308 upload_shader(struct replay_context *ctx, uint64_t id, const char *source)
309 {
310    FILE *in = fmemopen((void *)source, strlen(source), "r");
311 
312    struct ir3_kernel_info info = {
313       .shader_print_buffer_iova = ctx->shader_log->iova,
314    };
315    struct ir3_shader *shader = ir3_parse_asm(ctx->compiler, &info, in);
316    assert(shader);
317 
318    fclose(in);
319 
320    uint64_t *shader_iova = ralloc(ctx->mem_ctx, uint64_t);
321    *shader_iova = pkt_blob(ctx->shader_cs, shader->variants->bin,
322                            shader->variants->info.size, 128);
323    ralloc_free(shader);
324 
325    _mesa_hash_table_u64_insert(ctx->compiled_shaders, id, shader_iova);
326 }
327 
328 static void
emit_shader_iova(struct replay_context * ctx,struct cmdstream * cs,uint64_t id)329 emit_shader_iova(struct replay_context *ctx, struct cmdstream *cs, uint64_t id)
330 {
331    uint64_t *shader_iova = (uint64_t *)
332       _mesa_hash_table_u64_search(ctx->compiled_shaders, id);
333    if (shader_iova) {
334       pkt_qw(cs, *shader_iova);
335    } else {
336       fprintf(stderr,
337               "Not override for shader at 0x%" PRIx64 ", using original\n", id);
338       pkt_qw(cs, id);
339    }
340 }
341 
/* begin_draw_state() / end_draw_state(params)
 *
 * Both macros expand to plain declarations and statements, not to a single
 * expression. They expect a struct `ctx` and a stream pointer `cs` to be
 * visible, and must be used as a pair inside a nested brace scope: the
 * begin macro declares a new `cs` that shadows the outer one, which would
 * be a redeclaration in the same scope.
 *
 * Between the two, `cs` refers to ctx.state_cs and `prev_cs` to the stream
 * that was current before. The end macro emits into `prev_cs` a
 * CP_SET_DRAW_STATE packet carrying `params` OR-ed with the number of
 * dwords recorded in between, followed by the start address of that range.
 */
#define begin_draw_state()                                                     \
   uint64_t subcs_iova_start = cs_get_cur_iova(ctx.state_cs);                  \
   struct cmdstream *prev_cs = cs;                                             \
   struct cmdstream *cs = ctx.state_cs;

#define end_draw_state(params)                                                 \
   uint64_t subcs_iova_end = cs_get_cur_iova(ctx.state_cs);                    \
   uint32_t subcs_size =                                                       \
      (subcs_iova_end - subcs_iova_start) / sizeof(uint32_t);                  \
   pkt7(prev_cs, CP_SET_DRAW_STATE, 3);                                        \
   pkt(prev_cs, (params) | subcs_size);                                        \
   pkt_qw(prev_cs, subcs_iova_start);
354 
/* begin_ib() / end_ib()
 *
 * Like the draw-state macros, these expand to declarations and statements
 * and must be used as a pair inside a nested brace scope, with a struct
 * `ctx` and a stream pointer `cs` visible.
 *
 * begin_ib() allocates a fresh 1 MiB stream and makes it the current `cs`,
 * keeping the previous one as `prev_cs`. end_ib() emits into `prev_cs` a
 * CP_INDIRECT_BUFFER packet with the new stream's address and its length
 * in dwords.
 */
#define begin_ib()                                                             \
   struct cmdstream *prev_cs = cs;                                             \
   struct cmdstream *cs = cs_alloc(&ctx, 1024 * 1024);

#define end_ib()                                                               \
   uint64_t ibcs_size = cs->cur;                                               \
   pkt7(prev_cs, CP_INDIRECT_BUFFER, 3);                                       \
   pkt_qw(prev_cs, cs->iova);                                                  \
   pkt(prev_cs, ibcs_size);
364 
365 static void
gpu_print(struct replay_context * ctx,struct cmdstream * _cs,uint64_t iova,uint32_t dwords)366 gpu_print(struct replay_context *ctx, struct cmdstream *_cs, uint64_t iova,
367           uint32_t dwords)
368 {
369    uint64_t header_iova, body_iova;
370    struct cmdstream *prev_cs = _cs;
371    struct cmdstream *cs = cs_alloc(ctx, 4096);
372    /* Commands that are being modified should be in a separate cmdstream,
373     * otherwise they would be prefetched and writes would not be visible.
374     */
375    {
376       /* Write size into entry's header */
377       pkt7(cs, CP_MEM_WRITE, 4);
378       header_iova = cs_get_cur_iova(cs);
379       pkt_qw(cs, 0xdeadbeef);
380       uint64_t size_iova = cs_get_cur_iova(cs);
381       pkt(cs, dwords * 4);
382       pkt(cs, 0);
383 
384       /* Copy the data into entry's body */
385       pkt7(cs, CP_MEMCPY, 5);
386       pkt(cs, dwords);
387       pkt_qw(cs, iova);
388       body_iova = cs_get_cur_iova(cs);
389       pkt_qw(cs, 0xdeadbeef);
390 
391       /* iova = iova + body_size + header_size */
392       pkt7(cs, CP_MEM_TO_MEM, 9);
393       pkt(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
394       pkt_qw(cs, ctx->cp_log->iova);
395       pkt_qw(cs, ctx->cp_log->iova);
396       pkt_qw(cs, size_iova);
397       pkt_qw(cs, ctx->cp_log->iova + sizeof(uint64_t));
398    }
399 
400    {
401       struct cmdstream *cs = prev_cs;
402       pkt7(cs, CP_MEM_TO_MEM, 5);
403       pkt(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
404       pkt_qw(cs, header_iova);
405       pkt_qw(cs, ctx->cp_log->iova);
406 
407       pkt7(cs, CP_MEM_TO_MEM, 7);
408       pkt(cs, CP_MEM_TO_MEM_0_DOUBLE);
409       pkt_qw(cs, body_iova);
410       pkt_qw(cs, ctx->cp_log->iova);
411       pkt_qw(cs, ctx->cp_log->iova + sizeof(uint64_t));
412 
413       pkt7(cs, CP_WAIT_MEM_WRITES, 0);
414       pkt7(cs, CP_WAIT_FOR_ME, 0);
415    }
416 
417    end_ib();
418 }
419 
420 /* This function is used to read a buffer from the GPU into a file.
421  * The buffer can optionally be cleared to 0xdeadbeef at the start
422  * of the cmdstream by setting the clear parameter to true.
423  *
424  * Note: Unlike gpu_print, this function isn't sequenced, it will
425  * read the state of the buffer at the end of the cmdstream, not
426  * at the point of the call.
427  */
428 static void
gpu_read_into_file(struct replay_context * ctx,struct cmdstream * _cs,uint64_t iova,uint64_t size,bool clear,const char * name)429 gpu_read_into_file(struct replay_context *ctx, struct cmdstream *_cs,
430                     uint64_t iova, uint64_t size, bool clear, const char *name)
431 {
432    struct wrbuf *wrbuf = (struct wrbuf *) calloc(1, sizeof(struct wrbuf));
433    wrbuf->iova = iova;
434    wrbuf->size = size;
435    wrbuf->clear = clear;
436    wrbuf->name = strdup(name);
437 
438    assert(wrbuf->iova != 0);
439 
440    list_addtail(&wrbuf->link, &ctx->wrbuf_list);
441 }