xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/agx_linker.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2024 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "agx_linker.h"
7 #include <stddef.h>
8 #include <stdint.h>
9 #include "util/ralloc.h"
10 #include "agx_compile.h"
11 #include "agx_device.h"
12 #include "agx_pack.h"
13 #include "agx_scratch.h"
14 
/*
 * When sample shading is used with a non-monolithic fragment shader, we
 * fast-link a program with the following structure:
 *
 *    Fragment prolog;
 *
 *    for (u16 sample_bit = 1; sample_bit < (1 << # of samples);
 *         sample_bit <<= 1) {
 *       API fragment shader;
 *       Fragment epilog;
 *    }
 *
 * This means the prolog runs per-pixel but the fragment shader and epilog run
 * per-sample. To do this, we need to generate the loop on the fly. The
 * following binary sequences form the relevant loop.
 */
30 
/* clang-format off */

/*
 * Emitted once, immediately before the per-sample body.
 *
 *    mov_imm r0, 0x10000, 0b0
 */
static const uint8_t sample_loop_header[] = {
   0x62, 0x01, 0x00, 0x00, 0x01, 0x00,
};

/*
 * Byte sequence that terminates a program: a stop instruction followed by
 * 16 bytes of trap encoding. Expands to a comma-terminated initializer list so
 * it can be spliced into the tables below.
 */
#define STOP \
   /* stop */ \
   0x88, 0x00, \
   \
   /* trap */ \
   0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, \
   0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00,

/* Terminator used on its own when no loop back-edge is required */
static const uint8_t stop[] = {STOP};

/*
 * Emitted after the per-sample body when looping over several samples. Two
 * fields are left as zero here and filled in at link time; see the offsets
 * declared below.
 */
static const uint8_t sample_loop_footer[] = {
   /* bytes  0..7:  iadd r0h, 0, r0h, lsl 1 */
   0x0e, 0x02, 0x00, 0x10, 0x84, 0x00, 0x00, 0x00,

   /* bytes  8..13: while_icmp r0l, ult, r0h, 0, 1 */
   0x52, 0x2c, 0x41, 0x00, 0x00, 0x00,

   /* bytes 14..19: jmp_exec_any */
   0x00, 0xc0, 0x00, 0x00, 0x00, 0x00,

   /* bytes 20..25: pop_exec r0l, 1 */
   0x52, 0x0e, 0x00, 0x00, 0x00, 0x00,

   STOP
};

enum {
   /* Offset of the jmp_exec_any, for calculating the PC offsets */
   SAMPLE_LOOP_FOOTER_JMP_OFFS = 14,

   /* Offset in sample_loop_footer to the jmp_exec_any's target */
   SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS = 16,

   /* Offset in sample_loop_footer to the while_icmp's sample count
    * immediate, and the bit position of that immediate within the byte.
    */
   SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS = 11,
   SAMPLE_LOOP_FOOTER_COUNT_SHIFT = 4,
};
/* clang-format on */
75 
76 void
agx_fast_link(struct agx_linked_shader * linked,struct agx_device * dev,bool fragment,struct agx_shader_part * main,struct agx_shader_part * prolog,struct agx_shader_part * epilog,unsigned nr_samples_shaded)77 agx_fast_link(struct agx_linked_shader *linked, struct agx_device *dev,
78               bool fragment, struct agx_shader_part *main,
79               struct agx_shader_part *prolog, struct agx_shader_part *epilog,
80               unsigned nr_samples_shaded)
81 {
82    size_t size = 0;
83    unsigned nr_gprs = 0, scratch_size = 0;
84    bool reads_tib = false, writes_sample_mask = false,
85         disable_tri_merging = false, tag_write_disable = true;
86 
87    if (nr_samples_shaded) {
88       size += sizeof(sample_loop_header);
89 
90       if (nr_samples_shaded > 1)
91          size += sizeof(sample_loop_footer);
92       else
93          size += sizeof(stop);
94    }
95 
96    struct agx_shader_part *parts[] = {prolog, main, epilog};
97 
98    for (unsigned i = 0; i < ARRAY_SIZE(parts); ++i) {
99       struct agx_shader_part *part = parts[i];
100       if (!part)
101          continue;
102 
103       assert(part->info.main_offset == 0);
104       size += part->info.main_size;
105 
106       nr_gprs = MAX2(nr_gprs, part->info.nr_gprs);
107       scratch_size = MAX2(scratch_size, part->info.scratch_size);
108       reads_tib |= part->info.reads_tib;
109       writes_sample_mask |= part->info.writes_sample_mask;
110       disable_tri_merging |= part->info.disable_tri_merging;
111       linked->uses_base_param |= part->info.uses_base_param;
112       linked->uses_txf |= part->info.uses_txf;
113       tag_write_disable &= part->info.tag_write_disable;
114    }
115 
116    assert(size > 0 && "must stop");
117 
118    linked->bo = agx_bo_create(dev, size, 0, AGX_BO_EXEC | AGX_BO_LOW_VA,
119                               "Linked executable");
120 
121    size_t offset = 0;
122 
123    /* FS prolog happens per-pixel, outside the sample loop */
124    if (prolog) {
125       size_t sz = prolog->info.main_size;
126       memcpy((uint8_t *)linked->bo->map + offset, prolog->binary, sz);
127       offset += sz;
128    }
129 
130    if (nr_samples_shaded) {
131       memcpy((uint8_t *)linked->bo->map + offset, sample_loop_header,
132              sizeof(sample_loop_header));
133       offset += sizeof(sample_loop_header);
134    }
135 
136    size_t sample_loop_begin = offset;
137 
138    /* Main shader and epilog happen in the sample loop, so start from i=1 */
139    for (unsigned i = 1; i < ARRAY_SIZE(parts); ++i) {
140       struct agx_shader_part *part = parts[i];
141       if (!part)
142          continue;
143 
144       size_t sz = part->info.main_size;
145       memcpy((uint8_t *)linked->bo->map + offset, part->binary, sz);
146       offset += sz;
147    }
148 
149    if (nr_samples_shaded > 1) {
150       assert(sample_loop_footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] == 0);
151 
152       /* Make a stack copy of the footer so we can efficiently patch it */
153       uint8_t footer[sizeof(sample_loop_footer)];
154       memcpy(footer, sample_loop_footer, sizeof(footer));
155 
156       /* Patch in sample end */
157       uint8_t end = (1u << nr_samples_shaded) - 1;
158       footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] =
159          end << SAMPLE_LOOP_FOOTER_COUNT_SHIFT;
160 
161       /* Patch in the branch target */
162       int32_t loop_size = offset - sample_loop_begin;
163       int32_t branch_offs = -(SAMPLE_LOOP_FOOTER_JMP_OFFS + loop_size);
164       int32_t *target = (int32_t *)(footer + SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS);
165       *target = branch_offs;
166 
167       /* Copy in the patched footer */
168       memcpy((uint8_t *)linked->bo->map + offset, footer, sizeof(footer));
169       offset += sizeof(footer);
170    } else if (nr_samples_shaded) {
171       /* Just end after the first sample, no need to loop for a single sample */
172       memcpy((uint8_t *)linked->bo->map + offset, stop, sizeof(stop));
173       offset += sizeof(stop);
174    }
175 
176    assert(offset == size);
177 
178    agx_pack(&linked->shader, USC_SHADER, cfg) {
179       cfg.code = agx_usc_addr(dev, linked->bo->va->addr);
180       cfg.unk_2 = fragment ? 2 : 3;
181 
182       if (fragment)
183          cfg.loads_varyings = linked->cf.nr_bindings > 0;
184    }
185 
186    agx_pack(&linked->regs, USC_REGISTERS, cfg) {
187       cfg.register_count = nr_gprs;
188       cfg.unk_1 = fragment;
189       cfg.spill_size = scratch_size ? agx_scratch_get_bucket(scratch_size) : 0;
190    }
191 
192    if (fragment) {
193       agx_pack(&linked->fragment_props, USC_FRAGMENT_PROPERTIES, cfg) {
194          cfg.early_z_testing = !writes_sample_mask;
195          cfg.unk_4 = 0x2;
196          cfg.unk_5 = 0x0;
197       }
198 
199       agx_pack(&linked->fragment_control, FRAGMENT_CONTROL, cfg) {
200          cfg.tag_write_disable = tag_write_disable;
201          cfg.disable_tri_merging = disable_tri_merging;
202 
203          if (reads_tib && writes_sample_mask)
204             cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT_PUNCH_THROUGH;
205          else if (reads_tib)
206             cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT;
207          else if (writes_sample_mask)
208             cfg.pass_type = AGX_PASS_TYPE_PUNCH_THROUGH;
209          else
210             cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
211       }
212 
213       /* Merge the CF binding lists from the prolog to handle cull distance */
214       memcpy(&linked->cf, &main->info.varyings.fs,
215              sizeof(struct agx_varyings_fs));
216 
217       struct agx_varyings_fs *prolog_vary =
218          prolog ? &prolog->info.varyings.fs : NULL;
219 
220       if (prolog_vary && prolog_vary->nr_bindings) {
221          assert(!prolog_vary->reads_z);
222          linked->cf.nr_cf = MAX2(linked->cf.nr_cf, prolog_vary->nr_cf);
223 
224          assert(linked->cf.nr_bindings + prolog_vary->nr_bindings <=
225                    ARRAY_SIZE(linked->cf.bindings) &&
226                 "bounded by # of coeff registers");
227 
228          memcpy(linked->cf.bindings + linked->cf.nr_bindings,
229                 prolog_vary->bindings,
230                 sizeof(struct agx_cf_binding) * prolog_vary->nr_bindings);
231 
232          linked->cf.nr_bindings += prolog_vary->nr_bindings;
233       }
234 
235       agx_pack(&linked->osel, OUTPUT_SELECT, cfg) {
236          cfg.varyings = linked->cf.nr_bindings > 0;
237          cfg.frag_coord_z = linked->cf.reads_z;
238       }
239    }
240 }
241