xref: /aosp_15_r20/external/mesa3d/src/asahi/compiler/agx_compile.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2018-2021 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #pragma once
7 
8 #include "compiler/nir/nir.h"
9 #include "util/u_dynarray.h"
10 #include "shader_enums.h"
11 
12 struct agx_cf_binding {
13    /* Base coefficient register */
14    unsigned cf_base;
15 
16    /* Slot being bound */
17    gl_varying_slot slot;
18 
19    /* First component bound.
20     *
21     * Must be 2 (Z) or 3 (W) if slot == VARYING_SLOT_POS.
22     */
23    unsigned offset : 2;
24 
25    /* Number of components bound */
26    unsigned count : 3;
27 
28    /* Is smooth shading enabled? If false, flat shading is used */
29    bool smooth : 1;
30 
31    /* Perspective correct interpolation */
32    bool perspective : 1;
33 };
34 
/* Upper bound on the number of coefficient register bindings. The bound is
 * conservative: the factor of 4 is due to offsets.
 *
 * TODO: maybe worth eliminating coefficient register aliasing?
 */
#define AGX_MAX_CF_BINDINGS (VARYING_SLOT_MAX * 4)
39 
40 struct agx_varyings_fs {
41    /* Number of coefficient registers used */
42    unsigned nr_cf;
43 
44    /* Number of coefficient register bindings */
45    unsigned nr_bindings;
46 
47    /* Whether gl_FragCoord.z is read */
48    bool reads_z;
49 
50    /* Coefficient register bindings */
51    struct agx_cf_binding bindings[AGX_MAX_CF_BINDINGS];
52 };
53 
54 union agx_varyings {
55    struct agx_varyings_fs fs;
56 };
57 
/* Interpolation information for varyings, stored as two 64-bit masks that
 * are indexed by I/O location.
 */
struct agx_interp_info {
   /* Mask of flat varyings */
   uint64_t flat;

   /* Mask of linear varyings */
   uint64_t linear;
};

/* Two 64-bit masks and nothing else: the structure must not contain padding */
static_assert(sizeof(struct agx_interp_info) == 16, "packed");
64 
65 struct agx_shader_info {
66    enum pipe_shader_type stage;
67 
68    union agx_varyings varyings;
69 
70    /* Number of uniforms */
71    unsigned push_count;
72 
73    /* Local memory allocation in bytes */
74    unsigned local_size;
75 
76    /* Local imageblock allocation in bytes per thread */
77    unsigned imageblock_stride;
78 
79    /* Scratch memory allocation in bytes for main/preamble respectively */
80    unsigned scratch_size, preamble_scratch_size;
81 
82    /* Size in bytes of the main sahder */
83    unsigned main_size;
84 
85    /* Does the shader have a preamble? If so, it is at offset preamble_offset.
86     * The main shader is at offset main_offset. The preamble is executed first.
87     */
88    bool has_preamble;
89    unsigned preamble_offset, main_offset;
90 
91    /* Does the shader read the tilebuffer? */
92    bool reads_tib;
93 
94    /* Does the shader require early fragment tests? */
95    bool early_fragment_tests;
96 
97    /* Does the shader potentially draw to a nonzero viewport? */
98    bool nonzero_viewport;
99 
100    /* Does the shader write layer and/or viewport index? Written together */
101    bool writes_layer_viewport;
102 
103    /* Does the shader control the sample mask? */
104    bool writes_sample_mask;
105 
106    /* Depth layout, never equal to NONE */
107    enum gl_frag_depth_layout depth_layout;
108 
109    /* Based only the compiled shader, should tag writes be disabled? This is set
110     * based on what is outputted. Note if rasterizer discard is used, that needs
111     * to disable tag writes regardless of this flag.
112     */
113    bool tag_write_disable;
114 
115    /* Shader is incompatible with triangle merging */
116    bool disable_tri_merging;
117 
118    /* Reads draw ID system value */
119    bool uses_draw_id;
120 
121    /* Reads base vertex/instance */
122    bool uses_base_param;
123 
124    /* Uses txf and hence needs a txf sampler mapped */
125    bool uses_txf;
126 
127    /* Number of 16-bit registers used by the main shader and preamble
128     * respectively.
129     */
130    unsigned nr_gprs, nr_preamble_gprs;
131 
132    /* Output mask set during driver lowering */
133    uint64_t outputs;
134 
135    /* Immediate data that must be uploaded and mapped as uniform registers */
136    unsigned immediate_base_uniform;
137    unsigned immediate_size_16;
138    uint16_t immediates[512];
139 };
140 
141 struct agx_shader_part {
142    struct agx_shader_info info;
143    void *binary;
144    size_t binary_size;
145 };
146 
/* NOTE(review): going by the name, the maximum number of render targets (RTs)
 * -- confirm at the use sites.
 */
#define AGX_MAX_RTS (8)
148 
/* Format enumeration. Every format carries an explicit value; the values 9
 * and 11 are not assigned in this enum.
 */
enum agx_format {
   AGX_FORMAT_I8 = 0,
   AGX_FORMAT_I16 = 1,
   AGX_FORMAT_I32 = 2,
   AGX_FORMAT_F16 = 3,
   AGX_FORMAT_U8NORM = 4,
   AGX_FORMAT_S8NORM = 5,
   AGX_FORMAT_U16NORM = 6,
   AGX_FORMAT_S16NORM = 7,
   AGX_FORMAT_RGB10A2 = 8,
   AGX_FORMAT_SRGBA8 = 10,
   AGX_FORMAT_RG11B10F = 12,
   AGX_FORMAT_RGB9E5 = 13,

   /* Keep last. Evaluates to one more than the largest format value (14),
    * which differs from the number of named formats because of the gaps.
    */
   AGX_NUM_FORMATS,
};
166 
/* Fragment shader specific part of the shader key */
struct agx_fs_shader_key {
   /* Tilebuffer accesses normally have to be guarded by appropriate fencing
    * instructions so that results stay correct in the presence of
    * out-of-order hardware optimizations. Specially dispatched clear shaders
    * are not subject to these conditions and can omit the wait instructions.
    *
    * Must be set for special clear shaders, and only for them.
    *
    * Must not be combined with sample mask writes (including discards) or
    * tilebuffer loads (including blending).
    */
   bool ignore_tib_dependencies;

   /* With dynamic sample shading, the fragment shader is wrapped in a loop
    * that is external to the API shader. When this bit is set, we are
    * compiling inside that sample loop: the execution nesting counter is
    * already zero and must be preserved.
    */
   bool inside_sample_loop;

   /* Base coefficient register: 0 for API shaders, nonzero for FS prolog */
   uint8_t cf_base;
};
190 
/* Device dependent part of the shader key */
struct agx_device_key {
   /* Whether the target GPU needs explicit cluster coherency for atomics.
    * Only used on G13X.
    */
   bool needs_g13x_coherency;

   /* Whether soft fault is enabled. Technically this is system-wide policy
    * set by the kernel, but functionally it is a hardware feature.
    */
   bool soft_fault;
};
202 
203 struct agx_shader_key {
204    /* Device info */
205    struct agx_device_key dev;
206 
207    /* Number of reserved preamble slots at the start */
208    unsigned reserved_preamble;
209 
210    /* Library routines to link against */
211    const nir_shader *libagx;
212 
213    /* Whether scratch memory is available in the given shader stage */
214    bool has_scratch;
215 
216    /* Whether we're compiling the helper program used for scratch allocation.
217     * This has special register allocation requirements.
218     */
219    bool is_helper;
220 
221    /* Whether the driver supports uploading constants for this shader. If
222     * false, constants will not be promoted to uniforms.
223     */
224    bool promote_constants;
225 
226    /* Set if this is a non-monolithic shader that must be linked with additional
227     * shader parts before the program can be used. This suppresses omission of
228     * `stop` instructions, which the linker must insert instead.
229     */
230    bool no_stop;
231 
232    /* Set if this is a secondary shader part (prolog or epilog). This prevents
233     * the compiler from allocating uniform registers. For example, this turns
234     * off preambles.
235     */
236    bool secondary;
237 
238    union {
239       struct agx_fs_shader_key fs;
240    };
241 };
242 
243 struct agx_interp_info agx_gather_interp_info(nir_shader *nir);
244 uint64_t agx_gather_texcoords(nir_shader *nir);
245 
246 void agx_link_libagx(nir_shader *nir, const nir_shader *libagx);
247 void agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx);
248 bool agx_nir_lower_discard_zs_emit(nir_shader *s);
249 bool agx_nir_lower_sample_mask(nir_shader *s);
250 bool agx_nir_lower_interpolation(nir_shader *s);
251 
/* Cull distance lowering, with one entry point named for the vertex shader
 * (_vs) and one for the fragment shader (_fs). The latter additionally takes
 * the number of distances.
 *
 * NOTE(review): the bool result presumably reports whether the shader was
 * changed -- confirm at the definitions.
 */
bool agx_nir_lower_cull_distance_vs(struct nir_shader *s);
bool agx_nir_lower_cull_distance_fs(struct nir_shader *s,
                                    unsigned nr_distances);
255 
256 void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
257                             struct util_debug_callback *debug,
258                             struct agx_shader_part *out);
259 
/* A pairing of a register limit with a thread limit.
 *
 * NOTE(review): presumably one occupancy point -- the maximum number of
 * threads that can be resident when shaders use at most max_registers
 * registers. Confirm against agx_occupancy_for_register_count().
 */
struct agx_occupancy {
   /* Register limit */
   unsigned max_registers;

   /* Thread limit */
   unsigned max_threads;
};
264 
/* Returns the occupancy entry for a register count.
 *
 * NOTE(review): `halfregs` is presumably counted in 16-bit registers, the
 * unit used by agx_shader_info::nr_gprs -- confirm.
 */
struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs);

/* Returns the maximum register count for the given occupancy.
 *
 * NOTE(review): the unit of `occupancy` (presumably threads) is not visible
 * in this header -- confirm.
 */
unsigned agx_max_registers_for_occupancy(unsigned occupancy);
267 
268 static const nir_shader_compiler_options agx_nir_options = {
269    .lower_fdiv = true,
270    .fuse_ffma16 = true,
271    .fuse_ffma32 = true,
272    .lower_flrp16 = true,
273    .lower_flrp32 = true,
274    .lower_fpow = true,
275    .lower_fmod = true,
276    .lower_bitfield_insert = true,
277    .lower_ifind_msb = true,
278    .lower_find_lsb = true,
279    .lower_uadd_carry = true,
280    .lower_usub_borrow = true,
281    .lower_fisnormal = true,
282    .lower_scmp = true,
283    .lower_isign = true,
284    .lower_fsign = true,
285    .lower_iabs = true,
286    .lower_fminmax_signed_zero = true,
287    .lower_fdph = true,
288    .lower_ffract = true,
289    .lower_ldexp = true,
290    .lower_pack_half_2x16 = true,
291    .lower_pack_unorm_2x16 = true,
292    .lower_pack_snorm_2x16 = true,
293    .lower_pack_unorm_4x8 = true,
294    .lower_pack_snorm_4x8 = true,
295    .lower_pack_64_2x32 = true,
296    .lower_unpack_half_2x16 = true,
297    .lower_unpack_unorm_2x16 = true,
298    .lower_unpack_snorm_2x16 = true,
299    .lower_unpack_unorm_4x8 = true,
300    .lower_unpack_snorm_4x8 = true,
301    .lower_extract_byte = true,
302    .lower_insert_byte = true,
303    .lower_insert_word = true,
304    .has_cs_global_id = true,
305    .lower_device_index_to_zero = true,
306    .lower_hadd = true,
307    .vectorize_io = true,
308    .use_interpolated_input_intrinsics = true,
309    .has_isub = true,
310    .support_16bit_alu = true,
311    .max_unroll_iterations = 32,
312    .lower_uniforms_to_ubo = true,
313    .lower_int64_options =
314       (nir_lower_int64_options) ~(nir_lower_iadd64 | nir_lower_imul_2x32_64),
315    .lower_doubles_options = (nir_lower_doubles_options)(~0),
316    .lower_fquantize2f16 = true,
317    .compact_arrays = true,
318    .discard_is_demote = true,
319    .has_ddx_intrinsics = true,
320    .scalarize_ddx = true,
321 };
322