xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_shader.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8  * -------------------------------------------------------------------------
9  *
10  * Typically, there is a one-to-one correspondence between API and HW shaders,
11  * that is, for every API shader, there is exactly one shader binary in
12  * the driver.
13  *
14  * The problem with that is that we also have to emulate some API states
15  * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16  * to deal with it are:
17  * - each shader has multiple variants for each combination of emulated states,
18  *   and the variants are compiled on demand, possibly relying on a shader
19  *   cache for good performance
20  * - patch shaders at the binary level
21  *
22  * This driver uses something completely different. The emulated states are
23  * usually implemented at the beginning or end of shaders. Therefore, we can
24  * split the shader into 3 parts:
25  * - prolog part (shader code dependent on states)
26  * - main part (the API shader)
27  * - epilog part (shader code dependent on states)
28  *
29  * Each part is compiled as a separate shader and the final binaries are
30  * concatenated. This type of shader is called non-monolithic, because it
31  * consists of multiple independent binaries. Creating a new shader variant
32  * is therefore only a concatenation of shader parts (binaries) and doesn't
33  * involve any compilation. The main shader parts are the only parts that are
34  * compiled when applications create shader objects. The prolog and epilog
35  * parts are compiled on the first use and saved, so that their binaries can
36  * be reused by many other shaders.
37  *
38  * One of the roles of the prolog part is to compute vertex buffer addresses
39  * for vertex shaders. A few of the roles of the epilog part are color buffer
40  * format conversions in pixel shaders that we have to do manually, and write
41  * tessellation factors in tessellation control shaders. The prolog and epilog
42  * have many other important responsibilities in various shader stages.
43  * They don't just "emulate legacy stuff".
44  *
45  * Monolithic shaders are shaders where the parts are combined before LLVM
46  * compilation, and the whole thing is compiled and optimized as one unit with
47  * one binary on the output. The result is the same as the non-monolithic
48  * shader, but the final code can be better, because LLVM can optimize across
49  * all shader parts. Monolithic shaders aren't usually used except for these
50  * special cases:
51  *
52  * 1) Some rarely-used states require modification of the main shader part
53  *    itself, and in such cases, only the monolithic shader variant is
54  *    compiled, and that's always done on the first use.
55  *
56  * 2) When we do cross-stage optimizations for separate shader objects and
57  *    e.g. eliminate unused shader varyings, the resulting optimized shader
58  *    variants are always compiled as monolithic shaders, and always
59  *    asynchronously (i.e. not stalling ongoing rendering). We call them
60  *    "optimized monolithic" shaders. The important property here is that
61  *    the non-monolithic unoptimized shader variant is always available for use
62  *    when the asynchronous compilation of the optimized shader is not done
63  *    yet.
64  *
65  * Starting with GFX9 chips, some shader stages are merged, and the number of
66  * shader parts per shader increased. The complete new list of shader parts is:
67  * - 1st shader: prolog part
68  * - 1st shader: main part
69  * - 2nd shader: main part
70  * - 2nd shader: epilog part
71  */
72 
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74  * geometry shaders works.
75  *
76  * Inputs and outputs between shaders are stored in a buffer. This buffer
77  * lives in LDS (typical case for tessellation), but it can also live
78  * in memory (ESGS). Each input or output has a fixed location within a vertex.
79  * The highest used input or output determines the stride between vertices.
80  *
81  * Since GS and tessellation are only possible in the OpenGL core profile,
82  * only these semantics are valid for per-vertex data:
83  *
84  *   Name             Location
85  *
86  *   POSITION         0
87  *   VAR0..31         1..32
88  *   CLIP_DIST0..1    49..50
89  *   PSIZ             51
90  *
91  * For example, a shader only writing VAR0 (location 1) has the output stride of 2.
92  *
93  * Only these semantics are valid for per-patch data:
94  *
95  *   Name             Location
96  *
97  *   TESSOUTER        0
98  *   TESSINNER        1
99  *   PATCH0..29       2..31
100  *
101  * That's how independent shaders agree on input and output locations.
102  * The si_shader_io_get_unique_index function assigns the locations.
103  *
104  * For tessellation, other required information for calculating the input and
105  * output addresses like the vertex stride, the patch stride, and the offsets
106  * where per-vertex and per-patch data start, is passed to the shader via
107  * user data SGPRs. The offsets and strides are calculated at draw time and
108  * aren't available at compile time.
109  */
110 
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113 
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-blake3.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121 
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125 
126 struct nir_shader;
127 struct nir_instr;
128 
/* Fixed limits, and the user clip plane mask (0x3F = the low 6 bits set). */
129 #define SI_NUM_INTERP     32
130 #define SI_MAX_ATTRIBS    16
131 #define SI_MAX_VS_OUTPUTS 40
132 #define SI_USER_CLIP_PLANE_MASK  0x3F
133 
/* INTERP_MODE_COLOR is given the value of INTERP_MODE_COUNT. */
134 #define INTERP_MODE_COLOR  INTERP_MODE_COUNT
135 
/* Both encodings use OFFSET 0x20 and differ only in DEFAULT_VAL (0 vs. 3).
 * NOTE(review): the 0000/0001 suffixes presumably name the default vector - confirm.
 */
136 #define SI_PS_INPUT_CNTL_0000          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
137 #define SI_PS_INPUT_CNTL_0001          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
138 #define SI_PS_INPUT_CNTL_UNUSED        SI_PS_INPUT_CNTL_0000
139 /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
140 #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
141 
/* SI_VECTOR_ARG_IS_COLOR is BITFIELD_BIT(0); the 3-bit color component is packed into
 * bits 1..3 by SI_VECTOR_ARG_COLOR_COMPONENT and read back by the GET variant.
 */
142 #define SI_VECTOR_ARG_IS_COLOR               BITFIELD_BIT(0)
143 #define SI_VECTOR_ARG_COLOR_COMPONENT(x)     (((x) & 0x7) << 1)
144 #define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7)
145 
146 /* SGPR user data indices.
 *
 * Several groups below restart from the same base value, so enumerators that
 * belong to different groups intentionally share numeric values.
 */
147 enum
148 {
149    SI_SGPR_INTERNAL_BINDINGS,
150    SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
151    SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
152    SI_SGPR_SAMPLERS_AND_IMAGES,
153    SI_NUM_RESOURCE_SGPRS,
154 
155    /* API VS, TES without GS, GS copy shader */
156    SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
157    SI_NUM_VS_STATE_RESOURCE_SGPRS,
158 
159    /* all VS variants */
160    SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
161    SI_SGPR_DRAWID,
162    SI_SGPR_START_INSTANCE,
163    SI_VS_NUM_USER_SGPR,
164 
   /* Has the same index as SI_SGPR_CONST_AND_SHADER_BUFFERS. */
165    SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
166 
167    /* TES */
168    SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
169    SI_SGPR_TES_OFFCHIP_ADDR,
170    SI_TES_NUM_USER_SGPR,
171 
172    /* GFX6-8: TCS only */
173    GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
174    GFX6_SGPR_TCS_OFFCHIP_ADDR,
175    GFX6_SGPR_TCS_IN_LAYOUT,
176    GFX6_TCS_NUM_USER_SGPR,
177 
178    /* GFX9: Merged LS-HS (VS-TCS) only. */
179    GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
180    GFX9_SGPR_TCS_OFFCHIP_ADDR,
181    GFX9_TCS_NUM_USER_SGPR,
182 
183    /* GS limits */
184    GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
185    SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
186 
   /* Starts after whichever of the VS and TES user SGPR ranges is larger. */
187    GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
188    GFX9_SGPR_ATTRIBUTE_RING_ADDR,
189    GFX9_GS_NUM_USER_SGPR,
190 
191    /* PS only */
192    SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
193    SI_PS_NUM_USER_SGPR,
194 
195    /* The value has to be 12, because the hw requires that descriptors
196     * are aligned to 4 SGPRs.
197     */
198    SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
199 };
200 
201 /* LLVM function parameter indices.
 *
 * The PS parameters are numbered consecutively, starting right after the
 * SI_NUM_RESOURCE_PARAMS resource parameters.
 */
202 enum
203 {
204    SI_NUM_RESOURCE_PARAMS = 4,
205 
206    /* PS only parameters */
207    SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
208    SI_PARAM_PRIM_MASK,
209    SI_PARAM_PERSP_SAMPLE,
210    SI_PARAM_PERSP_CENTER,
211    SI_PARAM_PERSP_CENTROID,
212    SI_PARAM_PERSP_PULL_MODEL,
213    SI_PARAM_LINEAR_SAMPLE,
214    SI_PARAM_LINEAR_CENTER,
215    SI_PARAM_LINEAR_CENTROID,
216    SI_PARAM_LINE_STIPPLE_TEX,
217    SI_PARAM_POS_X_FLOAT,
218    SI_PARAM_POS_Y_FLOAT,
219    SI_PARAM_POS_Z_FLOAT,
220    SI_PARAM_POS_W_FLOAT,
221    SI_PARAM_FRONT_FACE,
222    SI_PARAM_ANCILLARY,
223    SI_PARAM_SAMPLE_COVERAGE,
224    SI_PARAM_POS_FIXED_PT,
225 
226    SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +1 to count POS_FIXED_PT itself, +8 for COLOR[0..1] */
227 };
228 
229 /* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
230  * accessible in the shader via vs_state_bits in VS, TES, and GS.
231  */
232 #define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0
233 #define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
234 #define VS_STATE_INDEXED__SHIFT              1
235 #define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */
236 
237 /* These fields are only set in current_gs_state in si_context, and they are accessible
238  * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
239  */
240 /* bit gap: none of the fields defined here uses bits 2..12 */
241 /* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
242  * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
243  * Only used by GFX9+ to compute LDS addresses of GS inputs.
244  */
245 #define GS_STATE_NUM_ES_OUTPUTS__SHIFT          13
246 #define GS_STATE_NUM_ES_OUTPUTS__MASK           0x3f
247 /* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
248  * where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
249  * Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
250  * Expand to FP32 like this: ((0x70 | value) << 23);
251  * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15), which is always a negative
252  * exponent and it's equal to 1/2^(15 - value).
253  */
254 #define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 19
255 #define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK  0xf
256 #define GS_STATE_SMALL_PRIM_PRECISION__SHIFT    23
257 #define GS_STATE_SMALL_PRIM_PRECISION__MASK     0xf
258 #define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 27
259 #define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK  0x1
260 #define GS_STATE_PROVOKING_VTX_FIRST__SHIFT     28
261 #define GS_STATE_PROVOKING_VTX_FIRST__MASK      0x1
262 #define GS_STATE_OUTPRIM__SHIFT                 29
263 #define GS_STATE_OUTPRIM__MASK                  0x3
264 #define GS_STATE_PIPELINE_STATS_EMU__SHIFT      31
265 #define GS_STATE_PIPELINE_STATS_EMU__MASK       0x1
266 
/* Field helpers. A field is named by its prefix and described by <field>__SHIFT and an
 * unshifted <field>__MASK.
 * ENCODE_FIELD masks the value and shifts it into position.
 * CLEAR_FIELD yields a word with every bit set except the bits of the field.
 */
267 #define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT)
268 #define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT))
269 
270 /* This is called by functions that change states.
 * The assert checks that the value fits in the field. "value" is expanded more than
 * once, so it should not have side effects.
 */
271 #define SET_FIELD(var, field, value) do { \
272    assert((value) == ((unsigned)(value) & field##__MASK)); \
273    (var) &= CLEAR_FIELD(field); \
274    (var) |= ENCODE_FIELD(field, value); \
275 } while (0)
276 
277 /* This is called during shader compilation and returns LLVMValueRef.
 * The last argument is the population count of the mask (presumably the field width in
 * bits - confirm against si_unpack_param). That equals the width only for masks made of
 * contiguous low bits, which holds for every mask defined above.
 */
278 #define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \
279                                              util_bitcount(field##__MASK))
280 
/* SGPR counts for the SI_VS_BLIT_SGPRS_* configurations and their upper bound. */
281 enum
282 {
283    /* These represent the number of SGPRs the shader uses. */
284    SI_VS_BLIT_SGPRS_POS = 3,
285    SI_VS_BLIT_SGPRS_POS_COLOR = 7,
286    SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
287 
288    MAX_SI_VS_BLIT_SGPRS = 10, /* largest count above (9) +1 for the attribute ring address */
289 };
290 
/* NGG culling option bits. Bits 0..4 are single flags; the 8-bit clip plane enable mask
 * occupies bits 5..12, so the complete set fits in 13 bits.
 */
291 #define SI_NGG_CULL_TRIANGLES                (1 << 0)   /* this implies W, view.xy, and small prim culling */
292 #define SI_NGG_CULL_BACK_FACE                (1 << 1)   /* back faces */
293 #define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
294 #define SI_NGG_CULL_LINES                    (1 << 3)   /* the primitive type is lines */
295 #define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4)   /* cull small lines according to the diamond exit rule */
296 #define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
297 #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)
298 
/* One shader profile entry: a BLAKE3 hash paired with an options word.
 * NOTE(review): "options" presumably holds SI_PROFILE_* bits and the hash presumably
 * identifies the shader the options apply to - confirm at the definition of the table.
 */
299 struct si_shader_profile {
300    uint32_t blake3[BLAKE3_OUT_LEN32];
301    uint32_t options;
302 };
303 
/* The profile table is defined elsewhere; its length is queried with
 * si_get_num_shader_profiles() because the array is declared here without a size.
 */
304 extern struct si_shader_profile si_shader_profiles[];
305 unsigned si_get_num_shader_profiles(void);
306 
/* SI_PROFILE_* option bits (si_shader_info::options is a bitmask of these). */
307 #define SI_PROFILE_WAVE32                    (1 << 0)
308 #define SI_PROFILE_GFX10_WAVE64              (1 << 1)
309 /* bit gap: bit 2 is not assigned */
310 #define SI_PROFILE_VS_NO_BINNING             (1 << 3)
311 #define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING  (1 << 4)
312 #define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)
313 #define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS   (1 << 6)
314 
/* Kinds of shader dump output, from the shader key through the IR stages to the final
 * asm and statistics.
 */
315 enum si_shader_dump_type {
316    SI_DUMP_SHADER_KEY,
317    SI_DUMP_INIT_NIR,       /* initial input NIR when shaders are created (before lowering) */
318    SI_DUMP_NIR,            /* final NIR after lowering when shader variants are created */
319    SI_DUMP_INIT_LLVM_IR,   /* initial LLVM IR before optimizations */
320    SI_DUMP_LLVM_IR,        /* final LLVM IR */
321    SI_DUMP_INIT_ACO_IR,    /* initial ACO IR before optimizations */
322    SI_DUMP_ACO_IR,         /* final ACO IR */
323    SI_DUMP_ASM,            /* final asm shaders */
324    SI_DUMP_STATS,          /* print statistics as shader-db */
325    SI_DUMP_ALWAYS,         /* NOTE(review): presumably "dump unconditionally" - confirm at the users */
326 };
327 
/* Unique slot indices for per-vertex inputs/outputs. Two ranges overlap on purpose:
 * the 16-bit GLES varyings and the legacy GL-only varyings both start at 33.
 */
328 enum {
329    SI_UNIQUE_SLOT_POS = 0,
330 
331    /* Some shader stages use the highest used IO index
332     * to determine the size to allocate for inputs/outputs
333     * (in LDS, tess and GS rings), so VARn should be placed right
334     * after POSITION to make that size as small as possible.
335     */
336    SI_UNIQUE_SLOT_VAR0 = 1, /* VAR0..31 occupy slots 1..32 */
337 
338    /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
339     * legacy desktop GL varyings because they are mutually exclusive.
340     */
341    SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 16-bit VAR0..15 occupy slots 33..48 */
342 
343    /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
344    SI_UNIQUE_SLOT_FOGC = 33,
345    SI_UNIQUE_SLOT_COL0,
346    SI_UNIQUE_SLOT_COL1,
347    SI_UNIQUE_SLOT_BFC0,
348    SI_UNIQUE_SLOT_BFC1,
349    SI_UNIQUE_SLOT_TEX0,
350    SI_UNIQUE_SLOT_TEX1,
351    SI_UNIQUE_SLOT_TEX2,
352    SI_UNIQUE_SLOT_TEX3,
353    SI_UNIQUE_SLOT_TEX4,
354    SI_UNIQUE_SLOT_TEX5,
355    SI_UNIQUE_SLOT_TEX6,
356    SI_UNIQUE_SLOT_TEX7,
357    SI_UNIQUE_SLOT_CLIP_VERTEX, /* = 46, the last legacy slot */
358 
359    /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
360    SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
361    SI_UNIQUE_SLOT_CLIP_DIST1,
362    SI_UNIQUE_SLOT_PSIZ,
363    /* These can't be written by LS, HS, and ES. */
364    SI_UNIQUE_SLOT_LAYER,
365    SI_UNIQUE_SLOT_VIEWPORT,
366    SI_UNIQUE_SLOT_PRIMITIVE_ID, /* = 54, the highest slot index */
367 };
368 
369 /**
370  * For VS shader keys, describe any fixups required for vertex fetch.
371  *
372  * \ref log_size, \ref format, and the number of channels are interpreted as
373  * by \ref ac_build_opencoded_load_format.
374  *
375  * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
376  * impossible format and indicates that no fixup is needed (just use
377  * buffer_load_format_xyzw).
 *
 * The four bit-fields add up to 8 bits; \ref bits gives access to them as one byte.
378  */
379 union si_vs_fix_fetch {
380    struct {
381       uint8_t log_size : 2;        /* log2 of the bytes per channel (1, 2, 4, or 8) */
382       uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
383       uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
384       uint8_t reverse : 1;         /* reverse XYZ channels */
385    } u;
386    uint8_t bits;
387 };
388 
389 struct si_shader;
390 
391 /* State of the context creating the shader object.
 * It carries the compiler handle, the debug callback, and a debug-context flag.
 */
392 struct si_compiler_ctx_state {
393    /* Should only be used by si_init_shader_selector_async and
394     * si_build_shader_variant if thread_index == -1 (non-threaded). */
395    struct ac_llvm_compiler *compiler;
396 
397    /* Used if thread_index == -1 or if debug.async is true. */
398    struct util_debug_callback debug;
399 
400    /* Used for creating the log string for gallium/ddebug. */
401    bool is_debug_context;
402 };
403 
/* Color output type. The 4 values fit in 2 bits; si_shader_info::output_color_types
 * stores one such bit pair per color output.
 */
404 enum si_color_output_type {
405    SI_TYPE_ANY32,
406    SI_TYPE_FLOAT16,
407    SI_TYPE_INT16,
408    SI_TYPE_UINT16,
409 };
410 
/* Description of one shader input: four byte-sized properties in an anonymous struct,
 * overlaid with a 32-bit member so that the union is 4-byte aligned.
 */
411 union si_input_info {
412    struct {
413       uint8_t semantic;
414       uint8_t interpolate;
415       uint8_t fp16_lo_hi_valid;
416       uint8_t usage_mask;
417    };
418    uint32_t _unused; /* this just forces 4-byte alignment */
419 };
420 
/* Driver-side description of a shader: input/output layout, usage flags for system
 * values and features, and output properties. The common shader_info is embedded as
 * "base". A si_shader_selector embeds one of these as "info".
 */
421 struct si_shader_info {
422    shader_info base;
423 
424    uint32_t options; /* bitmask of SI_PROFILE_* */
425 
   /* Per-input and per-output properties. The arrays are sized for the maximum number
    * of shader inputs/outputs; num_inputs and num_outputs presumably give the used
    * entry counts - confirm at the code that fills them.
    */
426    uint8_t num_inputs;
427    uint8_t num_outputs;
428    union si_input_info input[PIPE_MAX_SHADER_INPUTS];
429    uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
430    uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
431    uint8_t output_readmask[PIPE_MAX_SHADER_OUTPUTS];
432    uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
433    uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
434 
435    uint8_t num_vs_inputs;
436    uint8_t num_vbos_in_user_sgprs;
437    uint8_t num_stream_output_components[4];
438    uint16_t enabled_streamout_buffer_mask;
439 
440    uint64_t inputs_read; /* "get_unique_index" bits */
441    uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
442 
443    uint64_t outputs_written_before_tes_gs; /* "get_unique_index" bits */
444    uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
445    uint32_t patch_outputs_written;     /* "get_unique_index_patch" bits */
446 
447    uint8_t clipdist_mask;
448    uint8_t culldist_mask;
449 
450    uint16_t esgs_vertex_stride;
451    uint16_t gsvs_vertex_size;
452    uint8_t gs_input_verts_per_prim;
453    unsigned max_gsvs_emit_size;
454 
455    /* Set 0xf or 0x0 (4 bits) per each written output.
456     * ANDed with spi_shader_col_format.
457     */
458    unsigned colors_written_4bit;
459 
460    int constbuf0_num_slots;
461    uint num_memory_stores;
462    uint8_t color_attr_index[2];
463    uint8_t color_interpolate[2];
464    uint8_t color_interpolate_loc[2];
465    uint8_t colors_read; /**< which color components are read by the FS */
466    uint8_t colors_written;
467    uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
468    bool color0_writes_all_cbufs; /**< gl_FragColor */
469    bool reads_samplemask;   /**< does fragment shader read sample mask? */
470    bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
471    bool writes_z;           /**< does fragment shader write Z value? */
472    bool writes_stencil;     /**< does fragment shader write stencil value? */
473    bool writes_samplemask;  /**< does fragment shader write sample mask? */
474    bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
475    bool uses_interp_color;
476    bool uses_persp_center_color;
477    bool uses_persp_centroid_color;
478    bool uses_persp_sample_color;
479    bool uses_persp_center;
480    bool uses_persp_centroid;
481    bool uses_persp_sample;
482    bool uses_linear_center;
483    bool uses_linear_centroid;
484    bool uses_linear_sample;
485    bool uses_interp_at_sample;
486    bool uses_instanceid;
487    bool uses_base_vertex;
488    bool uses_base_instance;
489    bool uses_drawid;
490    bool uses_primid;
491    bool uses_frontface;
492    bool uses_invocationid;
493    bool uses_thread_id[3];
494    bool uses_block_id[3];
495    bool uses_variable_block_size;
496    bool uses_grid_size;
497    bool uses_tg_size;
498    bool uses_atomic_ordered_add;
499    bool writes_position;
500    bool writes_psize;
501    bool writes_clipvertex;
502    bool writes_primid;
503    bool writes_viewport_index;
504    bool writes_layer;
505    bool uses_bindless_samplers;
506    bool uses_bindless_images;
507    bool uses_indirect_descriptor;
508    bool has_divergent_loop;
509    bool uses_sampleid;
510    bool uses_layer_id;
511    bool has_non_uniform_tex_access;
512 
513    bool uses_vmem_sampler_or_bvh;
514    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
515 
516    /** Whether all codepaths write tess factors in all invocations. */
517    bool tessfactors_are_def_in_all_invocs;
518 
519    /* A flag to check whether vrs2x2 can be enabled to reduce the number of
520     * fragment shader invocations when flat shading.
521     */
522    bool allow_flat_shading;
523 
524    /* Optimization: if the texture bound to this texunit has been cleared to 1,
525     * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
526     * value is 0xff (undetermined) and can be later changed to 0 (= false) or
527     * texunit + 1.
528     */
529    uint8_t writes_1_if_tex_is_1;
530 
531    /* frag coord and sample pos per component read mask. */
532    uint8_t reads_frag_coord_mask;
533    uint8_t reads_sample_pos_mask;
534 };
535 
536 /* A shader selector is a gallium CSO and contains shader variants and
537  * binaries for one NIR program. This can be shared by multiple contexts.
538  */
539 struct si_shader_selector {
540    struct util_live_shader base;
541    struct si_screen *screen;
542    struct util_queue_fence ready;
543    struct si_compiler_ctx_state compiler_ctx_state;
544    gl_shader_stage stage;
545 
   /* Variant storage: "keys" and "variants" with a used count and a maximum count.
    * NOTE(review): "mutex" presumably guards this variant list - confirm at the users.
    */
546    simple_mtx_t mutex;
547    union si_shader_key *keys;
548    unsigned variants_count;
549    unsigned variants_max_count;
550    struct si_shader **variants;
551 
552    /* The compiled NIR shader without a prolog and/or epilog (not
553     * uploaded to a buffer object).
554     *
555     * [0] for wave32, [1] for wave64.
556     */
557    struct si_shader *main_shader_part[2];
558    struct si_shader *main_shader_part_ls[2];     /* as_ls is set in the key */
559    struct si_shader *main_shader_part_es;        /* as_es && !as_ngg in the key */
560    struct si_shader *main_shader_part_ngg[2];    /* !as_es && as_ngg in the key */
561    struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */
562 
   /* NOTE(review): nir_binary/nir_size presumably describe a serialized copy of the
    * NIR shader - confirm where they are written.
    */
563    struct nir_shader *nir;
564    void *nir_binary;
565    unsigned nir_size;
566 
567    struct si_shader_info info;
568 
569    enum pipe_shader_type pipe_shader_type;
570    uint8_t const_and_shader_buf_descriptors_index;
571    uint8_t sampler_and_images_descriptors_index;
572    uint8_t cs_shaderbufs_sgpr_index;
573    uint8_t cs_num_shaderbufs_in_user_sgprs;
574    uint8_t cs_images_sgpr_index;
575    uint8_t cs_images_num_sgprs;
576    uint8_t cs_num_images_in_user_sgprs;
577    unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
578    enum mesa_prim rast_prim;
579 
580    /* GS parameters. */
581    bool tess_turns_off_ngg;
582 
583    /* bitmasks of used descriptor slots */
584    uint64_t active_const_and_shader_buffers;
585    uint64_t active_samplers_and_images;
586 };
587 
588 /* Valid shader configurations:
589  *
590  * API shaders           VS | TCS | TES | GS |pass| PS
591  * are compiled as:         |     |     |    |thru|
592  *                          |     |     |    |    |
593  * Only VS & PS:         VS |     |     |    |    | PS
594  * GFX6     - with GS:   ES |     |     | GS | VS | PS
595  *          - with tess: LS | HS  | VS  |    |    | PS
596  *          - with both: LS | HS  | ES  | GS | VS | PS
597  * GFX9     - with GS:   -> |     |     | GS | VS | PS
598  *          - with tess: -> | HS  | VS  |    |    | PS
599  *          - with both: -> | HS  | ->  | GS | VS | PS
600  *                          |     |     |    |    |
601  * NGG      - VS & PS:   GS |     |     |    |    | PS
602  * (GFX10+) - with GS:   -> |     |     | GS |    | PS
603  *          - with tess: -> | HS  | GS  |    |    | PS
604  *          - with both: -> | HS  | ->  | GS |    | PS
605  *
606  * -> = merged with the next stage
607  */
608 
609 /* Use the byte alignment for all following structure members for optimal
610  * shader key memory footprint.
611  */
612 #pragma pack(push, 1)
613 
614 /* Common PS bits between the shader key and the prolog key.
 * Nine 1-bit flags plus a 3-bit field: 12 bits of bit-fields in total.
 */
615 struct si_ps_prolog_bits {
616    unsigned color_two_side : 1;
617    unsigned flatshade_colors : 1;
618    unsigned poly_stipple : 1;
619    unsigned force_persp_sample_interp : 1;
620    unsigned force_linear_sample_interp : 1;
621    unsigned force_persp_center_interp : 1;
622    unsigned force_linear_center_interp : 1;
623    unsigned bc_optimize_for_persp : 1;
624    unsigned bc_optimize_for_linear : 1;
625    unsigned samplemask_log_ps_iter : 3;
626 };
627 
628 /* Common PS bits between the shader key and the epilog key.
 * One full-width word (spi_shader_col_format) followed by 28 bits of bit-fields.
 */
629 struct si_ps_epilog_bits {
630    unsigned spi_shader_col_format;
631    unsigned color_is_int8 : 8;
632    unsigned color_is_int10 : 8;
633    unsigned last_cbuf : 3;
634    unsigned alpha_func : 3;
635    unsigned alpha_to_one : 1;
636    unsigned alpha_to_coverage_via_mrtz : 1;  /* gfx11+ */
637    unsigned clamp_color : 1;
638    unsigned dual_src_blend_swizzle : 1;      /* gfx11+ */
639    unsigned rbplus_depth_only_opt:1;
640    unsigned kill_samplemask:1;
641 };
642 
/* Key describing a PS prolog or a PS epilog shader part. The two layouts share storage;
 * each starts with the matching si_ps_*_bits "states" member.
 */
643 union si_shader_part_key {
644    struct {
645       struct si_ps_prolog_bits states;
646       unsigned use_aco : 1;
647       unsigned wave32 : 1;
648       unsigned num_input_sgprs : 6;
649       /* Color interpolation and two-side color selection. */
650       unsigned colors_read : 8;       /* color input components read */
651       unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
652       unsigned num_fragcoord_components : 3;
653       unsigned wqm : 1;
654       char color_attr_index[2];
655       signed char color_interp_vgpr_index[2]; /* -1 == constant */
656    } ps_prolog;
657    struct {
658       struct si_ps_epilog_bits states;
659       unsigned use_aco : 1;
660       unsigned wave32 : 1;
661       unsigned uses_discard : 1;
662       unsigned colors_written : 8;
663       unsigned color_types : 16;
664       unsigned writes_z : 1;
665       unsigned writes_stencil : 1;
666       unsigned writes_samplemask : 1;
667    } ps_epilog;
668 };
669 
670 /* The shader key for geometry stages (VS, TCS, TES, GS).
 *
 * It is organized in groups: "part" (selector of the merged previous stage), the
 * as_es/as_ls/as_ngg stage flags, "mono" (monolithic compilation only), and "opt"
 * (optimizations for asynchronous compilation only).
 */
671 struct si_shader_key_ge {
672    /* Prolog and epilog flags. */
673    union {
674       struct {
675          struct si_shader_selector *ls;      /* for merged LS-HS */
676       } tcs; /* tessellation control shader */
677       struct {
678          struct si_shader_selector *es;      /* for merged ES-GS */
679       } gs;
680    } part;
681 
682    /* These three are initially set according to the NEXT_SHADER property,
683     * or guessed if the property doesn't seem correct.
684     */
685    unsigned as_es : 1;  /* whether it's a shader before GS */
686    unsigned as_ls : 1;  /* whether it's VS before TCS */
687    unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
688                            also set for the stage right before GS */
689 
690    /* Flags for monolithic compilation only. */
691    struct {
692       /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
693        *   divisor is 0.
694        * - If "is_one" has a bit set, the instance divisor is 1.
695        * - If "is_fetched" has a bit set, the instance divisor will be loaded
696        *   from the constant buffer.
697        */
698       uint16_t instance_divisor_is_one;     /* bitmask of inputs */
699       uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
700 
701       /* Whether fetch should be opencoded according to vs_fix_fetch.
702        * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
703        * with minimal fixups is used. */
704       uint16_t vs_fetch_opencode;
705       union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
706 
      /* The two flags share storage because they apply to different stages. */
707       union {
708          /* When PS needs PrimID and GS is disabled. */
709          unsigned vs_export_prim_id : 1;    /* VS and TES only */
710          unsigned gs_tri_strip_adj_fix : 1; /* GS only */
711       } u;
712 
713       /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */
714       unsigned remove_streamout : 1;
715    } mono;
716 
717    /* Optimization flags for asynchronous compilation only. */
718    struct {
719       /* For HW VS (it can be VS, TES, GS) */
720       uint64_t kill_outputs; /* "get_unique_index" bits */
721       unsigned kill_clip_distances : 8;
722       unsigned kill_pointsize : 1;
723       unsigned kill_layer : 1;
      /* Distinct from mono.remove_streamout above. */
724       unsigned remove_streamout : 1;
725 
726       /* For NGG VS and TES. */
727       unsigned ngg_culling : 13; /* SI_NGG_CULL_* (those bits span bits 0..12) */
728 
729 
730       /* For shaders where monolithic variants have better code.
731        *
732        * This is a flag that has no effect on code generation,
733        * but forces monolithic shaders to be used as soon as
734        * possible, because it's in the "opt" group.
735        */
736       unsigned prefer_mono : 1;
737 
738       /* VS and TCS have the same number of patch vertices. */
739       unsigned same_patch_vertices:1;
740 
741       /* For TCS. */
742       unsigned tes_prim_mode : 3;
743       unsigned tes_reads_tess_factors : 1;
744 
745       unsigned inline_uniforms:1;
746 
747       /* This must be kept last to limit the number of variants
748        * depending only on the uniform values.
749        */
750       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
751    } opt;
752 };
753 
/* The shader key for pixel shaders.
 *
 * It is organized in groups: "part" (prolog and epilog state bits), "mono" (monolithic
 * compilation only), and "opt" (optimizations for asynchronous compilation only).
 */
754 struct si_shader_key_ps {
755    struct {
756       /* Prolog and epilog flags. */
757       struct si_ps_prolog_bits prolog;
758       struct si_ps_epilog_bits epilog;
759    } part;
760 
761    /* Flags for monolithic compilation only. */
762    struct {
763       unsigned poly_line_smoothing : 1;
764       unsigned point_smoothing : 1;
765       unsigned interpolate_at_sample_force_center : 1;
766       unsigned fbfetch_msaa : 1;
767       unsigned fbfetch_is_1D : 1;
768       unsigned fbfetch_layered : 1;
769    } mono;
770 
771    /* Optimization flags for asynchronous compilation only. */
772    struct {
773       /* For shaders where monolithic variants have better code.
774        *
775        * This is a flag that has no effect on code generation,
776        * but forces monolithic shaders to be used as soon as
777        * possible, because it's in the "opt" group.
778        */
779       unsigned prefer_mono : 1;
780       unsigned inline_uniforms:1;
781 
782       /* This eliminates the FRONT_FACE input VGPR as well as shader code using it.
       *
       * The field must be able to hold -1, so it is declared "signed int": whether a
       * plain "int" bit-field is signed is implementation-defined in C. The width and
       * position of the field are unchanged.
       */
783       signed int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
784 
785       /* This must be kept last to limit the number of variants
786        * depending only on the uniform values.
787        */
788       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
789    } opt;
790 };
791 
/* A complete shader key: the geometry-stage layout and the pixel-shader layout share
 * storage.
 */
792 union si_shader_key {
793    struct si_shader_key_ge ge; /* geometry engine shaders */
794    struct si_shader_key_ps ps;
795 };
796 
797 /* Restore the pack alignment to default. */
798 #pragma pack(pop)
799 
800 /* GCN-specific shader info.
 * It holds per-varying-slot VS output tables, the PS input list, input register counts,
 * and a few usage flags and counters.
 */
801 struct si_shader_binary_info {
802    uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
803    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
804    union si_input_info ps_inputs[SI_NUM_INTERP];
805    uint8_t num_ps_inputs;
806    uint8_t ps_colors_read;
807    uint8_t num_input_sgprs;
808    uint8_t num_input_vgprs;
809    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
810    bool uses_vmem_sampler_or_bvh;
811    uint8_t num_fragcoord_components;
812    bool uses_instanceid;
813    uint8_t nr_pos_exports;
814    uint8_t nr_param_exports;
815    unsigned private_mem_vgprs;
816    unsigned max_simd_waves;
817 };
818 
/* Format of the code stored in struct si_shader_binary. */
enum si_shader_binary_type {
   SI_SHADER_BINARY_ELF = 0,
   SI_SHADER_BINARY_RAW = 1,
};
823 
824 struct si_shader_binary {
825    enum si_shader_binary_type type;
826 
827    /* Depends on binary type, either ELF or raw buffer. */
828    const char *code_buffer;
829    size_t code_size;
830    uint32_t exec_size;
831 
832    char *uploaded_code;
833    size_t uploaded_code_size;
834 
835    char *llvm_ir_string;
836 
837    const char *disasm_string;
838    size_t disasm_size;
839 
840    const unsigned *symbols;
841    unsigned num_symbols;
842 };
843 
/* Subgroup sizing values filled in by gfx9_get_gs_info(). */
struct gfx9_gs_info {
   /* Per-subgroup counts. */
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;

   /* Size of the ESGS ring, in bytes. */
   unsigned esgs_ring_size;
};
851 
852 struct si_shader {
853    struct si_pm4_state pm4; /* base class */
854    struct si_compiler_ctx_state compiler_ctx_state;
855 
856    struct si_shader_selector *selector;
857    struct si_shader_selector *previous_stage_sel; /* for refcounting */
858    struct si_shader *next_shader; /* Only used during compilation of LS and ES when merged. */
859 
860    struct si_shader_part *prolog;
861    struct si_shader *previous_stage; /* for GFX9 */
862    struct si_shader_part *epilog;
863    struct si_shader *gs_copy_shader;
864 
865    struct si_resource *bo;
866    /* gpu_address should be bo->gpu_address except if SQTT is
867     * in use.
868     */
869    uint64_t gpu_address;
870    /* Only used on GFX6-10 where the scratch address must be inserted into the shader binary.
871     * This is the scratch address that the current shader binary contains.
872     */
873    uint64_t scratch_va;
874    union si_shader_key key;
875    struct util_queue_fence ready;
876    bool compilation_failed;
877    bool is_monolithic;
878    bool is_optimized;
879    bool is_binary_shared;
880    bool is_gs_copy_shader;
881    uint8_t wave_size;
882    unsigned complete_shader_binary_size;
883 
884    /* The following data is all that's needed for binary shaders. */
885    struct si_shader_binary binary;
886    struct ac_shader_config config;
887    struct si_shader_binary_info info;
888 
889    /* SI_SGPR_VS_STATE_BITS */
890    bool uses_vs_state_provoking_vertex;
891    bool uses_gs_state_outprim;
892 
893    bool uses_base_instance;
894 
895    /* Shader key + LLVM IR + disassembly + statistics.
896     * Generated for debug contexts only.
897     */
898    char *shader_log;
899    size_t shader_log_size;
900 
901    struct gfx9_gs_info gs_info;
902 
903    /* Precomputed register values. */
904    union {
905       struct {
906          unsigned vgt_gsvs_ring_offset_1;
907          unsigned vgt_gsvs_ring_offset_2;
908          unsigned vgt_gsvs_ring_offset_3;
909          unsigned vgt_gsvs_ring_itemsize;
910          unsigned vgt_gs_max_vert_out;
911          unsigned vgt_gs_vert_itemsize;
912          unsigned vgt_gs_vert_itemsize_1;
913          unsigned vgt_gs_vert_itemsize_2;
914          unsigned vgt_gs_vert_itemsize_3;
915          unsigned vgt_gs_instance_cnt;
916          unsigned vgt_gs_onchip_cntl;
917          unsigned vgt_gs_max_prims_per_subgroup;
918          unsigned vgt_esgs_ring_itemsize;
919          unsigned spi_shader_pgm_rsrc3_gs;
920          unsigned spi_shader_pgm_rsrc4_gs;
921       } gs;
922 
923       struct {
924          /* Computed by gfx10_ngg_calculate_subgroup_info. */
925          uint16_t ngg_emit_size; /* in dwords */
926          uint16_t hw_max_esverts;
927          uint16_t max_gsprims;
928          uint16_t max_out_verts;
929          bool max_vert_out_per_gs_instance;
930          /* Register values. */
931          unsigned ge_max_output_per_subgroup;
932          unsigned ge_ngg_subgrp_cntl;
933          unsigned vgt_primitiveid_en;
934          unsigned vgt_gs_onchip_cntl;
935          unsigned vgt_gs_instance_cnt;
936          unsigned esgs_vertex_stride;
937          unsigned spi_vs_out_config;
938          unsigned spi_shader_pos_format;
939          unsigned pa_cl_vte_cntl;
940          unsigned vgt_gs_max_vert_out; /* for API GS */
941          unsigned ge_pc_alloc;         /* uconfig register */
942          unsigned spi_shader_pgm_rsrc3_gs;
943          unsigned spi_shader_pgm_rsrc4_gs;
944          unsigned vgt_shader_stages_en;
945       } ngg;
946 
947       struct {
948          unsigned vgt_gs_mode;
949          unsigned vgt_primitiveid_en;
950          unsigned vgt_reuse_off;
951          unsigned spi_vs_out_config;
952          unsigned spi_shader_pos_format;
953          unsigned pa_cl_vte_cntl;
954          unsigned ge_pc_alloc; /* uconfig register */
955       } vs;
956 
957       struct {
958          unsigned spi_ps_input_ena;
959          unsigned spi_ps_input_addr;
960          unsigned spi_baryc_cntl;
961          unsigned spi_ps_in_control;
962          unsigned spi_shader_z_format;
963          unsigned spi_shader_col_format;
964          unsigned cb_shader_mask;
965          unsigned db_shader_control;
966          unsigned num_interp;
967          unsigned spi_gs_out_config_ps;
968          unsigned pa_sc_hisz_control;
969          bool writes_samplemask;
970       } ps;
971    };
972 
973    /* Precomputed register values. */
974    unsigned vgt_tf_param;                /* VGT_TF_PARAM */
975    unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
976    unsigned pa_cl_vs_out_cntl;
977    unsigned ge_cntl;
978 };
979 
980 struct si_shader_part {
981    struct si_shader_part *next;
982    union si_shader_part_key key;
983    struct si_shader_binary binary;
984    struct ac_shader_config config;
985 };
986 
987 /* si_shader.c */
988 struct ac_rtld_binary;
989 
990 void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir);
991 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
992                        struct si_shader *shader, struct util_debug_callback *debug);
993 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
994                               struct si_shader *shader, struct util_debug_callback *debug);
995 void si_shader_destroy(struct si_shader *shader);
996 unsigned si_shader_io_get_unique_index(unsigned semantic);
997 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
998                             uint64_t scratch_va);
999 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1000                                uint64_t scratch_va, int64_t bo_offset);
1001 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1002                         enum si_shader_dump_type dump_type);
1003 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1004                     struct util_debug_callback *debug, FILE *f, bool check_debug_option);
1005 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1006                                         struct util_debug_callback *debug);
1007 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
1008 const char *si_get_shader_name(const struct si_shader *shader);
1009 void si_shader_binary_clean(struct si_shader_binary *binary);
1010 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
1011 unsigned si_get_ps_num_interp(struct si_shader *ps);
1012 bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
1013                            struct ac_rtld_binary *rtld);
1014 bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
1015                             uint64_t *value);
1016 unsigned si_get_shader_prefetch_size(struct si_shader *shader);
1017 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader);
1018 
/* si_shader_info.c */
void si_nir_scan_shader(struct si_screen *sscreen,
                        const struct nir_shader *nir,
                        struct si_shader_info *info);
1022 
/* si_shader_nir.c */
void si_lower_mediump_io(struct nir_shader *nir);
bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);

void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
void si_nir_late_opts(struct nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
1030 
/* si_state_shaders.cpp */
unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
void gfx9_get_gs_info(struct si_shader_selector *es,
                      struct si_shader_selector *gs,
                      struct gfx9_gs_info *out);
bool gfx10_is_ngg_passthrough(struct si_shader *shader);
unsigned si_shader_lshs_vertex_stride(struct si_shader *ls);
bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
1039 
1040 /* Inline helpers. */
1041 
1042 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key,unsigned wave_size)1043 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1044                                                          const union si_shader_key *key,
1045                                                          unsigned wave_size)
1046 {
1047    assert(wave_size == 32 || wave_size == 64);
1048    unsigned index = wave_size / 32 - 1;
1049 
1050    if (sel->stage <= MESA_SHADER_GEOMETRY) {
1051       if (key->ge.as_ls)
1052          return &sel->main_shader_part_ls[index];
1053       if (key->ge.as_es && key->ge.as_ngg)
1054          return &sel->main_shader_part_ngg_es[index];
1055       if (key->ge.as_es) {
1056          /* legacy GS only support wave 64 */
1057          assert(wave_size == 64);
1058          return &sel->main_shader_part_es;
1059       }
1060       if (key->ge.as_ngg)
1061          return &sel->main_shader_part_ngg[index];
1062    }
1063    return &sel->main_shader_part[index];
1064 }
1065 
si_shader_uses_bindless_samplers(struct si_shader_selector * selector)1066 static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
1067 {
1068    return selector ? selector->info.uses_bindless_samplers : false;
1069 }
1070 
si_shader_uses_bindless_images(struct si_shader_selector * selector)1071 static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
1072 {
1073    return selector ? selector->info.uses_bindless_images : false;
1074 }
1075 
gfx10_edgeflags_have_effect(struct si_shader * shader)1076 static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
1077 {
1078    if (shader->selector->stage == MESA_SHADER_VERTEX &&
1079        !shader->selector->info.base.vs.blit_sgprs_amd &&
1080        !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES))
1081       return true;
1082 
1083    return false;
1084 }
1085 
gfx10_ngg_writes_user_edgeflags(struct si_shader * shader)1086 static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
1087 {
1088    return gfx10_edgeflags_have_effect(shader) &&
1089           shader->selector->info.writes_edgeflag;
1090 }
1091 
si_shader_uses_streamout(const struct si_shader * shader)1092 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1093 {
1094    return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1095           shader->selector->info.enabled_streamout_buffer_mask &&
1096           !shader->key.ge.opt.remove_streamout &&
1097           !shader->key.ge.mono.remove_streamout;
1098 }
1099 
si_shader_uses_discard(struct si_shader * shader)1100 static inline bool si_shader_uses_discard(struct si_shader *shader)
1101 {
1102    /* Changes to this should also update ps_modifies_zs. */
1103    return shader->selector->info.base.fs.uses_discard ||
1104           shader->key.ps.part.prolog.poly_stipple ||
1105           shader->key.ps.mono.point_smoothing ||
1106           shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1107 }
1108 
1109 #ifdef __cplusplus
1110 }
1111 #endif
1112 
1113 #endif
1114