xref: /aosp_15_r20/external/mesa3d/src/freedreno/ir3/ir3_shader.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2014 Rob Clark <[email protected]>
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <[email protected]>
7  */
8 
9 #ifndef IR3_SHADER_H_
10 #define IR3_SHADER_H_
11 
12 #include <stdio.h>
13 
14 #include "c11/threads.h"
15 #include "compiler/nir/nir.h"
16 #include "compiler/shader_enums.h"
17 #include "util/bitscan.h"
18 #include "util/disk_cache.h"
19 
20 #include "ir3_compiler.h"
21 
22 BEGINC;
23 
/* driver param indices:
 *
 * Driver params are scalar values supplied by the driver (not the app)
 * through the const file.  Each stage's indices start over at zero, so
 * the per-stage groups below intentionally overlap; the *_COUNT values
 * give the (vec4-aligned) size of each stage's driver-param block.
 */
enum ir3_driver_param {
   /* compute shader driver params: */
   IR3_DP_NUM_WORK_GROUPS_X = 0,
   IR3_DP_NUM_WORK_GROUPS_Y = 1,
   IR3_DP_NUM_WORK_GROUPS_Z = 2,
   IR3_DP_WORK_DIM          = 3,
   IR3_DP_BASE_GROUP_X = 4,
   IR3_DP_BASE_GROUP_Y = 5,
   IR3_DP_BASE_GROUP_Z = 6,
   IR3_DP_CS_SUBGROUP_SIZE = 7,
   IR3_DP_LOCAL_GROUP_SIZE_X = 8,
   IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
   IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
   IR3_DP_SUBGROUP_ID_SHIFT = 11,
   IR3_DP_WORKGROUP_ID_X = 12,
   IR3_DP_WORKGROUP_ID_Y = 13,
   IR3_DP_WORKGROUP_ID_Z = 14,
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   IR3_DP_CS_COUNT = 16, /* must be aligned to vec4 */

   /* vertex shader driver params: */
   IR3_DP_DRAWID = 0,
   IR3_DP_VTXID_BASE = 1,
   IR3_DP_INSTID_BASE = 2,
   IR3_DP_VTXCNT_MAX = 3,
   IR3_DP_IS_INDEXED_DRAW = 4,  /* Note: boolean, ie. 0 or ~0 */
   /* user-clip-plane components, up to 8x vec4's: */
   IR3_DP_UCP0_X = 5,
   /* .... */
   IR3_DP_UCP7_W = 36,
   IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */

   /* TCS driver params: */
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_Y = 1,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_Z = 2,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_W = 3,
   IR3_DP_HS_DEFAULT_INNER_LEVEL_X = 4,
   IR3_DP_HS_DEFAULT_INNER_LEVEL_Y = 5,
   IR3_DP_HS_COUNT = 8, /* must be aligned to vec4 */

   /* fragment shader driver params: */
   IR3_DP_FS_SUBGROUP_SIZE = 0,
   /* Dynamic params (that aren't known when compiling the shader) */
   IR3_DP_FS_DYNAMIC = 4,
   IR3_DP_FS_FRAG_INVOCATION_COUNT = IR3_DP_FS_DYNAMIC,
   IR3_DP_FS_FRAG_SIZE = IR3_DP_FS_DYNAMIC + 4,
   IR3_DP_FS_FRAG_OFFSET = IR3_DP_FS_DYNAMIC + 6,
};
78 
79 #define IR3_MAX_SHADER_BUFFERS  32
80 #define IR3_MAX_SHADER_IMAGES   32
81 #define IR3_MAX_SO_BUFFERS      4
82 #define IR3_MAX_SO_STREAMS      4
83 #define IR3_MAX_SO_OUTPUTS      128
84 #define IR3_MAX_UBO_PUSH_RANGES 32
85 
/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0
 *
 * Interpolation modes for FS varyings: perspective vs linear (no
 * perspective), sampled at pixel center, per-sample, or centroid.
 */
enum ir3_bary {
   IJ_PERSP_PIXEL,
   IJ_PERSP_SAMPLE,
   IJ_PERSP_CENTROID,
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,
   IJ_LINEAR_CENTROID,
   IJ_LINEAR_SAMPLE,
   IJ_COUNT,
};
97 
/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,      /* only the small wavesize */
   IR3_SINGLE_OR_DOUBLE, /* either wavesize may be chosen */
   IR3_DOUBLE_ONLY,      /* only the large wavesize */
};
104 
/**
 * Description of a lowered UBO.
 *
 * Identifies the source of a UBO access after lowering: either a
 * bound constant block, a bindless descriptor, or a raw global
 * address (for global loads).
 */
struct nir_def;

struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;         /* Which constant block */
   uint16_t bindless_base; /* For bindless, which base register is used */
   bool bindless;          /* true if accessed through a bindless descriptor */
   bool global;            /* true if this is a raw global-address load */
};
117 
/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that there are not multiple disjoint
 * lowered ranges of a single UBO.
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo; /* which UBO this range was lowered from */
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};
129 
/* Result of UBO-range analysis: the set of UBO ranges that get pushed
 * into the const file.
 */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled; /* number of valid entries in range[] */
   uint32_t size;        /* total size pushed to the const file */
};
135 
/* How push constants are made available to the shader. */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,
   IR3_PUSH_CONSTS_SHARED,
   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};
142 
/* This represents an internal UBO filled out by the driver. There are a few
 * common UBOs that must be filled out identically by all drivers, for example
 * for shader linkage, but drivers can also add their own that they manage
 * themselves.
 */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO slot; negative presumably means unallocated -- TODO confirm */
   uint32_t size; /* size of the UBO contents */
};
152 
/**
 * Describes the layout of shader consts in the const register file.
 *
 * Layout of constant registers, each section aligned to vec4.  Note
 * that pointer size (ubo, etc) changes depending on generation.
 *
 *   + user consts: only used for turnip push consts
 *   + lowered UBO ranges
 *   + preamble consts
 *   + UBO addresses: turnip is bindless and these are wasted
 *   + image dimensions: a5xx only; needed to calculate pixel offset, but only
 *     for images that have image_{load,store,size,atomic*} intrinsics
 *   + kernel params: cl only
 *   + driver params: these are stage-dependent; see ir3_driver_param
 *   + TFBO addresses: only for vs on a3xx/a4xx
 *   + primitive params: these are stage-dependent
 *       vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
 *       hs, ds: uvec4(primitive_stride, vertex_stride,
 *                     patch_stride, patch_vertices_in)
 *               uvec4(tess_param_base, tess_factor_base)
 *   + primitive map
 *   + lowered immediates
 *
 * Immediates go last mostly because they are inserted in the CP pass
 * after the nir -> ir3 frontend.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_driver_params; /* scalar */

   /* Driver-managed UBOs (see struct ir3_driver_ubo): */
   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   /* Optional const allocations (preamble, UBO, etc.) may shift the required
    * consts more than they expect. The free space for optional allocations
    * should respect required_consts_aligment_vec4.
    */
   uint32_t required_consts_aligment_vec4;

   int32_t constant_data_dynamic_offsets;

   /* Start offsets (in the const file) of each section described in the
    * layout comment above:
    */
   struct {
      /* user const start at zero */
      unsigned ubo;
      unsigned image_dims;
      unsigned kernel_params;
      unsigned driver_param;
      unsigned tfbo;
      unsigned primitive_param;
      unsigned primitive_map;
      unsigned immediate;
   } offsets;

   struct {
      uint32_t mask;  /* bitmask of images that have image_store */
      uint32_t count; /* number of consts allocated */
      /* three const allocated per image which has image_store:
       *  + cpp         (bytes per pixel)
       *  + pitch       (y pitch)
       *  + array_pitch (z pitch)
       */
      uint32_t off[IR3_MAX_SHADER_IMAGES];
   } image_dims;

   /* Lowered immediate values appended to the const file by the CP pass: */
   unsigned immediates_count;
   unsigned immediates_size;
   uint32_t *immediates;

   unsigned preamble_size;
   unsigned global_size;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
231 
/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index  : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2;  /**< 0 to 3 */
   unsigned num_components  : 3;  /**< 1 to 4 */
   unsigned output_buffer   : 3;  /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset      : 16; /**< offset into the buffer in dwords */
   unsigned stream          : 2;  /**< 0 to 3 */
};
243 
/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs;
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written; /* bitmask of streams with outputs */
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
262 
/**
 * Starting from a4xx, HW supports pre-dispatching texture sampling
 * instructions prior to scheduling a shader stage, when the
 * coordinate maps exactly to an output of the previous stage.
 */

/**
 * There is a limit in the number of pre-dispatches allowed for any
 * given stage.
 */
#define IR3_MAX_SAMPLER_PREFETCH 4

/**
 * This is the output stream value for 'cmd', as used by blob. It may
 * encode the return type (in 3 bits) but it hasn't been verified yet.
 */
#define IR3_SAMPLER_PREFETCH_CMD          0x4
#define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6

/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src;              /* source register (coordinate from prev stage) */
   bool bindless;            /* use the *_bindless_id fields below */
   uint8_t samp_id;          /* bound sampler index (non-bindless) */
   uint8_t tex_id;           /* bound texture index (non-bindless) */
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;              /* destination register */
   uint8_t wrmask;           /* write-mask of components written */
   uint8_t half_precision;
   opc_t tex_opc;            /* which texture-fetch opcode to pre-dispatch */
};
297 
/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 *
 * The leading bitfields are unioned with 'global' so the common case of
 * key comparison is a single 32-bit compare (see ir3_shader_key_equal()).
 */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa           : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;

         /* Whether driconf "dual_color_blend_by_location" workaround is
          * enabled
          */
         unsigned force_dual_color_blend : 1;
      };
      uint32_t global; /* all of the above bits as one word, for fast compare */
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
368 
369 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)370 ir3_tess_mode(enum tess_primitive_mode tess_mode)
371 {
372    switch (tess_mode) {
373    case TESS_PRIMITIVE_ISOLINES:
374       return IR3_TESS_ISOLINES;
375    case TESS_PRIMITIVE_TRIANGLES:
376       return IR3_TESS_TRIANGLES;
377    case TESS_PRIMITIVE_QUADS:
378       return IR3_TESS_QUADS;
379    default:
380       unreachable("bad tessmode");
381    }
382 }
383 
384 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)385 ir3_tess_factor_stride(unsigned patch_type)
386 {
387    /* note: this matches the stride used by ir3's build_tessfactor_base */
388    switch (patch_type) {
389    case IR3_TESS_ISOLINES:
390       return 12;
391    case IR3_TESS_TRIANGLES:
392       return 20;
393    case IR3_TESS_QUADS:
394       return 28;
395    default:
396       unreachable("bad tessmode");
397    }
398 }
399 
400 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)401 ir3_shader_key_equal(const struct ir3_shader_key *a,
402                      const struct ir3_shader_key *b)
403 {
404    /* slow-path if we need to check {v,f}saturate_{s,t,r} */
405    if (a->has_per_samp || b->has_per_samp)
406       return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
407    return a->global == b->global;
408 }
409 
410 /* will the two keys produce different lowering for a fragment shader? */
411 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)412 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
413                           struct ir3_shader_key *last_key)
414 {
415    if (last_key->has_per_samp || key->has_per_samp) {
416       if ((last_key->fsamples != key->fsamples) ||
417           (last_key->fastc_srgb != key->fastc_srgb) ||
418           memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
419                 sizeof(key->fsampler_swizzles)))
420          return true;
421    }
422 
423    if (last_key->rasterflat != key->rasterflat)
424       return true;
425 
426    if (last_key->ucp_enables != key->ucp_enables)
427       return true;
428 
429    if (last_key->safe_constlen != key->safe_constlen)
430       return true;
431 
432    return false;
433 }
434 
435 /* will the two keys produce different lowering for a vertex shader? */
436 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)437 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
438                           struct ir3_shader_key *last_key)
439 {
440    if (last_key->has_per_samp || key->has_per_samp) {
441       if ((last_key->vsamples != key->vsamples) ||
442           (last_key->vastc_srgb != key->vastc_srgb) ||
443           memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
444                 sizeof(key->vsampler_swizzles)))
445          return true;
446    }
447 
448    if (last_key->ucp_enables != key->ucp_enables)
449       return true;
450 
451    if (last_key->safe_constlen != key->safe_constlen)
452       return true;
453 
454    return false;
455 }
456 
/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or vice versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?) confusing, for the hw "SSBO" state
 * (since it is really both SSBO and Image) I'll use the name "IBO"
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
500 
/* Captured disassembly/debug text for a variant (only filled out when
 * write_disasm is set).
 */
struct ir3_disasm_info {
   bool write_disasm; /* whether to capture nir/disasm text at all */
   char *nir;         /* printed NIR for the variant */
   char *disasm;      /* printed ir3 disassembly for the variant */
};
506 
/* Represents half register in regid */
#define HALF_REG_ID 0x100

/* Per-shader compile options supplied by the driver: */
struct ir3_shader_options {
   /* consts reserved at the start of the const file for driver use */
   unsigned num_reserved_user_consts;
   /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use. If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   uint32_t push_consts_base;
   uint32_t push_consts_dwords;
};
526 
/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 *
 * NOTE: field order below the VARIANT_CACHE_START marker is part of the
 * disk-cache serialization format; any pointer fields must stay above it.
 */
struct ir3_shader_variant {
   /* backing buffer object for the assembled instructions */
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   //	union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //	};

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   /* number of loops -- NOTE(review): looks like a stats counter; confirm */
   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Whether early preamble is enabled. */
   bool early_preamble;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t view;
      bool half : 1;
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage. In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[12 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary       : 1; /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half       : 1;
      bool flat       : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   /* uses per-sample state (see ir3_shader_key::has_per_samp) */
   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader. Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* stage-specific info, selected by 'type': */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw:1;
         bool point_mode:1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in:3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output  : 1;
         bool fbfetch_coherent     : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
         bool force_linear_dispatch;
      } cs;
   };

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
835 
836 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)837 ir3_shader_stage(struct ir3_shader_variant *v)
838 {
839    switch (v->type) {
840    case MESA_SHADER_VERTEX:
841       return v->binning_pass ? "BVERT" : "VERT";
842    case MESA_SHADER_TESS_CTRL:
843       return "TCS";
844    case MESA_SHADER_TESS_EVAL:
845       return "TES";
846    case MESA_SHADER_GEOMETRY:
847       return "GEOM";
848    case MESA_SHADER_FRAGMENT:
849       return "FRAG";
850    case MESA_SHADER_COMPUTE:
851    case MESA_SHADER_KERNEL:
852       return "CL";
853    default:
854       unreachable("invalid type");
855       return NULL;
856    }
857 }
858 
859 /* Currently we do not do binning for tess.  And for GS there is no
860  * cross-stage VS+GS optimization, so the full VS+GS is used in
861  * the binning pass.
862  */
863 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)864 ir3_has_binning_vs(const struct ir3_shader_key *key)
865 {
866    if (key->tessellation || key->has_gs)
867       return false;
868    return true;
869 }
870 
/**
 * Represents a shader at the API level, before state-specific variants are
 * generated.
 */
struct ir3_shader {
   gl_shader_stage type;

   /* shader id (for debug): */
   uint32_t id;
   uint32_t variant_count;

   /* Set by freedreno after shader_state_create, so we can emit debug info
    * when recompiling a shader at draw time.
    */
   bool initial_variants_done;

   struct ir3_compiler *compiler;

   struct ir3_shader_options options;

   bool nir_finalized;
   struct nir_shader *nir;
   struct ir3_stream_output_info stream_output;

   /* per shader stage specific info: */
   union {
      /* for compute shaders: */
      struct {
         unsigned req_input_mem;    /* in dwords */
         unsigned req_local_mem;
         bool force_linear_dispatch;
      } cs;
      /* For vertex shaders: */
      struct {
         /* If we need to generate a passthrough TCS, it will be a function of
          * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
          * in the VS keyed by # of patch_vertices-1.
          */
         unsigned passthrough_tcs_compiled;
         struct ir3_shader *passthrough_tcs[32];
      } vs;
   };

   /* linked list of variants, guarded by variants_lock: */
   struct ir3_shader_variant *variants;
   mtx_t variants_lock;

   cache_key cache_key; /* shader disk-cache key */

   /* Bitmask of bits of the shader key used by this shader.  Used to avoid
    * recompiles for GL NOS that doesn't actually apply to the shader.
    */
   struct ir3_shader_key key_mask;
};
924 
925 /**
926  * In order to use the same cmdstream, in particular constlen setup and const
927  * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
928  * corresponding draw pass shaders const_state.
929  */
930 static inline const struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)931 ir3_const_state(const struct ir3_shader_variant *v)
932 {
933    if (v->binning_pass)
934       return v->nonbinning->const_state;
935    return v->const_state;
936 }
937 
938 static inline struct ir3_const_state *
ir3_const_state_mut(const struct ir3_shader_variant * v)939 ir3_const_state_mut(const struct ir3_shader_variant *v)
940 {
941    assert(!v->binning_pass);
942    return v->const_state;
943 }
944 
945 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)946 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
947 {
948    const struct ir3_compiler *compiler = v->compiler;
949    bool shared_consts_enable =
950       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
951 
952    /* Shared consts size for CS and FS matches with what's acutally used,
953     * but the size of shared consts for geomtry stages doesn't.
954     * So we use a hw quirk for geometry shared consts.
955     */
956    uint32_t shared_consts_size = shared_consts_enable ?
957          compiler->shared_consts_size : 0;
958 
959    uint32_t shared_consts_size_geom = shared_consts_enable ?
960          compiler->geom_shared_consts_size_quirk : 0;
961 
962    uint32_t safe_shared_consts_size = shared_consts_enable ?
963       ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
964                      DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
965 
966    if ((v->type == MESA_SHADER_COMPUTE) ||
967        (v->type == MESA_SHADER_KERNEL)) {
968       return compiler->max_const_compute - shared_consts_size;
969    } else if (safe_constlen) {
970       return compiler->max_const_safe - safe_shared_consts_size;
971    } else if (v->type == MESA_SHADER_FRAGMENT) {
972       return compiler->max_const_frag - shared_consts_size;
973    } else {
974       return compiler->max_const_geom - shared_consts_size_geom;
975    }
976 }
977 
978 /* Given a variant, calculate the maximum constlen it can have.
979  */
980 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)981 ir3_max_const(const struct ir3_shader_variant *v)
982 {
983    return _ir3_max_const(v, v->key.safe_constlen);
984 }
985 
986 uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
987 uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
988 
989 /* Return true if a variant may need to be recompiled due to exceeding the
990  * maximum "safe" constlen.
991  */
992 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)993 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
994 {
995    return v->constlen > _ir3_max_const(v, true);
996 }
997 
998 void *ir3_shader_assemble(struct ir3_shader_variant *v);
999 struct ir3_shader_variant *
1000 ir3_shader_create_variant(struct ir3_shader *shader,
1001                           const struct ir3_shader_key *key,
1002                           bool keep_ir);
1003 struct ir3_shader_variant *
1004 ir3_shader_get_variant(struct ir3_shader *shader,
1005                        const struct ir3_shader_key *key, bool binning_pass,
1006                        bool keep_ir, bool *created);
1007 
1008 struct ir3_shader *
1009 ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
1010                     const struct ir3_shader_options *options,
1011                     struct ir3_stream_output_info *stream_output);
1012 uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
1013                            const struct ir3_compiler *compiler);
1014 struct ir3_shader *
1015 ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
1016 void ir3_shader_destroy(struct ir3_shader *shader);
1017 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
1018 uint64_t ir3_shader_outputs(const struct ir3_shader *so);
1019 
1020 int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
1021 
1022 /*
1023  * Helper/util:
1024  */
1025 
1026 /* clears shader-key flags which don't apply to the given shader.
1027  */
1028 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1029 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1030 {
1031    uint32_t *key_bits = (uint32_t *)key;
1032    uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1033    STATIC_ASSERT(sizeof(*key) % 4 == 0);
1034    for (int i = 0; i < sizeof(*key) >> 2; i++)
1035       key_bits[i] &= key_mask[i];
1036 }
1037 
1038 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1039 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1040 {
1041    int j;
1042 
1043    for (j = 0; j < so->outputs_count; j++)
1044       if (so->outputs[j].slot == slot)
1045          return j;
1046 
1047    /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1048     * in the vertex shader.. but the fragment shader doesn't know this
1049     * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
1050     * at link time if there is no matching OUT.BCOLOR[n], we must map
1051     * OUT.COLOR[n] to IN.BCOLOR[n].  And visa versa if there is only
1052     * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1053     */
1054    if (slot == VARYING_SLOT_BFC0) {
1055       slot = VARYING_SLOT_COL0;
1056    } else if (slot == VARYING_SLOT_BFC1) {
1057       slot = VARYING_SLOT_COL1;
1058    } else if (slot == VARYING_SLOT_COL0) {
1059       slot = VARYING_SLOT_BFC0;
1060    } else if (slot == VARYING_SLOT_COL1) {
1061       slot = VARYING_SLOT_BFC1;
1062    } else {
1063       return -1;
1064    }
1065 
1066    for (j = 0; j < so->outputs_count; j++)
1067       if (so->outputs[j].slot == slot)
1068          return j;
1069 
1070    return -1;
1071 }
1072 
1073 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1074 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1075 {
1076    while (++i < so->inputs_count)
1077       if (so->inputs[i].compmask && so->inputs[i].bary)
1078          break;
1079    return i;
1080 }
1081 
1082 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1083 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1084 {
1085    int j = -1;
1086 
1087    while (true) {
1088       j = ir3_next_varying(so, j);
1089 
1090       if (j >= so->inputs_count)
1091          return -1;
1092 
1093       if (so->inputs[j].slot == slot)
1094          return j;
1095    }
1096 }
1097 
1098 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1099 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1100 {
1101    int var = ir3_find_input(so, slot);
1102    return var == -1 ? 0xff : so->inputs[var].inloc;
1103 }
1104 
/* Describes how the outputs of the last geometry stage are mapped to VPC
 * locations consumed by the fragment shader (built by ir3_link_shaders()).
 */
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location.  One entry per linked varying; see
    * ir3_link_add().
    */
   struct {
      uint8_t slot;     /* varying slot */
      uint8_t regid;    /* producer-stage output register */
      uint8_t compmask; /* mask of components used */
      uint8_t loc;      /* VPC location of first component */
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough (0xff if unused) */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough (0xff if unused) */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff if unused) */
   uint8_t clip0_loc, clip1_loc;
};
1136 
1137 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1138 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1139              uint8_t compmask, uint8_t loc)
1140 {
1141    for (int j = 0; j < util_last_bit(compmask); j++) {
1142       uint8_t comploc = loc + j;
1143       l->varmask[comploc / 32] |= 1 << (comploc % 32);
1144    }
1145 
1146    l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1147 
1148    if (regid_ != regid(63, 0)) {
1149       int i = l->cnt++;
1150       assert(i < ARRAY_SIZE(l->var));
1151 
1152       l->var[i].slot = slot;
1153       l->var[i].regid = regid_;
1154       l->var[i].compmask = compmask;
1155       l->var[i].loc = loc;
1156    }
1157 }
1158 
/* Build the VS->FS linkage map: walk the FS varying inputs in order and,
 * for each location actually consumed, record the matching VS output (or
 * a dummy regid when the VS doesn't write the slot), plus the special
 * fixed-function passthrough locations (primid/viewid/clip0/clip1).
 */
static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
                 const struct ir3_shader_variant *vs,
                 const struct ir3_shader_variant *fs, bool pack_vs_out)
{
   /* On older platforms, varmask isn't programmed at all, and it appears
    * that the hardware generates a mask of used VPC locations using the VS
    * output map, and hangs if a FS bary instruction references a location
    * not in the list. This means that we need to have a dummy entry in the
    * VS out map for things like gl_PointCoord which aren't written by the
    * VS. Furthermore we can't use r63.x, so just pick a random register to
    * use if there is no VS output.
    */
   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
   int j = -1, k;

   /* 0xff == location not present: */
   l->primid_loc = 0xff;
   l->viewid_loc = 0xff;
   l->clip0_loc = 0xff;
   l->clip1_loc = 0xff;

   while (l->cnt < ARRAY_SIZE(l->var)) {
      j = ir3_next_varying(fs, j);

      if (j >= fs->inputs_count)
         break;

      /* skip inputs whose inloc is beyond what the FS actually consumes: */
      if (fs->inputs[j].inloc >= fs->total_in)
         continue;

      k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);

      if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
         l->primid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
         /* expected: no matching VS output for VIEW_INDEX */
         assert(k < 0);
         l->viewid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
         l->clip0_loc = fs->inputs[j].inloc;

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
         l->clip1_loc = fs->inputs[j].inloc;

      ir3_link_add(l, fs->inputs[j].slot,
                   k >= 0 ? vs->outputs[k].regid : default_regid,
                   fs->inputs[j].compmask, fs->inputs[j].inloc);
   }
}
1211 
1212 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1213 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1214 {
1215    int j;
1216    for (j = 0; j < so->outputs_count; j++)
1217       if (so->outputs[j].slot == slot) {
1218          uint32_t regid = so->outputs[j].regid;
1219          if (so->outputs[j].half)
1220             regid |= HALF_REG_ID;
1221          return regid;
1222       }
1223    return regid(63, 0);
1224 }
1225 
1226 void print_raw(FILE *out, const BITSET_WORD *data, size_t size);
1227 
1228 void ir3_link_stream_out(struct ir3_shader_linkage *l,
1229                          const struct ir3_shader_variant *v);
1230 
1231 #define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
1232 #define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
1233 #define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
1234 #define VARYING_SLOT_REL_PATCH_ID_IR3    (VARYING_SLOT_MAX + 3)
1235 
1236 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1237 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1238 {
1239    if (!so)
1240       return regid(63, 0);
1241    for (int j = 0; j < so->inputs_count; j++)
1242       if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1243          return so->inputs[j].regid;
1244    return regid(63, 0);
1245 }
1246 
1247 /* calculate register footprint in terms of half-regs (ie. one full
1248  * reg counts as two half-regs).
1249  */
1250 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1251 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1252 {
1253    return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1254 }
1255 
/* Returns the number of IBOs used by the variant. */
static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}
1261 
1262 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1263 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1264 {
1265    /* Dummy shader */
1266    if (!v->compiler)
1267       return 0;
1268 
1269    if (v->compiler->gen < 5)
1270       return v->branchstack;
1271 
1272    return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1273 }
1274 
1275 ENDC;
1276 
1277 #endif /* IR3_SHADER_H_ */
1278