1 /*
2 * Copyright © 2014 Rob Clark <[email protected]>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <[email protected]>
7 */
8
9 #ifndef IR3_SHADER_H_
10 #define IR3_SHADER_H_
11
12 #include <stdio.h>
13
14 #include "c11/threads.h"
15 #include "compiler/nir/nir.h"
16 #include "compiler/shader_enums.h"
17 #include "util/bitscan.h"
18 #include "util/disk_cache.h"
19
20 #include "ir3_compiler.h"
21
22 BEGINC;
23
/* Driver param indices:
 *
 * NOTE: this enum really packs three independent per-stage namespaces
 * (CS, VS, TCS/FS) — the numbering restarts at 0 for each stage, and only
 * one stage's params are ever in use for a given shader.
 */
enum ir3_driver_param {
   /* compute shader driver params: */
   IR3_DP_NUM_WORK_GROUPS_X = 0,
   IR3_DP_NUM_WORK_GROUPS_Y = 1,
   IR3_DP_NUM_WORK_GROUPS_Z = 2,
   IR3_DP_WORK_DIM = 3,
   IR3_DP_BASE_GROUP_X = 4,
   IR3_DP_BASE_GROUP_Y = 5,
   IR3_DP_BASE_GROUP_Z = 6,
   IR3_DP_CS_SUBGROUP_SIZE = 7,
   IR3_DP_LOCAL_GROUP_SIZE_X = 8,
   IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
   IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
   IR3_DP_SUBGROUP_ID_SHIFT = 11,
   IR3_DP_WORKGROUP_ID_X = 12,
   IR3_DP_WORKGROUP_ID_Y = 13,
   IR3_DP_WORKGROUP_ID_Z = 14,
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   IR3_DP_CS_COUNT = 16, /* must be aligned to vec4 */

   /* vertex shader driver params: */
   IR3_DP_DRAWID = 0,
   IR3_DP_VTXID_BASE = 1,
   IR3_DP_INSTID_BASE = 2,
   IR3_DP_VTXCNT_MAX = 3,
   IR3_DP_IS_INDEXED_DRAW = 4, /* Note: boolean, ie. 0 or ~0 */
   /* user-clip-plane components, up to 8x vec4's: */
   IR3_DP_UCP0_X = 5,
   /* .... */
   IR3_DP_UCP7_W = 36, /* 5 + 8 planes * 4 components - 1 */
   IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */

   /* TCS driver params: */
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_Y = 1,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_Z = 2,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_W = 3,
   IR3_DP_HS_DEFAULT_INNER_LEVEL_X = 4,
   IR3_DP_HS_DEFAULT_INNER_LEVEL_Y = 5,
   IR3_DP_HS_COUNT = 8, /* must be aligned to vec4 */

   /* fragment shader driver params: */
   IR3_DP_FS_SUBGROUP_SIZE = 0,
   /* Dynamic params (that aren't known when compiling the shader) */
   IR3_DP_FS_DYNAMIC = 4,
   IR3_DP_FS_FRAG_INVOCATION_COUNT = IR3_DP_FS_DYNAMIC,
   IR3_DP_FS_FRAG_SIZE = IR3_DP_FS_DYNAMIC + 4,
   IR3_DP_FS_FRAG_OFFSET = IR3_DP_FS_DYNAMIC + 6,
};
78
79 #define IR3_MAX_SHADER_BUFFERS 32
80 #define IR3_MAX_SHADER_IMAGES 32
81 #define IR3_MAX_SO_BUFFERS 4
82 #define IR3_MAX_SO_STREAMS 4
83 #define IR3_MAX_SO_OUTPUTS 128
84 #define IR3_MAX_UBO_PUSH_RANGES 32
85
/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
enum ir3_bary {
   IJ_PERSP_PIXEL,
   IJ_PERSP_SAMPLE,
   IJ_PERSP_CENTROID,
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,
   IJ_LINEAR_CENTROID,
   IJ_LINEAR_SAMPLE,
   IJ_COUNT, /* sentinel: number of interpolation modes, not a real mode */
};
97
/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,       /* only the smaller wavesize */
   IR3_SINGLE_OR_DOUBLE,  /* either wavesize may be chosen */
   IR3_DOUBLE_ONLY,       /* only the doubled wavesize */
};
104
struct nir_def;

/**
 * Description of a lowered UBO (i.e. a UBO whose loads have been turned
 * into const-file reads).
 */
struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;              /* Which constant block */
   uint16_t bindless_base;      /* For bindless, which base register is used */
   bool bindless;               /* set when the UBO is accessed bindlessly */
   bool global;                 /* set for lowered global (not UBO) loads */
};
117
/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that there are not multiple disjoint
 * lowered ranges of a single UBO.
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo;
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};
129
/* Result of UBO-push analysis: which UBO ranges get lowered to consts. */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled; /* number of valid entries in range[] */
   uint32_t size;        /* total pushed size */
};
135
/* How push constants are implemented for a shader/pipeline. */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,
   IR3_PUSH_CONSTS_SHARED,
   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};
142
/* This represents an internal UBO filled out by the driver. There are a few
 * common UBOs that must be filled out identically by all drivers, for example
 * for shader linkage, but drivers can also add their own that they manage
 * themselves.
 */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO index; negative presumably means "unallocated" — TODO confirm */
   uint32_t size; /* size of the UBO contents */
};
152
/**
 * Describes the layout of shader consts in the const register file.
 *
 * Layout of constant registers, each section aligned to vec4.  Note
 * that pointer size (ubo, etc) changes depending on generation.
 *
 *  + user consts: only used for turnip push consts
 *  + lowered UBO ranges
 *  + preamble consts
 *  + UBO addresses: turnip is bindless and these are wasted
 *  + image dimensions: a5xx only; needed to calculate pixel offset, but only
 *    for images that have image_{load,store,size,atomic*} intrinsics
 *  + kernel params: cl only
 *  + driver params: these are stage-dependent; see ir3_driver_param
 *  + TFBO addresses: only for vs on a3xx/a4xx
 *  + primitive params: these are stage-dependent
 *      vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
 *      hs, ds: uvec4(primitive_stride, vertex_stride,
 *                    patch_stride, patch_vertices_in)
 *              uvec4(tess_param_base, tess_factor_base)
 *  + primitive map
 *  + lowered immediates
 *
 * Immediates go last mostly because they are inserted in the CP pass
 * after the nir -> ir3 frontend.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_driver_params; /* scalar */

   /* Driver-managed UBOs (see struct ir3_driver_ubo): */
   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   /* Optional const allocations (preamble, UBO, etc.) may shift the required
    * consts more than they expect. The free space for optional allocations
    * should respect required_consts_aligment_vec4.
    */
   /* NOTE: "aligment" (sic) — field name kept as-is for compatibility. */
   uint32_t required_consts_aligment_vec4;

   int32_t constant_data_dynamic_offsets;

   /* Start offset of each section described in the layout comment above
    * (presumably in vec4 units, per the "aligned to vec4" note — TODO
    * confirm against the const-emit code):
    */
   struct {
      /* user const start at zero */
      unsigned ubo;
      unsigned image_dims;
      unsigned kernel_params;
      unsigned driver_param;
      unsigned tfbo;
      unsigned primitive_param;
      unsigned primitive_map;
      unsigned immediate;
   } offsets;

   struct {
      uint32_t mask;  /* bitmask of images that have image_store */
      uint32_t count; /* number of consts allocated */
      /* three const allocated per image which has image_store:
       *  + cpp         (bytes per pixel)
       *  + pitch       (y pitch)
       *  + array_pitch (z pitch)
       */
      uint32_t off[IR3_MAX_SHADER_IMAGES];
   } image_dims;

   unsigned immediates_count;
   unsigned immediates_size;
   uint32_t *immediates;

   unsigned preamble_size;
   unsigned global_size;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
231
/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2; /**< 0 to 3 */
   unsigned num_components : 3;  /**< 1 to 4 */
   unsigned output_buffer : 3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset : 16;     /**< offset into the buffer in dwords */
   unsigned stream : 2;          /**< 0 to 3 */
};
243
/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs;
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written; /* bitmask of streams with at least one output */
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
262
263 /**
264 * Starting from a4xx, HW supports pre-dispatching texture sampling
265 * instructions prior to scheduling a shader stage, when the
266 * coordinate maps exactly to an output of the previous stage.
267 */
268
269 /**
270 * There is a limit in the number of pre-dispatches allowed for any
271 * given stage.
272 */
273 #define IR3_MAX_SAMPLER_PREFETCH 4
274
275 /**
276 * This is the output stream value for 'cmd', as used by blob. It may
277 * encode the return type (in 3 bits) but it hasn't been verified yet.
278 */
279 #define IR3_SAMPLER_PREFETCH_CMD 0x4
280 #define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
281
/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src; /* source (input) for the prefetch coordinate */
   bool bindless;
   /* non-bindless sampler/texture state indices: */
   uint8_t samp_id;
   uint8_t tex_id;
   /* bindless sampler/texture ids (used when bindless is set): */
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;    /* destination register */
   uint8_t wrmask; /* write-mask of dst components */
   uint8_t half_precision;
   opc_t tex_opc;  /* which texture-fetch opcode the prefetch replaces */
};
297
/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 */
struct ir3_shader_key {
   /* The bitfields below are mirrored by 'global' so the common case
    * (no per-sampler state) can be compared as a single uint32_t.
    */
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;

         /* Whether driconf "dual_color_blend_by_location" workaround is
          * enabled
          */
         unsigned force_dual_color_blend : 1;
      };
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
368
369 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)370 ir3_tess_mode(enum tess_primitive_mode tess_mode)
371 {
372 switch (tess_mode) {
373 case TESS_PRIMITIVE_ISOLINES:
374 return IR3_TESS_ISOLINES;
375 case TESS_PRIMITIVE_TRIANGLES:
376 return IR3_TESS_TRIANGLES;
377 case TESS_PRIMITIVE_QUADS:
378 return IR3_TESS_QUADS;
379 default:
380 unreachable("bad tessmode");
381 }
382 }
383
384 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)385 ir3_tess_factor_stride(unsigned patch_type)
386 {
387 /* note: this matches the stride used by ir3's build_tessfactor_base */
388 switch (patch_type) {
389 case IR3_TESS_ISOLINES:
390 return 12;
391 case IR3_TESS_TRIANGLES:
392 return 20;
393 case IR3_TESS_QUADS:
394 return 28;
395 default:
396 unreachable("bad tessmode");
397 }
398 }
399
400 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)401 ir3_shader_key_equal(const struct ir3_shader_key *a,
402 const struct ir3_shader_key *b)
403 {
404 /* slow-path if we need to check {v,f}saturate_{s,t,r} */
405 if (a->has_per_samp || b->has_per_samp)
406 return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
407 return a->global == b->global;
408 }
409
410 /* will the two keys produce different lowering for a fragment shader? */
411 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)412 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
413 struct ir3_shader_key *last_key)
414 {
415 if (last_key->has_per_samp || key->has_per_samp) {
416 if ((last_key->fsamples != key->fsamples) ||
417 (last_key->fastc_srgb != key->fastc_srgb) ||
418 memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
419 sizeof(key->fsampler_swizzles)))
420 return true;
421 }
422
423 if (last_key->rasterflat != key->rasterflat)
424 return true;
425
426 if (last_key->ucp_enables != key->ucp_enables)
427 return true;
428
429 if (last_key->safe_constlen != key->safe_constlen)
430 return true;
431
432 return false;
433 }
434
435 /* will the two keys produce different lowering for a vertex shader? */
436 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)437 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
438 struct ir3_shader_key *last_key)
439 {
440 if (last_key->has_per_samp || key->has_per_samp) {
441 if ((last_key->vsamples != key->vsamples) ||
442 (last_key->vastc_srgb != key->vastc_srgb) ||
443 memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
444 sizeof(key->vsampler_swizzles)))
445 return true;
446 }
447
448 if (last_key->ucp_enables != key->ucp_enables)
449 return true;
450
451 if (last_key->safe_constlen != key->safe_constlen)
452 return true;
453
454 return false;
455 }
456
/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or vice versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?) confusing, for the hw "SSBO" state
 * (since it is really both SSBO and Image) I'll use the name "IBO"
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
500
/* Collected disassembly/debug text for a variant (only filled out when
 * write_disasm is set).
 */
struct ir3_disasm_info {
   bool write_disasm;
   char *nir;    /* printed NIR for the variant */
   char *disasm; /* printed ir3 disassembly */
};
506
/* Represents half register in regid */
#define HALF_REG_ID 0x100

/* Compile-time options that apply to all variants of a shader. */
struct ir3_shader_options {
   /* user consts reserved at the front of the const file (vec4 granularity
    * presumed — TODO confirm against const layout code):
    */
   unsigned num_reserved_user_consts;
   /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use. If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   uint32_t push_consts_base;
   uint32_t push_consts_dwords;
};
526
/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 */
struct ir3_shader_variant {
   /* buffer object for the assembled program (managed outside this header) */
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   /* the key this variant was compiled against: */
   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   /* NOTE: conceptually a union (a variant is either the binning or the
    * non-binning one), kept as two pointers:
    */
   //	union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //	};

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Whether early preamble is enabled. */
   bool early_preamble;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t view;
      bool half : 1;
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage. In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[12 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary : 1;       /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half : 1;
      bool flat : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader. Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* per-stage info (only the member matching 'type' is meaningful): */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw:1;
         bool point_mode:1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in:3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output : 1;
         bool fbfetch_coherent : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
         bool force_linear_dispatch;
      } cs;
   };

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
835
836 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)837 ir3_shader_stage(struct ir3_shader_variant *v)
838 {
839 switch (v->type) {
840 case MESA_SHADER_VERTEX:
841 return v->binning_pass ? "BVERT" : "VERT";
842 case MESA_SHADER_TESS_CTRL:
843 return "TCS";
844 case MESA_SHADER_TESS_EVAL:
845 return "TES";
846 case MESA_SHADER_GEOMETRY:
847 return "GEOM";
848 case MESA_SHADER_FRAGMENT:
849 return "FRAG";
850 case MESA_SHADER_COMPUTE:
851 case MESA_SHADER_KERNEL:
852 return "CL";
853 default:
854 unreachable("invalid type");
855 return NULL;
856 }
857 }
858
859 /* Currently we do not do binning for tess. And for GS there is no
860 * cross-stage VS+GS optimization, so the full VS+GS is used in
861 * the binning pass.
862 */
863 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)864 ir3_has_binning_vs(const struct ir3_shader_key *key)
865 {
866 if (key->tessellation || key->has_gs)
867 return false;
868 return true;
869 }
870
/**
 * Represents a shader at the API level, before state-specific variants are
 * generated.
 */
struct ir3_shader {
   gl_shader_stage type;

   /* shader id (for debug): */
   uint32_t id;
   uint32_t variant_count;

   /* Set by freedreno after shader_state_create, so we can emit debug info
    * when recompiling a shader at draw time.
    */
   bool initial_variants_done;

   struct ir3_compiler *compiler;

   struct ir3_shader_options options;

   bool nir_finalized;
   struct nir_shader *nir;
   struct ir3_stream_output_info stream_output;

   /* per shader stage specific info: */
   union {
      /* for compute shaders: */
      struct {
         unsigned req_input_mem; /* in dwords */
         unsigned req_local_mem;
         bool force_linear_dispatch;
      } cs;
      /* For vertex shaders: */
      struct {
         /* If we need to generate a passthrough TCS, it will be a function of
          * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
          * in the VS keyed by # of patch_vertices-1.
          */
         unsigned passthrough_tcs_compiled; /* bitmask of compiled entries */
         struct ir3_shader *passthrough_tcs[32];
      } vs;
   };

   /* linked list of variants (see ir3_shader_variant::next), guarded by
    * variants_lock:
    */
   struct ir3_shader_variant *variants;
   mtx_t variants_lock;

   cache_key cache_key; /* shader disk-cache key */

   /* Bitmask of bits of the shader key used by this shader.  Used to avoid
    * recompiles for GL NOS that doesn't actually apply to the shader.
    */
   struct ir3_shader_key key_mask;
};
924
925 /**
926 * In order to use the same cmdstream, in particular constlen setup and const
927 * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
928 * corresponding draw pass shaders const_state.
929 */
930 static inline const struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)931 ir3_const_state(const struct ir3_shader_variant *v)
932 {
933 if (v->binning_pass)
934 return v->nonbinning->const_state;
935 return v->const_state;
936 }
937
/**
 * Mutable access to a variant's const_state.  Only valid for draw-pass
 * variants: binning-pass variants share the const_state of their
 * corresponding draw-pass shader (see ir3_const_state()) and must not
 * modify it.
 */
static inline struct ir3_const_state *
ir3_const_state_mut(const struct ir3_shader_variant *v)
{
   assert(!v->binning_pass);
   return v->const_state;
}
944
945 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)946 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
947 {
948 const struct ir3_compiler *compiler = v->compiler;
949 bool shared_consts_enable =
950 ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
951
952 /* Shared consts size for CS and FS matches with what's acutally used,
953 * but the size of shared consts for geomtry stages doesn't.
954 * So we use a hw quirk for geometry shared consts.
955 */
956 uint32_t shared_consts_size = shared_consts_enable ?
957 compiler->shared_consts_size : 0;
958
959 uint32_t shared_consts_size_geom = shared_consts_enable ?
960 compiler->geom_shared_consts_size_quirk : 0;
961
962 uint32_t safe_shared_consts_size = shared_consts_enable ?
963 ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
964 DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
965
966 if ((v->type == MESA_SHADER_COMPUTE) ||
967 (v->type == MESA_SHADER_KERNEL)) {
968 return compiler->max_const_compute - shared_consts_size;
969 } else if (safe_constlen) {
970 return compiler->max_const_safe - safe_shared_consts_size;
971 } else if (v->type == MESA_SHADER_FRAGMENT) {
972 return compiler->max_const_frag - shared_consts_size;
973 } else {
974 return compiler->max_const_geom - shared_consts_size_geom;
975 }
976 }
977
978 /* Given a variant, calculate the maximum constlen it can have.
979 */
980 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)981 ir3_max_const(const struct ir3_shader_variant *v)
982 {
983 return _ir3_max_const(v, v->key.safe_constlen);
984 }
985
986 uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
987 uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
988
989 /* Return true if a variant may need to be recompiled due to exceeding the
990 * maximum "safe" constlen.
991 */
992 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)993 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
994 {
995 return v->constlen > _ir3_max_const(v, true);
996 }
997
998 void *ir3_shader_assemble(struct ir3_shader_variant *v);
999 struct ir3_shader_variant *
1000 ir3_shader_create_variant(struct ir3_shader *shader,
1001 const struct ir3_shader_key *key,
1002 bool keep_ir);
1003 struct ir3_shader_variant *
1004 ir3_shader_get_variant(struct ir3_shader *shader,
1005 const struct ir3_shader_key *key, bool binning_pass,
1006 bool keep_ir, bool *created);
1007
1008 struct ir3_shader *
1009 ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
1010 const struct ir3_shader_options *options,
1011 struct ir3_stream_output_info *stream_output);
1012 uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
1013 const struct ir3_compiler *compiler);
1014 struct ir3_shader *
1015 ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
1016 void ir3_shader_destroy(struct ir3_shader *shader);
1017 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
1018 uint64_t ir3_shader_outputs(const struct ir3_shader *so);
1019
1020 int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
1021
1022 /*
1023 * Helper/util:
1024 */
1025
1026 /* clears shader-key flags which don't apply to the given shader.
1027 */
1028 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1029 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1030 {
1031 uint32_t *key_bits = (uint32_t *)key;
1032 uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1033 STATIC_ASSERT(sizeof(*key) % 4 == 0);
1034 for (int i = 0; i < sizeof(*key) >> 2; i++)
1035 key_bits[i] &= key_mask[i];
1036 }
1037
1038 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1039 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1040 {
1041 int j;
1042
1043 for (j = 0; j < so->outputs_count; j++)
1044 if (so->outputs[j].slot == slot)
1045 return j;
1046
1047 /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1048 * in the vertex shader.. but the fragment shader doesn't know this
1049 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So
1050 * at link time if there is no matching OUT.BCOLOR[n], we must map
1051 * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only
1052 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1053 */
1054 if (slot == VARYING_SLOT_BFC0) {
1055 slot = VARYING_SLOT_COL0;
1056 } else if (slot == VARYING_SLOT_BFC1) {
1057 slot = VARYING_SLOT_COL1;
1058 } else if (slot == VARYING_SLOT_COL0) {
1059 slot = VARYING_SLOT_BFC0;
1060 } else if (slot == VARYING_SLOT_COL1) {
1061 slot = VARYING_SLOT_BFC1;
1062 } else {
1063 return -1;
1064 }
1065
1066 for (j = 0; j < so->outputs_count; j++)
1067 if (so->outputs[j].slot == slot)
1068 return j;
1069
1070 return -1;
1071 }
1072
1073 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1074 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1075 {
1076 while (++i < so->inputs_count)
1077 if (so->inputs[i].compmask && so->inputs[i].bary)
1078 break;
1079 return i;
1080 }
1081
1082 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1083 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1084 {
1085 int j = -1;
1086
1087 while (true) {
1088 j = ir3_next_varying(so, j);
1089
1090 if (j >= so->inputs_count)
1091 return -1;
1092
1093 if (so->inputs[j].slot == slot)
1094 return j;
1095 }
1096 }
1097
1098 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1099 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1100 {
1101 int var = ir3_find_input(so, slot);
1102 return var == -1 ? 0xff : so->inputs[var].inloc;
1103 }
1104
/* Accumulated VS->FS (or last-geom-stage -> FS) linkage map, filled in by
 * ir3_link_shaders()/ir3_link_add().
 */
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      uint8_t loc;
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough
    * (0xff when unused — see ir3_link_shaders())
    */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough
    * (0xff when unused)
    */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff when unused) */
   uint8_t clip0_loc, clip1_loc;
};
1136
1137 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1138 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1139 uint8_t compmask, uint8_t loc)
1140 {
1141 for (int j = 0; j < util_last_bit(compmask); j++) {
1142 uint8_t comploc = loc + j;
1143 l->varmask[comploc / 32] |= 1 << (comploc % 32);
1144 }
1145
1146 l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1147
1148 if (regid_ != regid(63, 0)) {
1149 int i = l->cnt++;
1150 assert(i < ARRAY_SIZE(l->var));
1151
1152 l->var[i].slot = slot;
1153 l->var[i].regid = regid_;
1154 l->var[i].compmask = compmask;
1155 l->var[i].loc = loc;
1156 }
1157 }
1158
/* Build the linkage map between the last geometry stage ('vs') and the
 * fragment shader: walks each FS varying input, finds the matching VS
 * output, and records the pairing via ir3_link_add().  Also captures the
 * inloc of a few special FS inputs (gl_PrimitiveID, gl_ViewIndex,
 * clip/cull distances) for fixed-function setup.
 */
static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
                 const struct ir3_shader_variant *vs,
                 const struct ir3_shader_variant *fs, bool pack_vs_out)
{
   /* On older platforms, varmask isn't programmed at all, and it appears
    * that the hardware generates a mask of used VPC locations using the VS
    * output map, and hangs if a FS bary instruction references a location
    * not in the list. This means that we need to have a dummy entry in the
    * VS out map for things like gl_PointCoord which aren't written by the
    * VS. Furthermore we can't use r63.x, so just pick a random register to
    * use if there is no VS output.
    */
   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
   int j = -1, k;

   /* 0xff == special location not used: */
   l->primid_loc = 0xff;
   l->viewid_loc = 0xff;
   l->clip0_loc = 0xff;
   l->clip1_loc = 0xff;

   /* Iterate FS varying inputs while there is still room in l->var: */
   while (l->cnt < ARRAY_SIZE(l->var)) {
      j = ir3_next_varying(fs, j);

      if (j >= fs->inputs_count)
         break;

      /* skip inputs whose inloc falls outside the FS's consumed range: */
      if (fs->inputs[j].inloc >= fs->total_in)
         continue;

      k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);

      if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
         l->primid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
         /* gl_ViewIndex is never a real VS output: */
         assert(k < 0);
         l->viewid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
         l->clip0_loc = fs->inputs[j].inloc;

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
         l->clip1_loc = fs->inputs[j].inloc;

      /* no matching VS output -> use the dummy/default regid (see above): */
      ir3_link_add(l, fs->inputs[j].slot,
                   k >= 0 ? vs->outputs[k].regid : default_regid,
                   fs->inputs[j].compmask, fs->inputs[j].inloc);
   }
}
1211
1212 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1213 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1214 {
1215 int j;
1216 for (j = 0; j < so->outputs_count; j++)
1217 if (so->outputs[j].slot == slot) {
1218 uint32_t regid = so->outputs[j].regid;
1219 if (so->outputs[j].half)
1220 regid |= HALF_REG_ID;
1221 return regid;
1222 }
1223 return regid(63, 0);
1224 }
1225
1226 void print_raw(FILE *out, const BITSET_WORD *data, size_t size);
1227
1228 void ir3_link_stream_out(struct ir3_shader_linkage *l,
1229 const struct ir3_shader_variant *v);
1230
1231 #define VARYING_SLOT_GS_HEADER_IR3 (VARYING_SLOT_MAX + 0)
1232 #define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
1233 #define VARYING_SLOT_TCS_HEADER_IR3 (VARYING_SLOT_MAX + 2)
1234 #define VARYING_SLOT_REL_PATCH_ID_IR3 (VARYING_SLOT_MAX + 3)
1235
1236 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1237 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1238 {
1239 if (!so)
1240 return regid(63, 0);
1241 for (int j = 0; j < so->inputs_count; j++)
1242 if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1243 return so->inputs[j].regid;
1244 return regid(63, 0);
1245 }
1246
1247 /* calculate register footprint in terms of half-regs (ie. one full
1248 * reg counts as two half-regs).
1249 */
1250 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1251 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1252 {
1253 return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1254 }
1255
1256 static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant * v)1257 ir3_shader_nibo(const struct ir3_shader_variant *v)
1258 {
1259 return v->num_ibos;
1260 }
1261
1262 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1263 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1264 {
1265 /* Dummy shader */
1266 if (!v->compiler)
1267 return 0;
1268
1269 if (v->compiler->gen < 5)
1270 return v->branchstack;
1271
1272 return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1273 }
1274
1275 ENDC;
1276
1277 #endif /* IR3_SHADER_H_ */
1278