/*
 * Copyright © 2010 - 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef ELK_COMPILER_H
#define ELK_COMPILER_H

#include <stdio.h>
#include "c11/threads.h"
#include "dev/intel_device_info.h"
#include "isl/isl.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/enum_operators.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "elk_isa_info.h"
#include "../intel_shader_enums.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ra_regs;
struct nir_shader;
struct shader_info;

struct nir_shader_compiler_options;
typedef struct nir_shader nir_shader;

#define REG_CLASS_COUNT 20

struct elk_compiler {
   const struct intel_device_info *devinfo;

   /* This lock must be taken if the compiler is to be modified in any way,
    * including adding something to the ralloc child list.
    */
   mtx_t mutex;

   struct elk_isa_info isa;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used.
       */
      struct ra_class **classes;
   } vec4_reg_set;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      struct ra_class *classes[REG_CLASS_COUNT];

      /**
       * ra class for the aligned barycentrics we use for PLN, which doesn't
       * appear in *classes.
       */
      struct ra_class *aligned_bary_class;
   } fs_reg_sets[3];

   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

   bool scalar_stage[MESA_ALL_SHADER_STAGES];
   struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State
    * Base Address?  (If not, it's a normal GPU address.)
    */
   bool constant_buffer_0_is_relative;

   /**
    * Whether or not the driver supports NIR shader constants.  This controls
    * whether nir_opt_large_constants will be run.
    */
   bool supports_shader_constants;

   /**
    * Whether indirect UBO loads should use the sampler or go through the
    * data/constant cache.  For the sampler, UBO surface states have to be set
    * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the
    * constant or data cache, UBOs must use VK_FORMAT_RAW.
    */
   bool indirect_ubos_use_sampler;

   /**
    * Calling the ra_allocate function after each register spill can take
    * several minutes.  This option speeds up shader compilation by spilling
    * more registers after the ra_allocate failure.  Required for
    * Cyberpunk 2077, which uses a watchdog thread to terminate the process
    * in case the render thread hasn't responded within 2 minutes.
    */
   int spilling_rate;
};

#define elk_shader_debug_log(compiler, data, fmt, ...) do {      \
   static unsigned id = 0;                                       \
   compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__);    \
} while (0)

#define elk_shader_perf_log(compiler, data, fmt, ...) do {       \
   static unsigned id = 0;                                       \
   compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__);     \
} while (0)
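
/*
 * Illustrative sketch of how a driver reports through these hooks (the
 * "log_data" pointer and the message below are hypothetical, not part of
 * the API):
 *
 *    elk_shader_debug_log(compiler, log_data,
 *                         "VS SIMD8 shader: %u instructions.\n", inst_count);
 *
 * The static "id" gives each call site a stable identifier that the
 * driver's callback may use to de-duplicate repeated messages.
 */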

/**
 * We use a constant subgroup size of 32.  It really only needs to be a
 * maximum and, since we do SIMD32 for compute shaders in some cases, it
 * needs to be at least 32.  SIMD8 and SIMD16 shaders will still claim a
 * subgroup size of 32 but will act as if 16 or 24 of those channels are
 * disabled.
 */
#define ELK_SUBGROUP_SIZE 32

/**
 * Program key structures.
 *
 * When drawing, we look for the currently bound shaders in the program
 * cache.  This is essentially a hash table lookup, and these are the keys.
 *
 * Sometimes OpenGL features specified as state need to be simulated via
 * shader code, due to a mismatch between the API and the hardware.  This
 * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
 * in the program key so it's considered when searching for a program.  If
 * we haven't seen a particular combination before, we have to recompile a
 * new specialized version.
 *
 * Shader compilation should not look up state in gl_context directly, but
 * instead use the copy in the program key.  This guarantees recompiles will
 * happen correctly.
 *
 * @{
 */
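
/*
 * Minimal sketch of the lookup this enables (the cache object and the
 * "program_cache_search"/"recompile_with_key" helpers are hypothetical; the
 * key types are the real ones below).  Because every key is explicitly
 * padded, a byte-wise comparison is a valid equality test:
 *
 *    struct elk_vs_prog_key key;
 *    memset(&key, 0, sizeof(key));          // zero padding before filling
 *    key.base.program_string_id = prog_id;
 *    key.clamp_vertex_color = ctx_clamps_vertex_color;
 *    prog = program_cache_search(cache, &key, sizeof(key)); // memcmp inside
 *    if (!prog)
 *       prog = recompile_with_key(&key);
 */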

enum PACKED elk_gfx6_gather_sampler_wa {
   ELK_WA_SIGN = 1,   /* whether we need to sign extend */
   ELK_WA_8BIT = 2,   /* if we have an 8bit format needing wa */
   ELK_WA_16BIT = 4,  /* if we have a 16bit format needing wa */
};

#define ELK_MAX_SAMPLERS 32

/* Provide explicit padding for each member, to ensure that the compiler
 * initializes every bit in the shader cache keys.  The keys will be compared
 * with memcmp.
 */
PRAGMA_DIAGNOSTIC_PUSH
PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)

/**
 * Sampler information needed by VS, WM, and GS program cache keys.
 */
struct elk_sampler_prog_key_data {
   /**
    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
    *
    * This field is not consumed by the back-end compiler and is only relevant
    * for the crocus OpenGL driver for Broadwell and earlier hardware.
    */
   uint16_t swizzles[ELK_MAX_SAMPLERS];

   uint32_t gl_clamp_mask[3];

   /**
    * For RG32F, gather4's channel select is broken.
    */
   uint32_t gather_channel_quirk_mask;

   /**
    * For Sandybridge, which shader w/a we need for gather quirks.
    */
   enum elk_gfx6_gather_sampler_wa gfx6_gather_wa[ELK_MAX_SAMPLERS];
};

enum elk_robustness_flags {
   ELK_ROBUSTNESS_UBO  = BITFIELD_BIT(0),
   ELK_ROBUSTNESS_SSBO = BITFIELD_BIT(1),
};

struct elk_base_prog_key {
   unsigned program_string_id;

   enum elk_robustness_flags robust_flags:2;

   unsigned padding:22;

   /**
    * Apply workarounds for SIN and COS input range problems.
    * This limits input range for SIN and COS to [-2π : 2π] to
    * avoid precision issues.
    */
   bool limit_trig_input_range;

   struct elk_sampler_prog_key_data tex;
};

/**
 * The VF can't natively handle certain types of attributes, such as GL_FIXED
 * or most 10_10_10_2 types.  These flags enable various VS workarounds to
 * "fix" attributes at the beginning of shaders.
 */
#define ELK_ATTRIB_WA_COMPONENT_MASK  7  /* mask for GL_FIXED scale channel count */
#define ELK_ATTRIB_WA_NORMALIZE       8  /* normalize in shader */
#define ELK_ATTRIB_WA_BGRA           16  /* swap r/b channels in shader */
#define ELK_ATTRIB_WA_SIGN           32  /* interpret as signed in shader */
#define ELK_ATTRIB_WA_SCALE          64  /* interpret as scaled in shader */

/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes.  In Vulkan, we expose up to 28 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB   VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB   (VERT_ATTRIB_GENERIC0 + 28)

/**
 * Max number of binding table entries used for stream output.
 *
 * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
 * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
 *
 * On Gfx6, the size of transform feedback data is limited not by the number
 * of components but by the number of binding table entries we set aside.  We
 * use one binding table entry for a float, one entry for a vector, and one
 * entry per matrix column.  Since the only way we can communicate our
 * transform feedback capabilities to the client is via
 * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
 * worst case, in which all the varyings are floats, so we use up one binding
 * table entry per component.  Therefore we need to set aside at least 64
 * binding table entries for use by transform feedback.
 *
 * Note: since we don't currently pack varyings, it is currently impossible
 * for the client to actually use up all of these binding table entries--if
 * all of their varyings were floats, they would run out of varying slots and
 * fail to link.  But that's a bug, so it seems prudent to go ahead and
 * allocate the number of binding table entries we will need once the bug is
 * fixed.
 */
#define ELK_MAX_SOL_BINDINGS 64

/** The program key for Vertex Shaders. */
struct elk_vs_prog_key {
   struct elk_base_prog_key base;

   /**
    * Per-attribute workaround flags
    *
    * For each attribute, a combination of ELK_ATTRIB_WA_*.
    *
    * For OpenGL, where we expose a maximum of 16 user input attributes,
    * we only need up to VERT_ATTRIB_MAX slots.  In Vulkan, however,
    * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can
    * expose up to 28 user input vertex attributes that are mapped to slots
    * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large
    * enough to hold this many slots.
    */
   uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)];

   /**
    * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates
    * are going to be replaced with point coordinates (as a consequence of a
    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
    * our SF thread requires exact matching between VS outputs and FS inputs,
    * these texture coordinates will need to be unconditionally included in
    * the VUE, even if they aren't written by the vertex shader.
    */
   uint8_t point_coord_replace;
   unsigned clamp_pointsize:1;

   bool copy_edgeflag:1;

   bool clamp_vertex_color:1;

   /**
    * How many user clipping planes are being uploaded to the vertex shader as
    * push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;

   uint32_t padding:25;
};

/** The program key for Tessellation Control Shaders. */
struct elk_tcs_prog_key
{
   struct elk_base_prog_key base;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   enum tess_primitive_mode _tes_primitive_mode;

   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   bool quads_workaround;
   uint32_t padding:24;
};

#define ELK_MAX_TCS_INPUT_VERTICES (32)

static inline uint32_t
elk_tcs_prog_key_input_vertices(const struct elk_tcs_prog_key *key)
{
   return key->input_vertices != 0 ?
          key->input_vertices : ELK_MAX_TCS_INPUT_VERTICES;
}

/** The program key for Tessellation Evaluation Shaders. */
struct elk_tes_prog_key
{
   struct elk_base_prog_key base;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   /**
    * How many user clipping planes are being uploaded to the tessellation
    * evaluation shader as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
   unsigned clamp_pointsize:1;
   uint32_t padding:27;
};

/** The program key for Geometry Shaders. */
struct elk_gs_prog_key
{
   struct elk_base_prog_key base;

   /**
    * How many user clipping planes are being uploaded to the geometry shader
    * as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
   unsigned clamp_pointsize:1;
   unsigned padding:27;
};

enum elk_sf_primitive {
   ELK_SF_PRIM_POINTS = 0,
   ELK_SF_PRIM_LINES = 1,
   ELK_SF_PRIM_TRIANGLES = 2,
   ELK_SF_PRIM_UNFILLED_TRIS = 3,
};

struct elk_sf_prog_key {
   uint64_t attrs;
   bool contains_flat_varying;
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */
   uint8_t point_sprite_coord_replace;
   enum elk_sf_primitive primitive:2;
   bool do_twoside_color:1;
   bool frontface_ccw:1;
   bool do_point_sprite:1;
   bool do_point_coord:1;
   bool sprite_origin_lower_left:1;
   bool userclip_active:1;
   unsigned padding:32;
};

enum elk_clip_mode {
   ELK_CLIP_MODE_NORMAL = 0,
   ELK_CLIP_MODE_CLIP_ALL = 1,
   ELK_CLIP_MODE_CLIP_NON_REJECTED = 2,
   ELK_CLIP_MODE_REJECT_ALL = 3,
   ELK_CLIP_MODE_ACCEPT_ALL = 4,
   ELK_CLIP_MODE_KERNEL_CLIP = 5,
};

enum elk_clip_fill_mode {
   ELK_CLIP_FILL_MODE_LINE = 0,
   ELK_CLIP_FILL_MODE_POINT = 1,
   ELK_CLIP_FILL_MODE_FILL = 2,
   ELK_CLIP_FILL_MODE_CULL = 3,
};

/* Note that if unfilled primitives are being emitted, we have to fix
 * up polygon offset and flatshading at this point:
 */
struct elk_clip_prog_key {
   uint64_t attrs;
   float offset_factor;
   float offset_units;
   float offset_clamp;
   bool contains_flat_varying;
   bool contains_noperspective_varying;
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */
   unsigned primitive:4;
   unsigned nr_userclip:4;
   bool pv_first:1;
   bool do_unfilled:1;
   enum elk_clip_fill_mode fill_cw:2;  /* includes cull information */
   enum elk_clip_fill_mode fill_ccw:2; /* includes cull information */
   bool offset_cw:1;
   bool offset_ccw:1;
   bool copy_bfc_cw:1;
   bool copy_bfc_ccw:1;
   enum elk_clip_mode clip_mode:3;
   uint64_t padding:51;
};

/* A big lookup table is used to figure out which and how many
 * additional regs will be inserted before the main payload in the WM
 * program execution.  These mainly relate to depth and stencil
 * processing and the early-depth-test optimization.
 */
enum elk_wm_iz_bits {
   ELK_WM_IZ_PS_KILL_ALPHATEST_BIT    = 0x1,
   ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT    = 0x2,
   ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT   = 0x4,
   ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT    = 0x8,
   ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10,
   ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT  = 0x20,
   ELK_WM_IZ_BIT_MAX                  = 0x40
};

enum elk_sometimes {
   ELK_NEVER = 0,
   ELK_SOMETIMES,
   ELK_ALWAYS
};

static inline enum elk_sometimes
elk_sometimes_invert(enum elk_sometimes x)
{
   return (enum elk_sometimes)((int)ELK_ALWAYS - (int)x);
}
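
/*
 * Sanity check of the inversion above (worked through by hand, not a
 * normative table): ELK_ALWAYS - ELK_NEVER is 2 - 0 = ELK_ALWAYS,
 * 2 - 1 = ELK_SOMETIMES, and 2 - 2 = ELK_NEVER, i.e.
 *
 *    assert(elk_sometimes_invert(ELK_NEVER) == ELK_ALWAYS);
 *    assert(elk_sometimes_invert(ELK_SOMETIMES) == ELK_SOMETIMES);
 *    assert(elk_sometimes_invert(ELK_ALWAYS) == ELK_NEVER);
 */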

/** The program key for Fragment/Pixel Shaders. */
struct elk_wm_prog_key {
   struct elk_base_prog_key base;

   uint64_t input_slots_valid;
   float alpha_test_ref;
   uint8_t color_outputs_valid;

   /* Some collection of ELK_WM_IZ_* */
   uint8_t iz_lookup;
   bool stats_wm:1;
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool emit_alpha_test:1;
   enum compare_func alpha_test_func:3; /**< For Gfx4/5 MRT alpha test */
   bool alpha_test_replicate_alpha:1;
   enum elk_sometimes alpha_to_coverage:2;
   bool clamp_fragment_color:1;

   bool force_dual_color_blend:1;

   /** Whether or not inputs are interpolated at sample rate by default
    *
    * This corresponds to the sample shading API bit in Vulkan or OpenGL which
    * controls how inputs with no interpolation qualifier are interpolated.
    * This is distinct from the way that using gl_SampleID or similar requires
    * us to run per-sample.  Even when running per-sample due to gl_SampleID,
    * we may still interpolate unqualified inputs at the pixel center.
    */
   enum elk_sometimes persample_interp:2;

   /* Whether or not we are running on a multisampled framebuffer */
   enum elk_sometimes multisample_fbo:2;

   enum elk_sometimes line_aa:2;

   bool coherent_fb_fetch:1;
   bool ignore_sample_mask_out:1;

   uint64_t padding:56;
};

struct elk_cs_prog_key {
   struct elk_base_prog_key base;
};

struct elk_ff_gs_prog_key {
   uint64_t attrs;

   /**
    * Map from the index of a transform feedback binding table entry to the
    * gl_varying_slot that should be streamed out through that binding table
    * entry.
    */
   unsigned char transform_feedback_bindings[ELK_MAX_SOL_BINDINGS];

   /**
    * Map from the index of a transform feedback binding table entry to the
    * swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[ELK_MAX_SOL_BINDINGS];

   /**
    * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST.
    */
   unsigned primitive:8;

   unsigned pv_first:1;
   unsigned need_gs_prog:1;

   /**
    * Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-ELK_MAX_SOL_BINDINGS */
   uint64_t padding:47;
};

/* elk_any_prog_key is any of the keys that map to an API stage */
union elk_any_prog_key {
   struct elk_base_prog_key base;
   struct elk_vs_prog_key vs;
   struct elk_tcs_prog_key tcs;
   struct elk_tes_prog_key tes;
   struct elk_gs_prog_key gs;
   struct elk_wm_prog_key wm;
   struct elk_cs_prog_key cs;
};

PRAGMA_DIAGNOSTIC_POP

/** Max number of render targets in a shader */
#define ELK_MAX_DRAW_BUFFERS 8

/**
 * Binding table index for the first gfx6 SOL binding.
 */
#define ELK_GFX6_SOL_BINDING_START 0

struct elk_ubo_range
{
   uint16_t block;

   /* In units of 32-byte registers */
   uint8_t start;
   uint8_t length;
};

/* We reserve the first 2^16 values for builtins */
#define ELK_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

enum elk_param_builtin {
   ELK_PARAM_BUILTIN_ZERO,

   ELK_PARAM_BUILTIN_CLIP_PLANE_0_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_W,

   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   ELK_PARAM_BUILTIN_PATCH_VERTICES_IN,

   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X,
   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
   ELK_PARAM_BUILTIN_SUBGROUP_ID,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
   ELK_PARAM_BUILTIN_WORK_DIM,
};

#define ELK_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (ELK_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define ELK_PARAM_BUILTIN_IS_CLIP_PLANE(param)        \
   ((param) >= ELK_PARAM_BUILTIN_CLIP_PLANE_0_X &&    \
    (param) <= ELK_PARAM_BUILTIN_CLIP_PLANE_7_W)

#define ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - ELK_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

#define ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - ELK_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)
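
/*
 * Worked example of the clip-plane encoding above: each plane occupies four
 * consecutive enum values (X, Y, Z, W), so plane index and component pack
 * into the low bits and the macros round-trip:
 *
 *    uint32_t p = ELK_PARAM_BUILTIN_CLIP_PLANE(3, 2);
 *    // p == ELK_PARAM_BUILTIN_CLIP_PLANE_3_Z
 *    assert(ELK_PARAM_BUILTIN_IS_CLIP_PLANE(p));
 *    assert(ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(p) == 3);
 *    assert(ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(p) == 2);
 */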

enum elk_shader_reloc_id {
   ELK_SHADER_RELOC_CONST_DATA_ADDR_LOW,
   ELK_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
   ELK_SHADER_RELOC_SHADER_START_OFFSET,
   ELK_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
};

enum elk_shader_reloc_type {
   /** An arbitrary 32-bit value */
   ELK_SHADER_RELOC_TYPE_U32,
   /** A MOV instruction with an immediate source */
   ELK_SHADER_RELOC_TYPE_MOV_IMM,
};

/** Represents a code relocation
 *
 * Relocatable constants are immediates in the code which we want to be able
 * to replace post-compile with the actual value.
 */
struct elk_shader_reloc {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** Type of this relocation */
   enum elk_shader_reloc_type type;

   /** The offset in the shader to the relocated value
    *
    * For MOV_IMM relocs, this is an offset to the MOV instruction.  This
    * allows us to do some sanity checking while we update the value.
    */
   uint32_t offset;

   /** Value to be added to the relocated value before it is written */
   uint32_t delta;
};

/** A value to write to a relocation */
struct elk_shader_reloc_value {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** The value with which to replace the relocated immediate */
   uint32_t value;
};

struct elk_stage_prog_data {
   struct elk_ubo_range ubo_ranges[4];

   unsigned nr_params; /**< number of float params/constants */

   gl_shader_stage stage;

   /* zero_push_reg is a bitfield which indicates what push registers (if any)
    * should be zeroed by SW at the start of the shader.  The corresponding
    * push_reg_mask_param specifies the param index (in 32-bit units) where
    * the actual runtime 64-bit mask will be pushed.  The shader will zero
    * push reg i if
    *
    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
    *
    * If this field is set, elk_compiler::compact_params must be false.
    */
   uint64_t zero_push_reg;
   unsigned push_reg_mask_param;

   unsigned curb_read_length;
   unsigned total_scratch;
   unsigned total_shared;

   unsigned program_size;

   unsigned const_data_size;
   unsigned const_data_offset;

   unsigned num_relocs;
   const struct elk_shader_reloc *relocs;

   /** Does this program pull from any UBO or other constant buffers? */
   bool has_ubo_pull;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */

   /* 32-bit identifiers for all push/pull parameters.  These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them.  The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the elk_param_builtin enum defined
    * above.
    */
   uint32_t *param;

   /* Whether shader uses atomic operations. */
   bool uses_atomic_load_store;
};

static inline uint32_t *
elk_stage_prog_data_add_params(struct elk_stage_prog_data *prog_data,
                               unsigned nr_new_params)
{
   unsigned old_nr_params = prog_data->nr_params;
   prog_data->nr_params += nr_new_params;
   prog_data->param = reralloc(ralloc_parent(prog_data->param),
                               prog_data->param, uint32_t,
                               prog_data->nr_params);
   return prog_data->param + old_nr_params;
}
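
/*
 * Example use (a sketch; the two builtin IDs are the real ones from the
 * enum above, the surrounding context is not): growing the param array and
 * tagging the new slots so the driver pushes workgroup-size data at bind
 * time:
 *
 *    uint32_t *param = elk_stage_prog_data_add_params(prog_data, 2);
 *    param[0] = ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
 *    param[1] = ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Y;
 *
 * Note the returned pointer addresses only the newly added tail; earlier
 * entries may have moved because reralloc can reallocate the array.
 */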

enum elk_barycentric_mode {
   ELK_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   ELK_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   ELK_BARYCENTRIC_MODE_COUNT              = 6
};
#define ELK_BARYCENTRIC_PERSPECTIVE_BITS \
   ((1 << ELK_BARYCENTRIC_PERSPECTIVE_PIXEL) | \
    (1 << ELK_BARYCENTRIC_PERSPECTIVE_CENTROID) | \
    (1 << ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE))
#define ELK_BARYCENTRIC_NONPERSPECTIVE_BITS \
   ((1 << ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
    (1 << ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
    (1 << ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))

enum elk_pixel_shader_computed_depth_mode {
   ELK_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   ELK_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   ELK_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   ELK_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};

/* Data about a particular attempt to compile a program.  Note that
 * there can be many of these, each in a different GL state
 * corresponding to a different elk_wm_prog_key struct, with different
 * compiled programs.
 */
struct elk_wm_prog_data {
   struct elk_stage_prog_data base;

   unsigned num_per_primitive_inputs;
   unsigned num_varying_inputs;

   uint8_t reg_blocks_8;
   uint8_t reg_blocks_16;
   uint8_t reg_blocks_32;

   uint8_t dispatch_grf_start_reg_16;
   uint8_t dispatch_grf_start_reg_32;
   uint32_t prog_offset_16;
   uint32_t prog_offset_32;

   struct {
      /** @{
       * surface indices for the WM-specific surfaces
       */
      uint32_t render_target_read_start;
      /** @} */
   } binding_table;

   uint8_t color_outputs_written;
   uint8_t computed_depth_mode;

   bool computed_stencil;
   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   bool dispatch_8;
   bool dispatch_16;
   bool dispatch_32;
   bool dual_src_blend;
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_sample_mask;
   bool uses_vmask;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /** True if the shader wants sample shading
    *
    * This corresponds to whether or not gl_SampleID, gl_SamplePosition, or
    * a sample-qualified input is used in the shader.  It is independent of
    * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan.
    */
   bool sample_shading;

   /** Should this shader be dispatched per-sample */
   enum elk_sometimes persample_dispatch;

   /**
    * Shader writes the SampleMask and this is AND-ed with the API's
    * SampleMask to generate a new coverage mask.
    */
   enum elk_sometimes alpha_to_coverage;

   unsigned msaa_flags_param;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Those interpolations are delivered as part of the thread payload.  Used
    * in hardware setup on gfx6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Whether nonperspective interpolation modes are used by the
    * barycentric_interp_modes or fragment shader through interpolator
    * messages.
    */
   bool uses_nonperspective_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source.  This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /**
    * The FS inputs
    */
   uint64_t inputs;

   /* Mapping of VUE slots to interpolation modes.
    * Used by the Gfx4-5 clip/sf/wm stages.
    */
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];
   int urb_setup_channel[VARYING_SLOT_MAX];

   /**
    * Cache into the urb_setup array above: the attribute numbers of the
    * active varyings, gathered from urb_setup.  The actual count is stored
    * in urb_setup_attribs_count.
    */
   uint8_t urb_setup_attribs[VARYING_SLOT_MAX];
   uint8_t urb_setup_attribs_count;
};

#ifdef GFX_VERx10

/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled.  This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD
 * width of 8, 16, or 32 is returned.  If the KSP is invalid, 0 is returned.
 */
static inline unsigned
elk_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
                          bool simd16_enabled, bool simd32_enabled)
{
   /* This function strictly ignores contiguous dispatch */
   switch (ksp_idx) {
   case 0:
      return simd8_enabled ? 8 :
             (simd16_enabled && !simd32_enabled) ? 16 :
             (simd32_enabled && !simd16_enabled) ? 32 : 0;
   case 1:
      return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0;
   case 2:
      return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0;
   default:
      unreachable("Invalid KSP index");
   }
}
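
/*
 * Example reading of the mapping above (derived from the function itself,
 * not quoted from the PRM table): with SIMD8 and SIMD16 enabled, KSP 0
 * holds the SIMD8 kernel and KSP 2 the SIMD16 kernel:
 *
 *    elk_fs_simd_width_for_ksp(0, true, true, false) == 8
 *    elk_fs_simd_width_for_ksp(1, true, true, false) == 0   // unused slot
 *    elk_fs_simd_width_for_ksp(2, true, true, false) == 16
 *
 * With only SIMD16 enabled, the single kernel moves to KSP 0:
 *
 *    elk_fs_simd_width_for_ksp(0, false, true, false) == 16
 */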

#define elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   elk_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
                             (wm_state)._16PixelDispatchEnable, \
                             (wm_state)._32PixelDispatchEnable)

#endif

#define elk_wm_state_has_ksp(wm_state, ksp_idx) \
   (elk_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)

static inline uint32_t
_elk_wm_prog_data_prog_offset(const struct elk_wm_prog_data *prog_data,
                              unsigned simd_width)
{
   switch (simd_width) {
   case 8: return 0;
   case 16: return prog_data->prog_offset_16;
   case 32: return prog_data->prog_offset_32;
   default: return 0;
   }
}

#define elk_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_prog_offset(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_elk_wm_prog_data_dispatch_grf_start_reg(const struct elk_wm_prog_data *prog_data,
                                         unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->base.dispatch_grf_start_reg;
   case 16: return prog_data->dispatch_grf_start_reg_16;
   case 32: return prog_data->dispatch_grf_start_reg_32;
   default: return 0;
   }
}

#define elk_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_dispatch_grf_start_reg(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_elk_wm_prog_data_reg_blocks(const struct elk_wm_prog_data *prog_data,
                             unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->reg_blocks_8;
   case 16: return prog_data->reg_blocks_16;
   case 32: return prog_data->reg_blocks_32;
   default: return 0;
   }
}

#define elk_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_reg_blocks(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline bool
elk_wm_prog_data_is_persample(const struct elk_wm_prog_data *prog_data,
                              enum intel_msaa_flags pushed_msaa_flags)
{
   if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) {
      if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO))
         return false;

      if (prog_data->sample_shading)
         assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);

      if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)
         assert(prog_data->persample_dispatch != ELK_NEVER);
      else
         assert(prog_data->persample_dispatch != ELK_ALWAYS);

      return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0;
   }

   assert(prog_data->persample_dispatch == ELK_ALWAYS ||
          prog_data->persample_dispatch == ELK_NEVER);

   return prog_data->persample_dispatch;
}

static inline uint32_t
elk_wm_prog_data_barycentric_modes(const struct elk_wm_prog_data *prog_data,
                                   enum intel_msaa_flags pushed_msaa_flags)
{
   uint32_t modes = prog_data->barycentric_interp_modes;

   /* In the non-dynamic case, we can just return the modes computed at
    * compile time.
    */
   if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC))
      return modes;

   if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) {
      assert(prog_data->persample_dispatch == ELK_ALWAYS ||
             (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH));

      /* Making dynamic per-sample interpolation work is a bit tricky.  The
       * hardware will hang if SAMPLE is requested but per-sample dispatch is
       * not enabled.  This means we can't preemptively add SAMPLE to the
       * barycentrics bitfield.  Instead, we have to add it late and only
       * on-demand.  Annoyingly, changing the number of barycentrics requested
       * changes the whole PS shader payload so we very much don't want to do
       * that.  Instead, if the dynamic per-sample interpolation flag is set,
       * we check to see if SAMPLE was requested and, if not, replace the
       * highest barycentric bit in the [non]perspective grouping (CENTROID,
       * if it exists, else PIXEL) with SAMPLE.  The shader will stomp all the
       * barycentrics in the shader with SAMPLE so it really doesn't matter
       * which one we replace.  The important thing is that we keep the number
       * of barycentrics in each [non]perspective grouping the same.
       */
      if ((modes & ELK_BARYCENTRIC_PERSPECTIVE_BITS) &&
          !(modes & BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE))) {
         int sample_mode =
            util_last_bit(modes & ELK_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
         assert(modes & BITFIELD_BIT(sample_mode));

         modes &= ~BITFIELD_BIT(sample_mode);
         modes |= BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
      }

      if ((modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) &&
          !(modes & BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) {
         int sample_mode =
            util_last_bit(modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
         assert(modes & BITFIELD_BIT(sample_mode));

         modes &= ~BITFIELD_BIT(sample_mode);
         modes |= BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE);
      }
   } else {
      /* If we're not using per-sample interpolation, we need to disable the
       * per-sample bits.
       *
       * SKL PRMs, Volume 2a: Command Reference: Instructions,
       * 3DSTATE_WM:Barycentric Interpolation Mode:
       *
       *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
       *     Sample or Non-perspective Sample barycentric coordinates."
       */
      modes &= ~(BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
                 BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
   }

   return modes;
}

struct elk_push_const_block {
   unsigned dwords; /* Dword count, not reg aligned */
   unsigned regs;
   unsigned size;   /* Bytes, register aligned */
};

struct elk_cs_prog_data {
   struct elk_stage_prog_data base;

   unsigned local_size[3];

   /* Program offsets for the 8/16/32 SIMD variants.  Multiple variants are
    * kept when using variable group size, and the right one can only be
    * decided at dispatch time.
    */
   unsigned prog_offset[3];

   /* Bitmask indicating which program offsets are valid. */
   unsigned prog_mask;

   /* Bitmask indicating which programs have spilled. */
   unsigned prog_spilled;

   bool uses_barrier;
   bool uses_num_work_groups;

   struct {
      struct elk_push_const_block cross_thread;
      struct elk_push_const_block per_thread;
   } push;

   struct {
      /** @{
       * surface indices for the CS-specific surfaces
       */
      uint32_t work_groups_start;
      /** @} */
   } binding_table;
};

static inline uint32_t
elk_cs_prog_data_prog_offset(const struct elk_cs_prog_data *prog_data,
                             unsigned dispatch_width)
{
   assert(dispatch_width == 8 ||
          dispatch_width == 16 ||
          dispatch_width == 32);
   const unsigned index = dispatch_width / 16;
   assert(prog_data->prog_mask & (1 << index));
   return prog_data->prog_offset[index];
}
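
/*
 * The index computation above relies on integer division: 8/16 == 0,
 * 16/16 == 1 and 32/16 == 2, so the three SIMD variants land in slots 0, 1
 * and 2 of prog_offset[].  For example (sketch):
 *
 *    uint32_t offset = elk_cs_prog_data_prog_offset(cs_prog_data, 32);
 *    // reads cs_prog_data->prog_offset[2], asserting bit 2 of prog_mask
 */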

struct elk_ff_gs_prog_data {
   unsigned urb_read_length;
   unsigned total_grf;

   /**
    * Gfx6 transform feedback: Amount by which the streaming vertex buffer
    * indices should be incremented each time the GS is invoked.
    */
   unsigned svbi_postincrement_value;
};

/**
 * Enum representing the i965-specific vertex results that don't correspond
 * exactly to any element of gl_varying_slot.  The values of this enum are
 * assigned such that they don't conflict with gl_varying_slot.
 */
typedef enum
{
   ELK_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
   ELK_VARYING_SLOT_PAD,
   /**
    * Technically this is not a varying but just a placeholder that
    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
    * builtin variable to be compiled correctly.  See compile_sf_prog() for
    * more info.
    */
   ELK_VARYING_SLOT_PNTC,
   ELK_VARYING_SLOT_COUNT
} elk_varying_slot;

/**
 * We always program SF to start reading at an offset of 1 (2 varying slots)
 * from the start of the vertex URB entry.  This causes it to skip:
 * - VARYING_SLOT_PSIZ and ELK_VARYING_SLOT_NDC on gfx4-5
 * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+
 */
#define ELK_SF_URB_ENTRY_READ_OFFSET 1

/**
 * Bitmask indicating which fragment shader inputs represent varyings (and
 * hence have to be delivered to the fragment shader by the SF/SBE stage).
 */
#define ELK_FS_VARYING_INPUT_MASK \
   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)

void elk_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
                       gl_shader_stage stage);

/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */
static inline unsigned elk_vue_slot_to_offset(unsigned slot)
{
   return 16*slot;
}

/**
 * Convert a vertex output (elk_varying_slot) into a byte offset within the
 * VUE.
 */
static inline unsigned
elk_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
{
   return elk_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
}

void elk_compute_vue_map(const struct intel_device_info *devinfo,
                         struct intel_vue_map *vue_map,
                         uint64_t slots_valid,
                         bool separate_shader,
                         uint32_t pos_slots);

void elk_compute_tess_vue_map(struct intel_vue_map *const vue_map,
                              uint64_t slots_valid,
                              uint32_t is_patch);

/* elk_interpolation_map.c */
void elk_setup_vue_interpolation(const struct intel_vue_map *vue_map,
                                 struct nir_shader *nir,
                                 struct elk_wm_prog_data *prog_data);

struct elk_vue_prog_data {
   struct elk_stage_prog_data base;
   struct intel_vue_map vue_map;

   /** Should the hardware deliver input VUE handles for URB pull loads? */
   bool include_vue_handles;

   unsigned urb_read_length;
   unsigned total_grf;

   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;

   /* Used for calculating urb partitions.  In the VS, this is the size of the
    * URB entry used for both input and output to the thread.  In the GS, this
    * is the size of the URB entry used for output.
    */
   unsigned urb_entry_size;

   enum intel_shader_dispatch_mode dispatch_mode;
};

struct elk_vs_prog_data {
   struct elk_vue_prog_data base;

   uint64_t inputs_read;
   uint64_t double_inputs_read;

   unsigned nr_attribute_slots;

   bool uses_vertexid;
   bool uses_instanceid;
   bool uses_is_indexed_draw;
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
};

struct elk_tcs_prog_data
{
   struct elk_vue_prog_data base;

   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

   /** Number of vertices in the output patch */
   int instances;

   /** Track patch count threshold */
   int patch_count_threshold;
};


struct elk_tes_prog_data
{
   struct elk_vue_prog_data base;

   enum intel_tess_partitioning partitioning;
   enum intel_tess_output_topology output_topology;
   enum intel_tess_domain domain;
   bool include_primitive_id;
};

struct elk_gs_prog_data
{
   struct elk_vue_prog_data base;

   unsigned vertices_in;

   /**
    * Size of an output vertex, measured in HWORDS (32 bytes).
    */
   unsigned output_vertex_size_hwords;

   unsigned output_topology;

   /**
    * Size of the control data (cut bits or StreamID bits), in hwords (32
    * bytes).  0 if there is no control data.
    */
   unsigned control_data_header_size_hwords;

   /**
    * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
    * if the control data is StreamID bits, or
    * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
    * Ignored if control_data_header_size is 0.
    */
   unsigned control_data_format;

   bool include_primitive_id;

   /**
    * The number of vertices emitted, if constant - otherwise -1.
    */
   int static_vertex_count;

   int invocations;

   /**
    * Gfx6: Provoking vertex convention for odd-numbered triangles
    * in tristrips.
    */
   unsigned pv_first:1;

   /**
    * Gfx6: Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-ELK_MAX_SOL_BINDINGS */

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to
    * the gl_varying_slot that should be streamed out through that binding
    * table entry.
    */
   unsigned char transform_feedback_bindings[64 /* ELK_MAX_SOL_BINDINGS */];

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to
    * the swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[64 /* ELK_MAX_SOL_BINDINGS */];
};

struct elk_sf_prog_data {
   uint32_t urb_read_length;
   uint32_t total_grf;

   /* Each vertex may have up to 12 attributes, 4 components each,
    * except WPOS which requires only 2.  (11*4 + 2) == 46 ==> 11.5
    * rows.
    *
    * Actually we use 4 for each, so call it 12 rows.
    */
   unsigned urb_entry_size;
};

struct elk_clip_prog_data {
   uint32_t curb_read_length; /* user planes? */
   uint32_t clip_mode;
   uint32_t urb_read_length;
   uint32_t total_grf;
};

/* elk_any_prog_data is prog_data for any stage that maps to an API stage */
union elk_any_prog_data {
   struct elk_stage_prog_data base;
   struct elk_vue_prog_data vue;
   struct elk_vs_prog_data vs;
   struct elk_tcs_prog_data tcs;
   struct elk_tes_prog_data tes;
   struct elk_gs_prog_data gs;
   struct elk_wm_prog_data wm;
   struct elk_cs_prog_data cs;
};

#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK)                               \
   static inline struct elk_##STAGE##_prog_data *                             \
   elk_##STAGE##_prog_data(struct elk_stage_prog_data *prog_data)             \
   {                                                                          \
      if (prog_data)                                                          \
         assert(CHECK);                                                       \
      return (struct elk_##STAGE##_prog_data *) prog_data;                    \
   }                                                                          \
   static inline const struct elk_##STAGE##_prog_data *                       \
   elk_##STAGE##_prog_data_const(const struct elk_stage_prog_data *prog_data) \
   {                                                                          \
      if (prog_data)                                                          \
         assert(CHECK);                                                       \
      return (const struct elk_##STAGE##_prog_data *) prog_data;              \
   }

DEFINE_PROG_DATA_DOWNCAST(vs,  prog_data->stage == MESA_SHADER_VERTEX)
DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL)
DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL)
DEFINE_PROG_DATA_DOWNCAST(gs,  prog_data->stage == MESA_SHADER_GEOMETRY)
DEFINE_PROG_DATA_DOWNCAST(wm,  prog_data->stage == MESA_SHADER_FRAGMENT)
DEFINE_PROG_DATA_DOWNCAST(cs,  gl_shader_stage_uses_workgroup(prog_data->stage))

DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX ||
                               prog_data->stage == MESA_SHADER_TESS_CTRL ||
                               prog_data->stage == MESA_SHADER_TESS_EVAL ||
                               prog_data->stage == MESA_SHADER_GEOMETRY)

/* These are not really elk_stage_prog_data. */
DEFINE_PROG_DATA_DOWNCAST(ff_gs, true)
DEFINE_PROG_DATA_DOWNCAST(clip,  true)
DEFINE_PROG_DATA_DOWNCAST(sf,    true)
#undef DEFINE_PROG_DATA_DOWNCAST
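
/*
 * Usage sketch for the downcast helpers generated above (the incoming
 * "prog_data" pointer is hypothetical): the cast is checked via the stage
 * assertion in debug builds and is a plain pointer cast in release builds.
 *
 *    const struct elk_wm_prog_data *wm_prog_data =
 *       elk_wm_prog_data_const(prog_data);   // asserts stage == FRAGMENT
 *    if (wm_prog_data->dispatch_16)
 *       offset = wm_prog_data->prog_offset_16;
 */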

struct elk_compile_stats {
   uint32_t dispatch_width; /**< 0 for vec4 */
   uint32_t max_polygons;
   uint32_t max_dispatch_width;
   uint32_t instructions;
   uint32_t sends;
   uint32_t loops;
   uint32_t cycles;
   uint32_t spills;
   uint32_t fills;
   uint32_t max_live_registers;
};

/** @} */

struct elk_compiler *
elk_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo);

/**
 * Returns a compiler configuration for use with disk shader cache
 *
 * This value only needs to change for settings that can cause different
 * program generation between two runs on the same hardware.
 *
 * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
 */
uint64_t
elk_get_compiler_config_value(const struct elk_compiler *compiler);

unsigned
elk_prog_data_size(gl_shader_stage stage);

unsigned
elk_prog_key_size(gl_shader_stage stage);

struct elk_compile_params {
   void *mem_ctx;

   nir_shader *nir;

   struct elk_compile_stats *stats;

   void *log_data;

   char *error_str;

   uint64_t debug_flag;

   uint32_t source_hash;
};

/**
 * Parameters for compiling a vertex shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_vs_params {
   struct elk_compile_params base;

   const struct elk_vs_prog_key *key;
   struct elk_vs_prog_data *prog_data;

   bool edgeflag_is_last; /* true for gallium */
};

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_vs(const struct elk_compiler *compiler,
               struct elk_compile_vs_params *params);
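
/*
 * End-to-end sketch of a compile call (driver-side; "mem_ctx", "key",
 * "prog_data", the NIR shader, and the error-reporting helper are assumed
 * to exist and are not part of this header):
 *
 *    struct elk_compile_vs_params params = {
 *       .base = {
 *          .mem_ctx = mem_ctx,   // ralloc context that owns the result
 *          .nir = nir,           // already lowered for this back-end
 *       },
 *       .key = &key,
 *       .prog_data = &prog_data,
 *    };
 *    const unsigned *assembly = elk_compile_vs(compiler, &params);
 *    if (!assembly)
 *       report_error(params.base.error_str);   // hypothetical error path
 */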

/**
 * Parameters for compiling a tessellation control shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_tcs_params {
   struct elk_compile_params base;

   const struct elk_tcs_prog_key *key;
   struct elk_tcs_prog_data *prog_data;
};

/**
 * Compile a tessellation control shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_tcs(const struct elk_compiler *compiler,
                struct elk_compile_tcs_params *params);

/**
 * Parameters for compiling a tessellation evaluation shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_tes_params {
   struct elk_compile_params base;

   const struct elk_tes_prog_key *key;
   struct elk_tes_prog_data *prog_data;
   const struct intel_vue_map *input_vue_map;
};

/**
 * Compile a tessellation evaluation shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_tes(const struct elk_compiler *compiler,
                struct elk_compile_tes_params *params);

/**
 * Parameters for compiling a geometry shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_gs_params {
   struct elk_compile_params base;

   const struct elk_gs_prog_key *key;
   struct elk_gs_prog_data *prog_data;
};

/**
 * Compile a geometry shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_gs(const struct elk_compiler *compiler,
               struct elk_compile_gs_params *params);

/**
 * Compile a strips and fans shader.
 *
 * This is a fixed-function shader determined entirely by the shader key and
 * a VUE map.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
elk_compile_sf(const struct elk_compiler *compiler,
               void *mem_ctx,
               const struct elk_sf_prog_key *key,
               struct elk_sf_prog_data *prog_data,
               struct intel_vue_map *vue_map,
               unsigned *final_assembly_size);

/**
 * Compile a clipper shader.
 *
 * This is a fixed-function shader determined entirely by the shader key and
 * a VUE map.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
elk_compile_clip(const struct elk_compiler *compiler,
                 void *mem_ctx,
                 const struct elk_clip_prog_key *key,
                 struct elk_clip_prog_data *prog_data,
                 struct intel_vue_map *vue_map,
                 unsigned *final_assembly_size);

/**
 * Parameters for compiling a fragment shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_fs_params {
   struct elk_compile_params base;

   const struct elk_wm_prog_key *key;
   struct elk_wm_prog_data *prog_data;

   const struct intel_vue_map *vue_map;
   const struct elk_mue_map *mue_map;

   bool allow_spilling;
   bool use_rep_send;
   uint8_t max_polygons;
};

/**
 * Compile a fragment shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_fs(const struct elk_compiler *compiler,
               struct elk_compile_fs_params *params);

/**
 * Parameters for compiling a compute shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_cs_params {
   struct elk_compile_params base;

   const struct elk_cs_prog_key *key;
   struct elk_cs_prog_data *prog_data;
};

/**
 * Compile a compute shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_cs(const struct elk_compiler *compiler,
               struct elk_compile_cs_params *params);

/**
 * Compile a fixed function geometry shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
elk_compile_ff_gs_prog(struct elk_compiler *compiler,
                       void *mem_ctx,
                       const struct elk_ff_gs_prog_key *key,
                       struct elk_ff_gs_prog_data *prog_data,
                       struct intel_vue_map *vue_map,
                       unsigned *final_assembly_size);

void elk_debug_key_recompile(const struct elk_compiler *c, void *log,
                             gl_shader_stage stage,
                             const struct elk_base_prog_key *old_key,
                             const struct elk_base_prog_key *key);

unsigned
elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
                             unsigned threads);

void
elk_write_shader_relocs(const struct elk_isa_info *isa,
                        void *program,
                        const struct elk_stage_prog_data *prog_data,
                        struct elk_shader_reloc_value *values,
                        unsigned num_values);
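
/*
 * Sketch of how the relocations defined earlier are resolved once the final
 * GPU address is known (the "addr" value and "mapped_kernel" pointer are
 * hypothetical; the IDs come from enum elk_shader_reloc_id above):
 *
 *    struct elk_shader_reloc_value values[] = {
 *       { .id = ELK_SHADER_RELOC_CONST_DATA_ADDR_LOW,  .value = addr & 0xffffffff },
 *       { .id = ELK_SHADER_RELOC_CONST_DATA_ADDR_HIGH, .value = addr >> 32 },
 *    };
 *    elk_write_shader_relocs(&compiler->isa, mapped_kernel, prog_data,
 *                            values, ARRAY_SIZE(values));
 *
 * Each entry patches every reloc in prog_data->relocs with a matching id,
 * adding that reloc's "delta" to the value before writing it.
 */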

/**
 * Get the dispatch information for a shader to be used with GPGPU_WALKER and
 * similar instructions.
 *
 * If override_local_size is not NULL, it must point to a 3-element array
 * that will override the value from prog_data->local_size.  This is used by
 * ARB_compute_variable_group_size, where the size is set only at dispatch
 * time (so prog_data is outdated).
 */
struct intel_cs_dispatch_info
elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct elk_cs_prog_data *prog_data,
                         const unsigned *override_local_size);

/**
 * Return true if the given shader stage is dispatched contiguously by the
 * relevant fixed function starting from channel 0 of the SIMD thread, which
 * implies that the dispatch mask of a thread can be assumed to have the form
 * '2^n - 1' for some n.
 */
static inline bool
elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
                              gl_shader_stage stage,
                              const struct elk_stage_prog_data *prog_data)
{
   /* The code below makes assumptions about the hardware's thread dispatch
    * behavior that could be proven wrong in future generations -- Make sure
    * to do a full test run with elk_fs_test_dispatch_packing() hooked up to
    * the NIR front-end before changing this assertion.
    */
   assert(devinfo->ver <= 8);

   switch (stage) {
   case MESA_SHADER_FRAGMENT: {
      /* The PSD discards subspans coming in with no lit samples, which in the
       * per-pixel shading case implies that each subspan will either be fully
       * lit (due to the VMask being used to allow derivative computations),
       * or not dispatched at all.  In per-sample dispatch mode individual
       * samples from the same subspan have a fixed relative location within
       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
       * general and we should return false.
       */
      const struct elk_wm_prog_data *wm_prog_data =
         (const struct elk_wm_prog_data *)prog_data;
      return !wm_prog_data->persample_dispatch &&
             wm_prog_data->uses_vmask;
   }
   case MESA_SHADER_COMPUTE:
      /* Compute shaders will be spawned with either a fully enabled dispatch
       * mask or with whatever bottom/right execution mask was given to the
       * GPGPU walker command to be used along the workgroup edges -- In both
       * cases the dispatch mask is required to be tightly packed for our
       * invocation index calculations to work.
       */
      return true;
   default:
      /* Most remaining fixed functions are limited to use a packed dispatch
       * mask due to the hardware representation of the dispatch mask as a
       * single counter representing the number of enabled channels.
       */
      return true;
   }
}

/**
 * Computes the first varying slot in the URB produced by the previous stage
 * that is used in the next stage.  We do this by testing the varying slots in
 * the previous stage's vue map against the inputs read in the next stage.
 *
 * Note that:
 *
 * - Each URB offset contains two varying slots and we can only skip a
 *   full offset if both slots are unused, so the value we return here is
 *   always rounded down to the closest multiple of two.
 *
 * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they
 *   are part of the vue header, so if these are read we can't skip anything.
 */
static inline int
elk_compute_first_urb_slot_required(uint64_t inputs_read,
                                    const struct intel_vue_map *prev_stage_vue_map)
{
   if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT |
                       VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) {
      for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
         int varying = prev_stage_vue_map->slot_to_varying[i];
         if (varying != ELK_VARYING_SLOT_PAD && varying > 0 &&
             (inputs_read & BITFIELD64_BIT(varying)) != 0)
            return ROUND_DOWN_TO(i, 2);
      }
   }

   return 0;
}
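
/*
 * Worked example (the slot layout is invented for illustration): if the
 * previous stage's VUE map is { POS, PSIZ, VAR0, VAR1 } and the consumer
 * reads only VAR1 in slot 3, the first used slot is 3, which rounds down
 * to 2 -- one full URB offset (slots 0-1) can be skipped:
 *
 *    int first = elk_compute_first_urb_slot_required(VARYING_BIT_VAR(1),
 *                                                    &prev_vue_map);
 *    // first == 2 for the layout above
 */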

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* ELK_COMPILER_H */