xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_shader_info.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pipe.h"
8 #include "si_shader_internal.h"
9 #include "util/mesa-sha1.h"
10 #include "sid.h"
11 #include "nir.h"
12 #include "aco_interface.h"
13 
14 struct si_shader_profile si_shader_profiles[] =
15 {
16    {
17       /* Plot3D */
18       {0x38c94662, 0x7b634109, 0x50f8254a, 0x0f4986a9, 0x11e59716, 0x3081e1a2, 0xbb2a0c59, 0xc29e853a},
19       SI_PROFILE_VS_NO_BINNING,
20    },
21    {
22       /* Viewperf/Energy */
23       {0x3279654e, 0xf51c358d, 0xc526e175, 0xd198eb26, 0x75c36c86, 0xd796398b, 0xc99b5e92, 0xddc31503},
24       SI_PROFILE_NO_OPT_UNIFORM_VARYINGS,    /* Uniform propagation regresses performance. */
25    },
26    {
27       /* Viewperf/Medical */
28       {0x4a041ad8, 0xe105a058, 0x2e9f7a38, 0xef4d1c2f, 0xb8aee798, 0x821f166b, 0x17b42668, 0xa4d1cc0a},
29       SI_PROFILE_GFX9_GFX10_PS_NO_BINNING,
30    },
31    {
32       /* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32,
33        * probably due to interpolation performance.
34        */
35       {0xa9c7e2c2, 0x3e01de01, 0x886cab63, 0x24327678, 0xe247c394, 0x2ecc4bf9, 0xc196d978, 0x2ba7a89c},
36       SI_PROFILE_GFX10_WAVE64,
37    },
38    {
39       /* Viewperf/Creo */
40       {0x182bd6b3, 0x5e8fba11, 0xa7b74071, 0xc69f6153, 0xc57aef8c, 0x9076492a, 0x53dc83ee, 0x921fb114},
41       SI_PROFILE_CLAMP_DIV_BY_ZERO,
42    },
43 };
44 
/* Return the number of entries in si_shader_profiles[]. */
unsigned si_get_num_shader_profiles(void)
{
   return ARRAY_SIZE(si_shader_profiles);
}
49 
/* Return an 8-bit mask of tess factor channels written by this instruction:
 * inner tess levels occupy bits 0-3, outer tess levels bits 4-7.
 * Returns 0 for anything that isn't a tess level store.
 */
static unsigned get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return 0;

   const unsigned writemask =
      nir_intrinsic_write_mask(intrin) << nir_intrinsic_component(intrin);

   switch (nir_intrinsic_io_semantics(intrin).location) {
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      return writemask << 4;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      return writemask;
   default:
      return 0;
   }
}
65 
/* Recursively walk one TCS control-flow node and track which tess factor
 * channels are written unconditionally vs. conditionally.
 *
 * upper_block_tf_writemask: channels written by code that all invocations
 *    execute in the current barrier-separated segment.
 * cond_block_tf_writemask: channels written inside conditional control flow
 *    (if/else or loops) in the current segment.
 * tessfactors_are_def_in_all_invocs: accumulated result; cleared when a
 *    segment conditionally writes channels it doesn't also write
 *    unconditionally.
 * is_nested_cf: true when cf_node is inside an if or a loop.
 */
static void scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask,
                           unsigned *cond_block_tf_writemask,
                           bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf)
{
   switch (cf_node->type) {
   case nir_cf_node_block: {
      nir_block *block = nir_cf_node_as_block(cf_node);
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic == nir_intrinsic_barrier &&
             nir_intrinsic_execution_scope(intrin) >= SCOPE_WORKGROUP) {

            /* If we find a barrier in nested control flow put this in the
             * too hard basket. In GLSL this is not possible but it is in
             * SPIR-V.
             */
            if (is_nested_cf) {
               *tessfactors_are_def_in_all_invocs = false;
               return;
            }

            /* The following case must be prevented:
             *    gl_TessLevelInner = ...;
             *    barrier();
             *    if (gl_InvocationID == 1)
             *       gl_TessLevelInner = ...;
             *
             * If you consider disjoint code segments separated by barriers, each
             * such segment that writes tess factor channels should write the same
             * channels in all codepaths within that segment.
             */
            if (*upper_block_tf_writemask || *cond_block_tf_writemask) {
               /* Accumulate the result: conditional writes must be a subset
                * of the unconditional writes in this segment.
                */
               *tessfactors_are_def_in_all_invocs &=
                  !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask));

               /* Analyze the next code segment from scratch. */
               *upper_block_tf_writemask = 0;
               *cond_block_tf_writemask = 0;
            }
         } else
            *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin);
      }

      break;
   }
   case nir_cf_node_if: {
      unsigned then_tessfactor_writemask = 0;
      unsigned else_tessfactor_writemask = 0;

      nir_if *if_stmt = nir_cf_node_as_if(cf_node);
      foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list)
      {
         scan_tess_ctrl(nested_node, &then_tessfactor_writemask, cond_block_tf_writemask,
                        tessfactors_are_def_in_all_invocs, true);
      }

      foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list)
      {
         scan_tess_ctrl(nested_node, &else_tessfactor_writemask, cond_block_tf_writemask,
                        tessfactors_are_def_in_all_invocs, true);
      }

      if (then_tessfactor_writemask || else_tessfactor_writemask) {
         /* If both statements write the same tess factor channels,
          * we can say that the upper block writes them too.
          */
         *upper_block_tf_writemask |= then_tessfactor_writemask & else_tessfactor_writemask;
         *cond_block_tf_writemask |= then_tessfactor_writemask | else_tessfactor_writemask;
      }

      break;
   }
   case nir_cf_node_loop: {
      nir_loop *loop = nir_cf_node_as_loop(cf_node);
      assert(!nir_loop_has_continue_construct(loop));
      /* Loop bodies may execute zero times, so all writes in them are
       * treated as conditional.
       */
      foreach_list_typed(nir_cf_node, nested_node, node, &loop->body)
      {
         scan_tess_ctrl(nested_node, cond_block_tf_writemask, cond_block_tf_writemask,
                        tessfactors_are_def_in_all_invocs, true);
      }

      break;
   }
   default:
      unreachable("unknown cf node type");
   }
}
157 
are_tessfactors_def_in_all_invocs(const struct nir_shader * nir)158 static bool are_tessfactors_def_in_all_invocs(const struct nir_shader *nir)
159 {
160    assert(nir->info.stage == MESA_SHADER_TESS_CTRL);
161 
162    /* The pass works as follows:
163     * If all codepaths write tess factors, we can say that all
164     * invocations define tess factors.
165     *
166     * Each tess factor channel is tracked separately.
167     */
168    unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */
169    unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */
170 
171    /* Initial value = true. Here the pass will accumulate results from
172     * multiple segments surrounded by barriers. If tess factors aren't
173     * written at all, it's a shader bug and we don't care if this will be
174     * true.
175     */
176    bool tessfactors_are_def_in_all_invocs = true;
177 
178    nir_foreach_function (function, nir) {
179       if (function->impl) {
180          foreach_list_typed(nir_cf_node, node, node, &function->impl->body)
181          {
182             scan_tess_ctrl(node, &main_block_tf_writemask, &cond_block_tf_writemask,
183                            &tessfactors_are_def_in_all_invocs, false);
184          }
185       }
186    }
187 
188    /* Accumulate the result for the last code segment separated by a
189     * barrier.
190     */
191    if (main_block_tf_writemask || cond_block_tf_writemask) {
192       tessfactors_are_def_in_all_invocs &= !(cond_block_tf_writemask & ~main_block_tf_writemask);
193    }
194 
195    return tessfactors_are_def_in_all_invocs;
196 }
197 
get_texture_src(nir_tex_instr * instr,nir_tex_src_type type)198 static const nir_src *get_texture_src(nir_tex_instr *instr, nir_tex_src_type type)
199 {
200    for (unsigned i = 0; i < instr->num_srcs; i++) {
201       if (instr->src[i].src_type == type)
202          return &instr->src[i].src;
203    }
204    return NULL;
205 }
206 
/* Gather usage information from one IO load/store intrinsic into
 * si_shader_info: per-slot semantics, usage masks, interpolation modes,
 * FS color reads, GS stream assignments, streamout buffers and output
 * types. Called by scan_instruction() for each IO intrinsic.
 */
static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
                          nir_intrinsic_instr *intr, bool is_input)
{
   unsigned interp = INTERP_MODE_FLAT; /* load_input uses flat shading */

   if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
      /* Derive the interp mode from the barycentric coordinate source. */
      nir_instr *src_instr = intr->src[0].ssa->parent_instr;
      if (src_instr->type == nir_instr_type_intrinsic) {
         nir_intrinsic_instr *baryc = nir_instr_as_intrinsic(src_instr);
         if (nir_intrinsic_infos[baryc->intrinsic].index_map[NIR_INTRINSIC_INTERP_MODE] > 0)
            interp = nir_intrinsic_interp_mode(baryc);
         else
            unreachable("unknown barycentric intrinsic");
      } else {
         /* We may get here from si_update_shader_binary_info() after the PS
          * bc_optimize lowering, which selects between center and centroid.
          * Any value is OK because si_update_shader_binary_info() doesn't
          * use it.
          */
         interp = INTERP_MODE_SMOOTH;
      }
   }

   unsigned mask, bit_size;
   bool is_output_load;

   if (nir_intrinsic_has_write_mask(intr)) {
      mask = nir_intrinsic_write_mask(intr); /* store */
      bit_size = nir_src_bit_size(intr->src[0]);
      is_output_load = false;
   } else {
      mask = nir_def_components_read(&intr->def); /* load */
      bit_size = intr->def.bit_size;
      is_output_load = !is_input;
   }
   assert(bit_size != 64 && !(mask & ~0xf) && "64-bit IO should have been lowered");

   /* Convert the 16-bit component mask to a 32-bit component mask except for VS inputs
    * where the mask is untyped.
    */
   if (bit_size == 16 && !is_input) {
      unsigned new_mask = 0;
      for (unsigned i = 0; i < 4; i++) {
         if (mask & (1 << i))
            new_mask |= 0x1 << (i / 2);
      }
      mask = new_mask;
   }

   mask <<= nir_intrinsic_component(intr);

   nir_src offset = *nir_get_io_offset_src(intr);
   bool indirect = !nir_src_is_const(offset);
   if (!indirect)
      assert(nir_src_as_uint(offset) == 0);

   unsigned semantic = 0;
   /* VS doesn't have semantics. */
   if (nir->info.stage != MESA_SHADER_VERTEX || !is_input)
      semantic = nir_intrinsic_io_semantics(intr).location;

   if (nir->info.stage == MESA_SHADER_FRAGMENT && is_input) {
      /* The PARAM_GEN input shouldn't be scanned. */
      if (nir_intrinsic_io_semantics(intr).no_varying)
         return;

      /* Gather color PS inputs. We can only get here after lowering colors in monolithic
       * shaders. This must match what we do for nir_intrinsic_load_color0/1.
       */
      if (semantic == VARYING_SLOT_COL0 || semantic == VARYING_SLOT_COL1 ||
          semantic == VARYING_SLOT_BFC0 || semantic == VARYING_SLOT_BFC1) {
         /* index: 0 for COL0/BFC0, 1 for COL1/BFC1. */
         unsigned index = semantic == VARYING_SLOT_COL1 || semantic == VARYING_SLOT_BFC1;
         info->colors_read |= mask << (index * 4);
         return;
      }
   }

   if (nir->info.stage == MESA_SHADER_FRAGMENT && !is_input) {
      /* Never use FRAG_RESULT_COLOR directly. */
      if (semantic == FRAG_RESULT_COLOR)
         semantic = FRAG_RESULT_DATA0;
      semantic += nir_intrinsic_io_semantics(intr).dual_source_blend_index;
   }

   unsigned driver_location = nir_intrinsic_base(intr);
   /* An indirect offset can touch any of num_slots slots starting at the
    * base location; a constant offset (asserted 0 above) touches one.
    */
   unsigned num_slots = indirect ? nir_intrinsic_io_semantics(intr).num_slots : 1;

   if (is_input) {
      assert(driver_location + num_slots <= ARRAY_SIZE(info->input));

      for (unsigned i = 0; i < num_slots; i++) {
         unsigned loc = driver_location + i;

         info->input[loc].semantic = semantic + i;

         /* "interpolate" starts out as FLAT. The first seen load_interpolated_input overwrites it.  */
         if (semantic != VARYING_SLOT_PRIMITIVE_ID &&
             info->input[loc].interpolate == INTERP_MODE_FLAT)
            info->input[loc].interpolate = interp;

         if (mask) {
            info->input[loc].usage_mask |= mask;
            if (bit_size == 16) {
               if (nir_intrinsic_io_semantics(intr).high_16bits)
                  info->input[loc].fp16_lo_hi_valid |= 0x2;
               else
                  info->input[loc].fp16_lo_hi_valid |= 0x1;
            }
            info->num_inputs = MAX2(info->num_inputs, loc + 1);
         }
      }
   } else {
      /* Outputs. */
      assert(driver_location + num_slots <= ARRAY_SIZE(info->output_usagemask));

      for (unsigned i = 0; i < num_slots; i++) {
         unsigned loc = driver_location + i;

         /* Call the translation functions to validate the semantic (call assertions in them). */
         if (nir->info.stage != MESA_SHADER_FRAGMENT &&
             semantic != VARYING_SLOT_EDGE) {
            if (semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
                semantic == VARYING_SLOT_TESS_LEVEL_OUTER ||
                (semantic >= VARYING_SLOT_PATCH0 && semantic <= VARYING_SLOT_PATCH31)) {
               ac_shader_io_get_unique_index_patch(semantic);
               ac_shader_io_get_unique_index_patch(semantic + i);
            } else {
               si_shader_io_get_unique_index(semantic);
               si_shader_io_get_unique_index(semantic + i);
            }
         }

         info->output_semantic[loc] = semantic + i;

         if (is_output_load) {
            /* Output loads have only a few things that we need to track. */
            info->output_readmask[loc] |= mask;
         } else if (mask) {
            /* Output stores. */
            unsigned gs_streams = (uint32_t)nir_intrinsic_io_semantics(intr).gs_streams <<
                                  (nir_intrinsic_component(intr) * 2);
            /* Only count components not already recorded for this slot. */
            unsigned new_mask = mask & ~info->output_usagemask[loc];

            /* Iterate over all components. */
            for (unsigned i = 0; i < 4; i++) {
               unsigned stream = (gs_streams >> (i * 2)) & 0x3;

               if (new_mask & (1 << i)) {
                  info->output_streams[loc] |= stream << (i * 2);
                  info->num_stream_output_components[stream]++;
               }

               if (nir_intrinsic_has_io_xfb(intr)) {
                  /* io_xfb describes components 0-1, io_xfb2 components 2-3. */
                  nir_io_xfb xfb = i < 2 ? nir_intrinsic_io_xfb(intr) :
                                           nir_intrinsic_io_xfb2(intr);
                  if (xfb.out[i % 2].num_components) {
                     unsigned stream = (gs_streams >> (i * 2)) & 0x3;
                     info->enabled_streamout_buffer_mask |=
                        BITFIELD_BIT(stream * 4 + xfb.out[i % 2].buffer);
                  }
               }
            }

            if (nir_intrinsic_has_src_type(intr))
               info->output_type[loc] = nir_intrinsic_src_type(intr);
            else if (nir_intrinsic_has_dest_type(intr))
               info->output_type[loc] = nir_intrinsic_dest_type(intr);
            else
               info->output_type[loc] = nir_type_float32;

            info->output_usagemask[loc] |= mask;
            info->num_outputs = MAX2(info->num_outputs, loc + 1);

            if (nir->info.stage == MESA_SHADER_FRAGMENT &&
                semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
               unsigned index = semantic - FRAG_RESULT_DATA0;

               /* Record 16-bit color output types (2 bits per color buffer). */
               if (nir_intrinsic_src_type(intr) == nir_type_float16)
                  info->output_color_types |= SI_TYPE_FLOAT16 << (index * 2);
               else if (nir_intrinsic_src_type(intr) == nir_type_int16)
                  info->output_color_types |= SI_TYPE_INT16 << (index * 2);
               else if (nir_intrinsic_src_type(intr) == nir_type_uint16)
                  info->output_color_types |= SI_TYPE_UINT16 << (index * 2);
            }
         }
      }
   }
}
394 
is_bindless_handle_indirect(nir_instr * src)395 static bool is_bindless_handle_indirect(nir_instr *src)
396 {
397    /* Check if the bindless handle comes from indirect load_ubo. */
398    if (src->type == nir_instr_type_intrinsic &&
399        nir_instr_as_intrinsic(src)->intrinsic == nir_intrinsic_load_ubo) {
400       if (!nir_src_is_const(nir_instr_as_intrinsic(src)->src[0]))
401          return true;
402    } else {
403       /* Some other instruction. Return the worst-case result. */
404       return true;
405    }
406    return false;
407 }
408 
409 /* TODO: convert to nir_shader_instructions_pass */
scan_instruction(const struct nir_shader * nir,struct si_shader_info * info,nir_instr * instr)410 static void scan_instruction(const struct nir_shader *nir, struct si_shader_info *info,
411                              nir_instr *instr)
412 {
413    if (instr->type == nir_instr_type_tex) {
414       nir_tex_instr *tex = nir_instr_as_tex(instr);
415       const nir_src *handle = get_texture_src(tex, nir_tex_src_texture_handle);
416 
417       /* Gather the types of used VMEM instructions that return something. */
418       switch (tex->op) {
419       case nir_texop_tex:
420       case nir_texop_txb:
421       case nir_texop_txl:
422       case nir_texop_txd:
423       case nir_texop_lod:
424       case nir_texop_tg4:
425          info->uses_vmem_sampler_or_bvh = true;
426          break;
427       default:
428          info->uses_vmem_load_other = true;
429          break;
430       }
431 
432       if (handle) {
433          info->uses_bindless_samplers = true;
434 
435          if (is_bindless_handle_indirect(handle->ssa->parent_instr))
436             info->uses_indirect_descriptor = true;
437       } else {
438          const nir_src *deref = get_texture_src(tex, nir_tex_src_texture_deref);
439 
440          if (nir_deref_instr_has_indirect(nir_src_as_deref(*deref)))
441             info->uses_indirect_descriptor = true;
442       }
443 
444       info->has_non_uniform_tex_access =
445          tex->texture_non_uniform || tex->sampler_non_uniform;
446    } else if (instr->type == nir_instr_type_intrinsic) {
447       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
448       const char *intr_name = nir_intrinsic_infos[intr->intrinsic].name;
449       bool is_ssbo = strstr(intr_name, "ssbo");
450       bool is_image = strstr(intr_name, "image") == intr_name;
451       bool is_bindless_image = strstr(intr_name, "bindless_image") == intr_name;
452 
453       /* Gather the types of used VMEM instructions that return something. */
454       if (nir_intrinsic_infos[intr->intrinsic].has_dest) {
455          switch (intr->intrinsic) {
456          case nir_intrinsic_load_ubo:
457             if (!nir_src_is_const(intr->src[1]))
458                info->uses_vmem_load_other = true;
459             break;
460 
461          case nir_intrinsic_load_input:
462          case nir_intrinsic_load_input_vertex:
463          case nir_intrinsic_load_per_vertex_input:
464             if (nir->info.stage == MESA_SHADER_VERTEX ||
465                 nir->info.stage == MESA_SHADER_TESS_EVAL)
466                info->uses_vmem_load_other = true;
467             break;
468 
469          case nir_intrinsic_load_constant:
470          case nir_intrinsic_load_barycentric_at_sample: /* This loads sample positions. */
471          case nir_intrinsic_load_buffer_amd:
472             info->uses_vmem_load_other = true;
473             break;
474 
475          default:
476             if (is_image ||
477                 is_bindless_image ||
478                 is_ssbo ||
479                 (strstr(intr_name, "global") == intr_name ||
480                  intr->intrinsic == nir_intrinsic_load_global ||
481                  intr->intrinsic == nir_intrinsic_store_global) ||
482                 strstr(intr_name, "scratch"))
483                info->uses_vmem_load_other = true;
484             break;
485          }
486       }
487 
488       if (is_bindless_image)
489          info->uses_bindless_images = true;
490 
491       if (nir_intrinsic_writes_external_memory(intr))
492          info->num_memory_stores++;
493 
494       if (is_image && nir_deref_instr_has_indirect(nir_src_as_deref(intr->src[0])))
495          info->uses_indirect_descriptor = true;
496 
497       if (is_bindless_image && is_bindless_handle_indirect(intr->src[0].ssa->parent_instr))
498          info->uses_indirect_descriptor = true;
499 
500       if (intr->intrinsic != nir_intrinsic_store_ssbo && is_ssbo &&
501           !nir_src_is_const(intr->src[0]))
502          info->uses_indirect_descriptor = true;
503 
504       if (nir_intrinsic_has_atomic_op(intr)) {
505          if (nir_intrinsic_atomic_op(intr) == nir_atomic_op_ordered_add_gfx12_amd)
506             info->uses_atomic_ordered_add = true;
507       }
508 
509       switch (intr->intrinsic) {
510       case nir_intrinsic_store_ssbo:
511          if (!nir_src_is_const(intr->src[1]))
512             info->uses_indirect_descriptor = true;
513          break;
514       case nir_intrinsic_load_ubo:
515          if (!nir_src_is_const(intr->src[0]))
516             info->uses_indirect_descriptor = true;
517          break;
518       case nir_intrinsic_load_local_invocation_id:
519       case nir_intrinsic_load_workgroup_id: {
520          unsigned mask = nir_def_components_read(&intr->def);
521          while (mask) {
522             unsigned i = u_bit_scan(&mask);
523 
524             if (intr->intrinsic == nir_intrinsic_load_workgroup_id)
525                info->uses_block_id[i] = true;
526             else
527                info->uses_thread_id[i] = true;
528          }
529          break;
530       }
531       case nir_intrinsic_load_color0:
532       case nir_intrinsic_load_color1: {
533          unsigned index = intr->intrinsic == nir_intrinsic_load_color1;
534          uint8_t mask = nir_def_components_read(&intr->def);
535          info->colors_read |= mask << (index * 4);
536 
537          switch (info->color_interpolate[index]) {
538          case INTERP_MODE_SMOOTH:
539             if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_SAMPLE)
540                info->uses_persp_sample = true;
541             else if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_CENTROID)
542                info->uses_persp_centroid = true;
543             else if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_CENTER)
544                info->uses_persp_center = true;
545             break;
546          case INTERP_MODE_NOPERSPECTIVE:
547             if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_SAMPLE)
548                info->uses_linear_sample = true;
549             else if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_CENTROID)
550                info->uses_linear_centroid = true;
551             else if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_CENTER)
552                info->uses_linear_center = true;
553             break;
554          case INTERP_MODE_COLOR:
555             /* We don't know the final value. This will be FLAT if flatshading is enabled
556              * in the rasterizer state, otherwise it will be SMOOTH.
557              */
558             info->uses_interp_color = true;
559             if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_SAMPLE)
560                info->uses_persp_sample_color = true;
561             else if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_CENTROID)
562                info->uses_persp_centroid_color = true;
563             else if (info->color_interpolate_loc[index] == TGSI_INTERPOLATE_LOC_CENTER)
564                info->uses_persp_center_color = true;
565             break;
566          }
567          break;
568       }
569       case nir_intrinsic_load_vector_arg_amd:
570          /* Non-monolithic lowered PS can have this. We need to record color usage. */
571          if (nir_intrinsic_flags(intr) & SI_VECTOR_ARG_IS_COLOR) {
572             /* The channel can be between 0 and 7. */
573             unsigned chan = SI_GET_VECTOR_ARG_COLOR_COMPONENT(nir_intrinsic_flags(intr));
574             info->colors_read |= BITFIELD_BIT(chan);
575          }
576          break;
577       case nir_intrinsic_load_barycentric_at_offset:   /* uses center */
578       case nir_intrinsic_load_barycentric_at_sample:   /* uses center */
579          if (nir_intrinsic_interp_mode(intr) == INTERP_MODE_FLAT)
580             break;
581 
582          if (nir_intrinsic_interp_mode(intr) == INTERP_MODE_NOPERSPECTIVE) {
583             info->uses_linear_center = true;
584          } else {
585             info->uses_persp_center = true;
586          }
587          if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
588             info->uses_interp_at_sample = true;
589          break;
590       case nir_intrinsic_load_frag_coord:
591          info->reads_frag_coord_mask |= nir_def_components_read(&intr->def);
592          break;
593       case nir_intrinsic_load_sample_pos:
594          info->reads_sample_pos_mask |= nir_def_components_read(&intr->def);
595          break;
596       case nir_intrinsic_load_input:
597       case nir_intrinsic_load_per_vertex_input:
598       case nir_intrinsic_load_input_vertex:
599       case nir_intrinsic_load_interpolated_input:
600          scan_io_usage(nir, info, intr, true);
601          break;
602       case nir_intrinsic_load_output:
603       case nir_intrinsic_load_per_vertex_output:
604       case nir_intrinsic_store_output:
605       case nir_intrinsic_store_per_vertex_output:
606          scan_io_usage(nir, info, intr, false);
607          break;
608       case nir_intrinsic_load_deref:
609       case nir_intrinsic_store_deref:
610          /* These can only occur if there is indirect temp indexing. */
611          break;
612       case nir_intrinsic_interp_deref_at_centroid:
613       case nir_intrinsic_interp_deref_at_sample:
614       case nir_intrinsic_interp_deref_at_offset:
615          unreachable("these opcodes should have been lowered");
616          break;
617       case nir_intrinsic_ordered_add_loop_gfx12_amd:
618          info->uses_atomic_ordered_add = true;
619          break;
620       default:
621          break;
622       }
623    }
624 }
625 
si_nir_scan_shader(struct si_screen * sscreen,const struct nir_shader * nir,struct si_shader_info * info)626 void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
627                         struct si_shader_info *info)
628 {
629    memset(info, 0, sizeof(*info));
630    info->base = nir->info;
631    info->base.use_aco_amd = aco_is_gpu_supported(&sscreen->info) &&
632                             (sscreen->use_aco || nir->info.use_aco_amd) &&
633                             sscreen->info.has_image_opcodes;
634 
635    /* Get options from shader profiles. */
636    for (unsigned i = 0; i < ARRAY_SIZE(si_shader_profiles); i++) {
637       if (_mesa_printed_blake3_equal(info->base.source_blake3, si_shader_profiles[i].blake3)) {
638          info->options = si_shader_profiles[i].options;
639          break;
640       }
641    }
642 
643    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
644       /* post_depth_coverage implies early_fragment_tests */
645       info->base.fs.early_fragment_tests |= info->base.fs.post_depth_coverage;
646 
647       info->color_interpolate[0] = nir->info.fs.color0_interp;
648       info->color_interpolate[1] = nir->info.fs.color1_interp;
649       for (unsigned i = 0; i < 2; i++) {
650          if (info->color_interpolate[i] == INTERP_MODE_NONE)
651             info->color_interpolate[i] = INTERP_MODE_COLOR;
652       }
653 
654       info->color_interpolate_loc[0] = nir->info.fs.color0_sample ? TGSI_INTERPOLATE_LOC_SAMPLE :
655                                        nir->info.fs.color0_centroid ? TGSI_INTERPOLATE_LOC_CENTROID :
656                                                                       TGSI_INTERPOLATE_LOC_CENTER;
657       info->color_interpolate_loc[1] = nir->info.fs.color1_sample ? TGSI_INTERPOLATE_LOC_SAMPLE :
658                                        nir->info.fs.color1_centroid ? TGSI_INTERPOLATE_LOC_CENTROID :
659                                                                       TGSI_INTERPOLATE_LOC_CENTER;
660       /* Set an invalid value. Will be determined at draw time if needed when the expected
661        * conditions are met.
662        */
663       info->writes_1_if_tex_is_1 = nir->info.writes_memory ? 0 : 0xff;
664 
665       /* Initialize all FS inputs to flat. If we see load_interpolated_input for any component,
666        * it will be changed to its interp mode.
667        */
668       for (unsigned i = 0; i < ARRAY_SIZE(info->input); i++)
669          info->input[i].interpolate = INTERP_MODE_FLAT;
670    }
671 
672    info->constbuf0_num_slots = nir->num_uniforms;
673 
674    if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
675       info->tessfactors_are_def_in_all_invocs = are_tessfactors_def_in_all_invocs(nir);
676    }
677 
678    /* tess factors are loaded as input instead of system value */
679    info->reads_tess_factors = nir->info.inputs_read &
680       (BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER) |
681        BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER));
682 
683    info->uses_frontface = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
684    info->uses_instanceid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
685    info->uses_base_vertex = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX);
686    info->uses_base_instance = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
687    info->uses_invocationid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INVOCATION_ID);
688    info->uses_grid_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_WORKGROUPS);
689    info->uses_tg_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_NUM_SUBGROUPS);
690    if (sscreen->info.gfx_level < GFX12) {
691       info->uses_tg_size |= BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
692                             BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_ID) ||
693                             si_should_clear_lds(sscreen, nir);
694    }
695    info->uses_variable_block_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_WORKGROUP_SIZE);
696    info->uses_drawid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
697    info->uses_primid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID) ||
698                        nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID;
699    info->reads_samplemask = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
700    info->uses_linear_sample = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
701    info->uses_linear_centroid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
702    info->uses_linear_center = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
703    info->uses_persp_sample = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
704    info->uses_persp_centroid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
705    info->uses_persp_center = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
706    info->uses_sampleid = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID);
707    info->uses_layer_id = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_LAYER_ID);
708 
709    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
710       info->writes_z = nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH);
711       info->writes_stencil = nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
712       info->writes_samplemask = nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
713 
714       info->colors_written = nir->info.outputs_written >> FRAG_RESULT_DATA0;
715       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR)) {
716          info->color0_writes_all_cbufs = true;
717          info->colors_written |= 0x1;
718       }
719       if (nir->info.fs.color_is_dual_source)
720          info->colors_written |= 0x2;
721    } else {
722       info->writes_primid = nir->info.outputs_written & VARYING_BIT_PRIMITIVE_ID;
723       info->writes_viewport_index = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
724       info->writes_layer = nir->info.outputs_written & VARYING_BIT_LAYER;
725       info->writes_psize = nir->info.outputs_written & VARYING_BIT_PSIZ;
726       info->writes_clipvertex = nir->info.outputs_written & VARYING_BIT_CLIP_VERTEX;
727       info->writes_edgeflag = nir->info.outputs_written & VARYING_BIT_EDGE;
728       info->writes_position = nir->info.outputs_written & VARYING_BIT_POS;
729    }
730 
731    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader*)nir);
732    nir_foreach_block (block, impl) {
733       nir_foreach_instr (instr, block)
734          scan_instruction(nir, info, instr);
735    }
736 
737    if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL) {
738       /* Add the PrimitiveID output, but don't increment num_outputs.
739        * The driver inserts PrimitiveID only when it's used by the pixel shader,
740        * and si_emit_spi_map uses this unconditionally when such a pixel shader is used.
741        */
742       info->output_semantic[info->num_outputs] = VARYING_SLOT_PRIMITIVE_ID;
743       info->output_type[info->num_outputs] = nir_type_uint32;
744       info->output_usagemask[info->num_outputs] = 0x1;
745    }
746 
747    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
748       info->allow_flat_shading = !(info->uses_persp_center || info->uses_persp_centroid ||
749                                    info->uses_persp_sample || info->uses_linear_center ||
750                                    info->uses_linear_centroid || info->uses_linear_sample ||
751                                    info->uses_interp_at_sample || nir->info.writes_memory ||
752                                    nir->info.fs.uses_fbfetch_output ||
753                                    nir->info.fs.needs_quad_helper_invocations ||
754                                    BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
755                                    BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_POINT_COORD) ||
756                                    BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
757                                    BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
758                                    BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) ||
759                                    BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION));
760 
761       info->uses_vmem_load_other |= info->base.fs.uses_fbfetch_output;
762 
763       /* Add both front and back color inputs. */
764       unsigned num_inputs_with_colors = info->num_inputs;
765       for (unsigned back = 0; back < 2; back++) {
766          for (unsigned i = 0; i < 2; i++) {
767             if ((info->colors_read >> (i * 4)) & 0xf) {
768                unsigned index = num_inputs_with_colors;
769 
770                info->input[index].semantic = (back ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + i;
771                info->input[index].interpolate = info->color_interpolate[i];
772                info->input[index].usage_mask = info->colors_read >> (i * 4);
773                num_inputs_with_colors++;
774 
775                /* Back-face color don't increment num_inputs. si_emit_spi_map will use
776                 * back-face colors conditionally only when they are needed.
777                 */
778                if (!back)
779                   info->num_inputs = num_inputs_with_colors;
780             }
781          }
782       }
783    }
784 
785    info->uses_vmem_load_other |= info->uses_indirect_descriptor;
786 
787    /* Trim output read masks based on write masks. */
788    for (unsigned i = 0; i < info->num_outputs; i++)
789       info->output_readmask[i] &= info->output_usagemask[i];
790 
791    info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir);
792 
793    if (nir->info.stage == MESA_SHADER_VERTEX ||
794        nir->info.stage == MESA_SHADER_TESS_CTRL ||
795        nir->info.stage == MESA_SHADER_TESS_EVAL ||
796        nir->info.stage == MESA_SHADER_GEOMETRY) {
797       if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
798          /* Always reserve space for these. */
799          info->patch_outputs_written |=
800             (1ull << ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER)) |
801             (1ull << ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER));
802       }
803       for (unsigned i = 0; i < info->num_outputs; i++) {
804          unsigned semantic = info->output_semantic[i];
805 
806          if (semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
807              semantic == VARYING_SLOT_TESS_LEVEL_OUTER ||
808              (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX)) {
809             info->patch_outputs_written |= 1ull << ac_shader_io_get_unique_index_patch(semantic);
810          } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
811                     semantic != VARYING_SLOT_EDGE) {
812             /* Ignore outputs that are not passed from VS to PS. */
813             if (semantic != VARYING_SLOT_POS &&
814                 semantic != VARYING_SLOT_PSIZ &&
815                 semantic != VARYING_SLOT_CLIP_VERTEX &&
816                 semantic != VARYING_SLOT_LAYER) {
817                info->outputs_written_before_ps |= 1ull
818                                                   << si_shader_io_get_unique_index(semantic);
819             }
820 
821             /* LAYER and VIEWPORT have no effect if they don't feed the rasterizer. */
822             if (semantic != VARYING_SLOT_LAYER &&
823                 semantic != VARYING_SLOT_VIEWPORT) {
824                info->outputs_written_before_tes_gs |=
825                   BITFIELD64_BIT(si_shader_io_get_unique_index(semantic));
826             }
827          }
828       }
829    }
830 
831    if (nir->info.stage == MESA_SHADER_VERTEX) {
832       info->num_vs_inputs =
833          nir->info.stage == MESA_SHADER_VERTEX && !info->base.vs.blit_sgprs_amd ? info->num_inputs : 0;
834       unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.gfx_level);
835       info->num_vbos_in_user_sgprs = MIN2(info->num_vs_inputs, num_vbos_in_sgprs);
836    }
837 
838    if (nir->info.stage == MESA_SHADER_VERTEX ||
839        nir->info.stage == MESA_SHADER_TESS_CTRL ||
840        nir->info.stage == MESA_SHADER_TESS_EVAL) {
841       info->esgs_vertex_stride =
842          util_last_bit64(info->outputs_written_before_tes_gs) * 16;
843 
844       /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
845        * conflicts, i.e. each vertex will start on a different bank.
846        */
847       if (sscreen->info.gfx_level >= GFX9)
848          info->esgs_vertex_stride += 4;
849       else
850          assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0);
851 
852       info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read &
853                                    ~info->base.inputs_read_indirectly &
854                                    info->base.inputs_read;
855    }
856 
857    if (nir->info.stage == MESA_SHADER_GEOMETRY) {
858       info->gsvs_vertex_size = info->num_outputs * 16;
859       info->max_gsvs_emit_size = info->gsvs_vertex_size * info->base.gs.vertices_out;
860       info->gs_input_verts_per_prim =
861          mesa_vertices_per_prim(info->base.gs.input_primitive);
862    }
863 
864    info->clipdist_mask = info->writes_clipvertex ? SI_USER_CLIP_PLANE_MASK :
865                          u_bit_consecutive(0, info->base.clip_distance_array_size);
866    info->culldist_mask = u_bit_consecutive(0, info->base.cull_distance_array_size) <<
867                          info->base.clip_distance_array_size;
868 
869    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
870       for (unsigned i = 0; i < info->num_inputs; i++) {
871          unsigned semantic = info->input[i].semantic;
872 
873          if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
874              semantic != VARYING_SLOT_PNTC) {
875             info->inputs_read |= 1ull << si_shader_io_get_unique_index(semantic);
876          }
877       }
878 
879       for (unsigned i = 0; i < 8; i++)
880          if (info->colors_written & (1 << i))
881             info->colors_written_4bit |= 0xf << (4 * i);
882 
883       for (unsigned i = 0; i < info->num_inputs; i++) {
884          if (info->input[i].semantic == VARYING_SLOT_COL0)
885             info->color_attr_index[0] = i;
886          else if (info->input[i].semantic == VARYING_SLOT_COL1)
887             info->color_attr_index[1] = i;
888       }
889    }
890 }
891 
892 enum ac_hw_stage
si_select_hw_stage(const gl_shader_stage stage,const union si_shader_key * const key,const enum amd_gfx_level gfx_level)893 si_select_hw_stage(const gl_shader_stage stage, const union si_shader_key *const key,
894                    const enum amd_gfx_level gfx_level)
895 {
896    switch (stage) {
897    case MESA_SHADER_VERTEX:
898    case MESA_SHADER_TESS_EVAL:
899       if (key->ge.as_ngg)
900          return AC_HW_NEXT_GEN_GEOMETRY_SHADER;
901       else if (key->ge.as_es)
902          return gfx_level >= GFX9 ? AC_HW_LEGACY_GEOMETRY_SHADER : AC_HW_EXPORT_SHADER;
903       else if (key->ge.as_ls)
904          return gfx_level >= GFX9 ? AC_HW_HULL_SHADER : AC_HW_LOCAL_SHADER;
905       else
906          return AC_HW_VERTEX_SHADER;
907    case MESA_SHADER_TESS_CTRL:
908       return AC_HW_HULL_SHADER;
909    case MESA_SHADER_GEOMETRY:
910       if (key->ge.as_ngg)
911          return AC_HW_NEXT_GEN_GEOMETRY_SHADER;
912       else
913          return AC_HW_LEGACY_GEOMETRY_SHADER;
914    case MESA_SHADER_FRAGMENT:
915       return AC_HW_PIXEL_SHADER;
916    case MESA_SHADER_COMPUTE:
917    case MESA_SHADER_KERNEL:
918       return AC_HW_COMPUTE_SHADER;
919    default:
920       unreachable("Unsupported HW stage");
921    }
922 }
923