1 /*
2  * Copyright © 2021 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <list>
25 #include <vector>
26 #include "brw_compiler.h"
27 #include "brw_fs.h"
28 #include "brw_fs_builder.h"
29 #include "brw_nir.h"
30 #include "brw_private.h"
31 #include "compiler/nir/nir_builder.h"
32 #include "dev/intel_debug.h"
33 
34 #include <memory>
35 
36 using namespace brw;
37 
38 static bool
39 brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
40                                    UNUSED const void *data)
41 {
42    if (instr->type != nir_instr_type_intrinsic)
43       return false;
44    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
45    return intrin->intrinsic == nir_intrinsic_load_uniform;
46 }
47 
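/* Lowers nir_intrinsic_load_uniform for task/mesh shaders.  A 32-bit scalar
 * load with a constant, dword-aligned offset inside the push-constant range
 * is turned into a direct read of the corresponding inline-data dword.
 * Anything else (indirect offsets, wider types) falls back to a global
 * constant load through the 64-bit address kept in inline-data slot 0.
 */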
48 static nir_def *
49 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
50                                  UNUSED void *data)
51 {
52    assert(instr->type == nir_instr_type_intrinsic);
53    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
54    assert(intrin->intrinsic == nir_intrinsic_load_uniform);
55 
56    /* Read the first few 32-bit scalars from InlineData. */
57    if (nir_src_is_const(intrin->src[0]) &&
58        intrin->def.bit_size == 32 &&
59        intrin->def.num_components == 1) {
60       unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
61       unsigned off_dw = off / 4;
62       if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
63          off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
64          return nir_load_mesh_inline_data_intel(b, 32, off_dw);
65       }
66    }
67 
68    return brw_nir_load_global_const(b, intrin,
69                                     nir_load_mesh_inline_data_intel(b, 64, 0), 0);
70 }
71 
72 static bool
73 brw_nir_lower_load_uniforms(nir_shader *nir)
74 {
75    return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
76                                         brw_nir_lower_load_uniforms_impl, NULL);
77 }
78 
79 static inline int
80 type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
81 {
82    return glsl_count_dword_slots(type, bindless);
83 }
84 
85 /* TODO(mesh): Make this a common function. */
86 static void
87 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
88 {
89    assert(glsl_type_is_vector_or_scalar(type));
90 
91    uint32_t comp_size = glsl_type_is_boolean(type)
92       ? 4 : glsl_get_bit_size(type) / 8;
93    unsigned length = glsl_get_vector_elements(type);
94    *size = comp_size * length;
95    *align = comp_size * (length == 3 ? 4 : length);
96 }
97 
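/* Lowers nir_intrinsic_launch_mesh_workgroups into a store of the TUE header
 * from the first invocation.  For example (illustrative values), a call with
 * an (8, 4, 2) grid stores the vec4 {task_count = 64, 8, 4, 2} at task
 * payload offset 0, and only invocation 0 performs the store.
 */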
98 static bool
99 brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
100                                            nir_intrinsic_instr *intrin,
101                                            void *data)
102 {
103    if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
104       return false;
105 
106    b->cursor = nir_before_instr(&intrin->instr);
107 
108    nir_def *local_invocation_index = nir_load_local_invocation_index(b);
109 
110    /* Make sure that the mesh workgroup size is taken from the first invocation
111     * (nir_intrinsic_launch_mesh_workgroups requirement)
112     */
113    nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
114    nir_if *if_stmt = nir_push_if(b, cmp);
115    {
116       /* TUE header contains 4 words:
117        *
118        * - Word 0 for Task Count.
119        *
120        * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
121        *   3D dispatch into the 1D dispatch supported by HW.
122        */
123       nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
124       nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
125       nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
126       nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
127       nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
128       nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
129    }
130    nir_pop_if(b, if_stmt);
131 
132    nir_instr_remove(&intrin->instr);
133 
134    return true;
135 }
136 
137 static bool
138 brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
139 {
140    return nir_shader_intrinsics_pass(nir,
141                                        brw_nir_lower_launch_mesh_workgroups_instr,
142                                        nir_metadata_none,
143                                        NULL);
144 }
145 
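/* Sizes the Task URB Entry (TUE).  The per-task data starts at dword 8, i.e.
 * right after the 32B-aligned header region suggested by the bspec.  As an
 * illustrative example (ignoring extra alignment added by explicit-type
 * lowering): 36 bytes of per-task outputs give task_payload_size = 32 + 36 =
 * 68 bytes, which is 17 dwords, rounded up to a size_dw of 24.
 */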
146 static void
147 brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
148 {
149    memset(map, 0, sizeof(*map));
150 
151    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
152             type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
153 
154    /* From bspec: "It is suggested that SW reserve the 16 bytes following the
155     * TUE Header, and therefore start the SW-defined data structure at 32B
156     * alignment.  This allows the TUE Header to always be written as 32 bytes
157     * with 32B alignment, the most optimal write performance case."
158     */
159    map->per_task_data_start_dw = 8;
160 
161    /* Lowering to explicit types will start offsets from task_payload_size, so
162     * set it to start after the header.
163     */
164    nir->info.task_payload_size = map->per_task_data_start_dw * 4;
165    NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
166             nir_var_mem_task_payload, shared_type_info);
167    NIR_PASS(_, nir, nir_lower_explicit_io,
168             nir_var_mem_task_payload, nir_address_format_32bit_offset);
169 
170    map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
171 }
172 
173 static void
174 brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
175 {
176    fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
177 }
178 
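/* Converts task payload offsets from bytes to dwords.  For example, a
 * store_task_payload with base 32 and constant offset 8 (both in bytes)
 * becomes base 8 with offset 2 (both in dwords), i.e. dword 10 of the TUE.
 */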
179 static bool
180 brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
181                                           nir_intrinsic_instr *intrin,
182                                           void *data)
183 {
184    switch (intrin->intrinsic) {
185    case nir_intrinsic_store_task_payload:
186    case nir_intrinsic_load_task_payload: {
187       nir_src *offset_src = nir_get_io_offset_src(intrin);
188 
189       if (nir_src_is_const(*offset_src))
190          assert(nir_src_as_uint(*offset_src) % 4 == 0);
191 
192       b->cursor = nir_before_instr(&intrin->instr);
193 
194       /* Regular I/O uses dwords while explicit I/O used for task payload uses
195        * bytes.  Normalize it to dwords.
196        *
197        * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
198        */
199 
200       nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
201       nir_src_rewrite(offset_src, offset);
202 
203       unsigned base = nir_intrinsic_base(intrin);
204       assert(base % 4 == 0);
205       nir_intrinsic_set_base(intrin, base / 4);
206 
207       return true;
208    }
209 
210    default:
211       return false;
212    }
213 }
214 
215 static bool
216 brw_nir_adjust_task_payload_offsets(nir_shader *nir)
217 {
218    return nir_shader_intrinsics_pass(nir,
219                                        brw_nir_adjust_task_payload_offsets_instr,
220                                        nir_metadata_control_flow,
221                                        NULL);
222 }
223 
224 void
225 brw_nir_adjust_payload(nir_shader *shader)
226 {
227    /* Adjustment of task payload offsets must be performed *after* the last
228     * pass that interprets them as bytes, because it changes their unit.
229     */
230    bool adjusted = false;
231    NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
232    if (adjusted) /* clean up the mess created by offset adjustments */
233       NIR_PASS(_, shader, nir_opt_constant_folding);
234 }
235 
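/* Rounds the payload size recorded in the intrinsic's "range" up to a vec4
 * multiple, e.g. an illustrative 20-byte payload is padded to 32 bytes.
 */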
236 static bool
237 brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
238                                            nir_intrinsic_instr *intrin,
239                                            void *data)
240 {
241    if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
242       return false;
243 
244    /* nir_lower_task_shader uses "range" as task payload size. */
245    unsigned range = nir_intrinsic_range(intrin);
246    /* This avoids a special case in nir_lower_task_shader for payloads that
247     * are not vec4-aligned when the payload_in_shared workaround is enabled.
248     */
249    nir_intrinsic_set_range(intrin, ALIGN(range, 16));
250 
251    return true;
252 }
253 
254 static bool
255 brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
256 {
257    return nir_shader_intrinsics_pass(nir,
258                                        brw_nir_align_launch_mesh_workgroups_instr,
259                                        nir_metadata_control_flow,
260                                        NULL);
261 }
262 
263 static void
264 brw_emit_urb_fence(fs_visitor &s)
265 {
266    const fs_builder bld = fs_builder(&s).at_end();
267    brw_reg dst = bld.vgrf(BRW_TYPE_UD);
268    fs_inst *fence = bld.emit(SHADER_OPCODE_MEMORY_FENCE, dst,
269                              brw_vec8_grf(0, 0),
270                              brw_imm_ud(true),
271                              brw_imm_ud(0));
272    fence->sfid = BRW_SFID_URB;
273    /* The logical thing here would likely be a THREADGROUP fence but that's
274     * still failing some tests like in dEQP-VK.mesh_shader.ext.query.*
275     *
276     * Gfx12.5 has a comment about this on BSpec 53533 :
277     *
278     *    "If fence scope is Local or Threadgroup, HW ignores the flush type
279     *     and operates as if it was set to None (no flush)"
280     *
281     * Software workaround from HSD-22014129519 indicates that a GPU fence
282     * resolves the issue.
283     */
284    fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_GPU,
285                                     LSC_FLUSH_TYPE_NONE, true);
286 
287    bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE,
288                                    bld.null_reg_ud(),
289                                    &dst,
290                                    1);
291 }
292 
293 static bool
294 run_task_mesh(fs_visitor &s, bool allow_spilling)
295 {
296    assert(s.stage == MESA_SHADER_TASK ||
297           s.stage == MESA_SHADER_MESH);
298 
299    s.payload_ = new task_mesh_thread_payload(s);
300 
301    nir_to_brw(&s);
302 
303    if (s.failed)
304       return false;
305 
306    brw_emit_urb_fence(s);
307 
308    s.emit_cs_terminate();
309 
310    brw_calculate_cfg(s);
311 
312    brw_fs_optimize(s);
313 
314    s.assign_curb_setup();
315 
316    brw_fs_lower_3src_null_dest(s);
317    brw_fs_workaround_memory_fence_before_eot(s);
318    brw_fs_workaround_emit_dummy_mov_instruction(s);
319 
320    brw_allocate_registers(s, allow_spilling);
321 
322    return !s.failed;
323 }
324 
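/* Task shader compilation: lower TUE outputs first (so the TUE map is known),
 * align and lower launch_mesh_workgroups (nir_lower_task_shader handles the
 * payload-in-shared workaround), then compile up to three SIMD variants
 * (dispatch_width = 8 << simd, i.e. SIMD8/16/32) and let brw_simd_select
 * pick the one to generate code for.
 */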
325 const unsigned *
326 brw_compile_task(const struct brw_compiler *compiler,
327                  struct brw_compile_task_params *params)
328 {
329    struct nir_shader *nir = params->base.nir;
330    const struct brw_task_prog_key *key = params->key;
331    struct brw_task_prog_data *prog_data = params->prog_data;
332    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);
333 
334    brw_nir_lower_tue_outputs(nir, &prog_data->map);
335 
336    NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);
337 
338    nir_lower_task_shader_options lower_ts_opt = {
339       .payload_to_shared_for_atomics = true,
340       .payload_to_shared_for_small_types = true,
341       /* The actual payload data starts after the TUE header and padding,
342        * so skip those when copying.
343        */
344       .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
345    };
346    NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);
347 
348    NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);
349 
350    prog_data->base.base.stage = MESA_SHADER_TASK;
351    prog_data->base.base.total_shared = nir->info.shared_size;
352    prog_data->base.base.total_scratch = 0;
353 
354    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
355    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
356    prog_data->base.local_size[2] = nir->info.workgroup_size[2];
357 
358    prog_data->uses_drawid =
359       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
360 
361    brw_simd_selection_state simd_state{
362       .devinfo = compiler->devinfo,
363       .prog_data = &prog_data->base,
364       .required_width = brw_required_dispatch_width(&nir->info),
365    };
366 
367    std::unique_ptr<fs_visitor> v[3];
368 
369    for (unsigned simd = 0; simd < 3; simd++) {
370       if (!brw_simd_should_compile(simd_state, simd))
371          continue;
372 
373       const unsigned dispatch_width = 8 << simd;
374 
375       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
376       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
377 
378       NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
379       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
380 
381       brw_postprocess_nir(shader, compiler, debug_enabled,
382                           key->base.robust_flags);
383 
384       v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
385                                              &key->base,
386                                              &prog_data->base.base,
387                                              shader, dispatch_width,
388                                              params->base.stats != NULL,
389                                              debug_enabled);
390 
391       if (prog_data->base.prog_mask) {
392          unsigned first = ffs(prog_data->base.prog_mask) - 1;
393          v[simd]->import_uniforms(v[first].get());
394       }
395 
396       const bool allow_spilling = !brw_simd_any_compiled(simd_state);
397       if (run_task_mesh(*v[simd], allow_spilling))
398          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
399       else
400          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
401    }
402 
403    int selected_simd = brw_simd_select(simd_state);
404    if (selected_simd < 0) {
405       params->base.error_str =
406          ralloc_asprintf(params->base.mem_ctx,
407                          "Can't compile shader: "
408                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
409                          simd_state.error[0], simd_state.error[1],
410                          simd_state.error[2]);
411       return NULL;
412    }
413 
414    fs_visitor *selected = v[selected_simd].get();
415    prog_data->base.prog_mask = 1 << selected_simd;
416 
417    if (unlikely(debug_enabled)) {
418       fprintf(stderr, "Task Output ");
419       brw_print_tue_map(stderr, &prog_data->map);
420    }
421 
422    fs_generator g(compiler, &params->base, &prog_data->base.base,
423                   MESA_SHADER_TASK);
424    if (unlikely(debug_enabled)) {
425       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
426                                      "%s task shader %s",
427                                      nir->info.label ? nir->info.label
428                                                      : "unnamed",
429                                      nir->info.name));
430    }
431 
432    g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
433                    selected->performance_analysis.require(), params->base.stats);
434    g.add_const_data(nir->constant_data, nir->constant_data_size);
435    return g.get_assembly();
436 }
437 
438 static void
439 brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
440 {
441    if (!map)
442       return;
443 
444    nir->info.task_payload_size = map->per_task_data_start_dw * 4;
445 
446    bool progress = false;
447 
448    NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
449             nir_var_mem_task_payload, shared_type_info);
450 
451    if (progress) {
452       /* The types for Task Output and Mesh Input should match, so their sizes
453        * should also match.
454        */
455       assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
456    } else {
457       /* Mesh doesn't read any input; to make that clear, set the
458        * task_payload_size to zero instead of keeping an incomplete size that
459        * only covers the header.
460        */
461       nir->info.task_payload_size = 0;
462    }
463 
464    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
465             nir_address_format_32bit_offset);
466 }
467 
468 /* Attribute types. Flat attributes have to be a separate class because
469  * flat and interpolated attributes can't share the same vec4 slot
470  * (see 3DSTATE_SBE.ConstantInterpolationEnable).
471  */
472 enum {
473    PRIM, /* per primitive */
474    VERT, /* per vertex interpolated */
475    VERT_FLAT, /* per vertex flat */
476 };
477 
478 struct attr_desc {
479    int location;
480    const struct glsl_type *type;
481    unsigned dwords;
482    unsigned slots;
483 };
484 
485 struct attr_type_info {
486    /* order of attributes, negative values are holes */
487    std::list<struct attr_desc> *order;
488 
489    /* attributes after which there's a hole of size equal to the array index */
490    std::list<int> holes[5];
491 };
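/* Example of the holes bookkeeping: with MUE compaction enabled, a 3-dword
 * output (e.g. a vec3) that cannot reuse an existing hole is appended
 * followed by a 1-dword hole, and its location is recorded in holes[1] so a
 * later 1-dword output can be packed into that slot.
 */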
492 
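/* Records where an attribute lives in the MUE.  For an illustrative vec4[2]
 * output starting at dword 32: slots = 2, dwords = 8, so slot 0 gets
 * start_dw = 32 with len_dw = 4 and slot 1 gets start_dw = 36 with len_dw = 4.
 * A non-array vec3 would instead get a single slot with len_dw = 3.
 */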
493 static void
494 brw_mue_assign_position(const struct attr_desc *attr,
495                         struct brw_mue_map *map,
496                         unsigned start_dw)
497 {
498    bool is_array = glsl_type_is_array(attr->type);
499    int location = attr->location;
500    unsigned remaining = attr->dwords;
501 
502    for (unsigned slot = 0; slot < attr->slots; ++slot) {
503       map->start_dw[location + slot] = start_dw;
504 
505       unsigned sz;
506 
507       if (is_array) {
508          assert(attr->dwords % attr->slots == 0);
509          sz = attr->dwords / attr->slots;
510       } else {
511          sz = MIN2(remaining, 4);
512       }
513 
514       map->len_dw[location + slot] = sz;
515       start_dw += sz;
516       remaining -= sz;
517    }
518 }
519 
520 static nir_variable *
521 brw_nir_find_complete_variable_with_location(nir_shader *shader,
522                                              nir_variable_mode mode,
523                                              int location)
524 {
525    nir_variable *best_var = NULL;
526    unsigned last_size = 0;
527 
528    nir_foreach_variable_with_modes(var, shader, mode) {
529       if (var->data.location != location)
530          continue;
531 
532       unsigned new_size = glsl_count_dword_slots(var->type, false);
533       if (new_size > last_size) {
534          best_var = var;
535          last_size = new_size;
536       }
537    }
538 
539    return best_var;
540 }
541 
542 static unsigned
543 brw_sum_size(const std::list<struct attr_desc> &orders)
544 {
545    unsigned sz = 0;
546    for (auto it = orders.cbegin(); it != orders.cend(); ++it)
547       sz += (*it).dwords;
548    return sz;
549 }
550 
551 /* Finds order of outputs which require minimum size, without splitting
552  * of URB read/write messages (which operate on vec4-aligned memory).
553  */
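/* Illustrative example, assuming compiler->mesh.mue_compaction is enabled:
 * for a vec3 output followed by a float output, the vec3 is emitted with a
 * 1-dword hole after it, and the float then fills that hole exactly, so the
 * pair occupies a single vec4 slot (4 dwords) instead of two.
 */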
554 static void
555 brw_compute_mue_layout(const struct brw_compiler *compiler,
556                        std::list<struct attr_desc> *orders,
557                        uint64_t outputs_written,
558                        struct nir_shader *nir,
559                        bool *pack_prim_data_into_header,
560                        bool *pack_vert_data_into_header)
561 {
562    const struct shader_info *info = &nir->info;
563 
564    struct attr_type_info data[3];
565 
566    if ((compiler->mesh.mue_header_packing & 1) == 0)
567       *pack_prim_data_into_header = false;
568    if ((compiler->mesh.mue_header_packing & 2) == 0)
569       *pack_vert_data_into_header = false;
570 
571    for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
572       data[i].order = &orders[i];
573 
574    /* If packing into header is enabled, add a hole of size 4 and add
575     * a virtual location to keep the algorithm happy (it expects holes
576     * to be preceded by some location). We'll remove those virtual
577     * locations at the end.
578     */
579    const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
580    assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
581 
582    struct attr_desc d;
583    d.location = virtual_header_location;
584    d.type = NULL;
585    d.dwords = 0;
586    d.slots = 0;
587 
588    struct attr_desc h;
589    h.location = -1;
590    h.type = NULL;
591    h.dwords = 4;
592    h.slots = 0;
593 
594    if (*pack_prim_data_into_header) {
595       orders[PRIM].push_back(d);
596       orders[PRIM].push_back(h);
597       data[PRIM].holes[4].push_back(virtual_header_location);
598    }
599 
600    if (*pack_vert_data_into_header) {
601       orders[VERT].push_back(d);
602       orders[VERT].push_back(h);
603       data[VERT].holes[4].push_back(virtual_header_location);
604    }
605 
606    u_foreach_bit64(location, outputs_written) {
607       if ((BITFIELD64_BIT(location) & outputs_written) == 0)
608          continue;
609 
610       /* At this point there are both complete and split variables as
611        * outputs. We need the complete variable to compute the required
612        * size.
613        */
614       nir_variable *var =
615             brw_nir_find_complete_variable_with_location(nir,
616                                                          nir_var_shader_out,
617                                                          location);
618 
619       d.location = location;
620       d.type     = brw_nir_get_var_type(nir, var);
621       d.dwords   = glsl_count_dword_slots(d.type, false);
622       d.slots    = glsl_count_attribute_slots(d.type, false);
623 
624       struct attr_type_info *type_data;
625 
626       if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
627          type_data = &data[PRIM];
628       else if (var->data.interpolation == INTERP_MODE_FLAT)
629          type_data = &data[VERT_FLAT];
630       else
631          type_data = &data[VERT];
632 
633       std::list<struct attr_desc> *order = type_data->order;
634       std::list<int> *holes = type_data->holes;
635 
636       outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
637 
638       /* special case: fill the 4-dword hole reserved in the header */
639       if (d.dwords == 4 && !holes[4].empty()) {
640          holes[4].pop_back();
641 
642          assert(order->front().location == virtual_header_location);
643          order->pop_front();
644 
645          assert(order->front().location == -1);
646          assert(order->front().dwords == 4);
647          order->front() = d;
648 
649          continue;
650       }
651 
652       int mod = d.dwords % 4;
653       if (mod == 0) {
654          order->push_back(d);
655          continue;
656       }
657 
658       h.location = -1;
659       h.type = NULL;
660       h.dwords = 4 - mod;
661       h.slots = 0;
662 
663       if (!compiler->mesh.mue_compaction) {
664          order->push_back(d);
665          order->push_back(h);
666          continue;
667       }
668 
669       if (d.dwords > 4) {
670          order->push_back(d);
671          order->push_back(h);
672          holes[h.dwords].push_back(location);
673          continue;
674       }
675 
676       assert(d.dwords < 4);
677 
678       unsigned found = 0;
679       /* try to find the smallest hole big enough to hold this attribute */
680       for (unsigned sz = d.dwords; sz <= 4; sz++){
681          if (!holes[sz].empty()) {
682             found = sz;
683             break;
684          }
685       }
686 
687       /* append at the end if not found */
688       if (found == 0) {
689          order->push_back(d);
690          order->push_back(h);
691          holes[h.dwords].push_back(location);
692 
693          continue;
694       }
695 
696       assert(found <= 4);
697       assert(!holes[found].empty());
698       int after_loc = holes[found].back();
699       holes[found].pop_back();
700 
701       bool inserted_back = false;
702 
703       for (auto it = order->begin(); it != order->end(); ++it) {
704          if ((*it).location != after_loc)
705             continue;
706 
707          ++it;
708          /* must be a hole */
709          assert((*it).location < 0);
710          /* and it must be big enough */
711          assert(d.dwords <= (*it).dwords);
712 
713          if (d.dwords == (*it).dwords) {
714             /* exact size, just replace */
715             *it = d;
716          } else {
717             /* inexact size, shrink hole */
718             (*it).dwords -= d.dwords;
719             /* and insert new attribute before it */
720             order->insert(it, d);
721 
722             /* Insert shrunk hole in a spot so that the order of attributes
723              * is preserved.
724              */
725             std::list<int> &hole_list = holes[(*it).dwords];
726             std::list<int>::iterator insert_before = hole_list.end();
727 
728             for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
729                if ((*it2) >= (int)location) {
730                   insert_before = it2;
731                   break;
732                }
733             }
734 
735             hole_list.insert(insert_before, location);
736          }
737 
738          inserted_back = true;
739          break;
740       }
741 
742       assert(inserted_back);
743    }
744 
745    if (*pack_prim_data_into_header) {
746       if (orders[PRIM].front().location == virtual_header_location)
747          orders[PRIM].pop_front();
748 
749       if (!data[PRIM].holes[4].empty()) {
750          *pack_prim_data_into_header = false;
751 
752          assert(orders[PRIM].front().location == -1);
753          assert(orders[PRIM].front().dwords == 4);
754          orders[PRIM].pop_front();
755       }
756 
757       if (*pack_prim_data_into_header) {
758          unsigned sz = brw_sum_size(orders[PRIM]);
759 
760          if (sz % 8 == 0 || sz % 8 > 4)
761             *pack_prim_data_into_header = false;
762       }
763    }
764 
765    if (*pack_vert_data_into_header) {
766       if (orders[VERT].front().location == virtual_header_location)
767          orders[VERT].pop_front();
768 
769       if (!data[VERT].holes[4].empty()) {
770          *pack_vert_data_into_header = false;
771 
772          assert(orders[VERT].front().location == -1);
773          assert(orders[VERT].front().dwords == 4);
774          orders[VERT].pop_front();
775       }
776 
777       if (*pack_vert_data_into_header) {
778          unsigned sz = brw_sum_size(orders[VERT]) +
779                        brw_sum_size(orders[VERT_FLAT]);
780 
781          if (sz % 8 == 0 || sz % 8 > 4)
782             *pack_vert_data_into_header = false;
783       }
784    }
785 
786 
787    if (INTEL_DEBUG(DEBUG_MESH)) {
788       fprintf(stderr, "MUE attribute order:\n");
789       for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
790          if (!orders[i].empty())
791             fprintf(stderr, "%d: ", i);
792          for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
793             fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
794          }
795          if (!orders[i].empty())
796             fprintf(stderr, "\n");
797       }
798    }
799 }
800 
801 /* Mesh URB Entry consists of an initial section
802  *
803  *  - Primitive Count
804  *  - Primitive Indices (from 0 to Max-1)
805  *  - Padding to 32B if needed
806  *
807  * optionally followed by a section for per-primitive data,
808  * in which each primitive (from 0 to Max-1) gets
809  *
810  *  - Primitive Header (e.g. ViewportIndex)
811  *  - Primitive Custom Attributes
812  *
813  * then followed by a section for per-vertex data
814  *
815  *  - Vertex Header (e.g. Position)
816  *  - Vertex Custom Attributes
817  *
818  * Each per-element section has a pitch and a starting offset.  All the
819  * individual attribute offsets in start_dw are relative to the first entry
820  * of the section (i.e. where the Position of the first vertex, or the
821  * ViewportIndex of the first primitive, lives).  Attributes for other
822  * elements are calculated using the pitch.
823  */
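/* Illustrative sizing of the initial section: with BRW_INDEX_FORMAT_U32,
 * triangles (3 indices per primitive) and max_primitives = 16, the indices
 * take 3 * 16 = 48 dwords after the 1-dword primitive count, and the
 * per-primitive section starts at ALIGN(49, 8) = 56 dwords.
 */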
824 static void
825 brw_compute_mue_map(const struct brw_compiler *compiler,
826                     struct nir_shader *nir, struct brw_mue_map *map,
827                     enum brw_mesh_index_format index_format, bool compact_mue)
828 {
829    memset(map, 0, sizeof(*map));
830 
831    memset(&map->start_dw[0], -1, sizeof(map->start_dw));
832    memset(&map->len_dw[0], 0, sizeof(map->len_dw));
833 
834    unsigned vertices_per_primitive =
835       mesa_vertices_per_prim(nir->info.mesh.primitive_type);
836 
837    map->max_primitives = nir->info.mesh.max_primitives_out;
838    map->max_vertices = nir->info.mesh.max_vertices_out;
839 
840    uint64_t outputs_written = nir->info.outputs_written;
841 
842    /* One dword for the primitive count, then K extra dwords for each primitive. */
843    switch (index_format) {
844    case BRW_INDEX_FORMAT_U32:
845       map->per_primitive_indices_dw = vertices_per_primitive;
846       break;
847    case BRW_INDEX_FORMAT_U888X:
848       map->per_primitive_indices_dw = 1;
849       break;
850    default:
851       unreachable("invalid index format");
852    }
853 
854    map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
855                                        map->max_primitives + 1, 8);
856 
857    /* Assign initial section. */
858    if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
859       map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
860       map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
861       outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
862    }
863    if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
864       map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
865       map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
866             map->per_primitive_indices_dw * map->max_primitives;
867       outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
868    }
869 
870    const uint64_t per_primitive_header_bits =
871          BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
872          BITFIELD64_BIT(VARYING_SLOT_LAYER) |
873          BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
874          BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
875 
876    const uint64_t per_vertex_header_bits =
877          BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
878          BITFIELD64_BIT(VARYING_SLOT_POS) |
879          BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
880          BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
881 
882    std::list<struct attr_desc> orders[3];
883    uint64_t regular_outputs = outputs_written &
884          ~(per_primitive_header_bits | per_vertex_header_bits);
885 
886    /* packing into prim header is possible only if prim header is present */
887    map->user_data_in_primitive_header = compact_mue &&
888          (outputs_written & per_primitive_header_bits) != 0;
889 
890    /* Packing into vert header is always possible, but we allow it only
891     * if full vec4 is available (so point size is not used) and there's
892     * nothing between it and normal vertex data (so no clip distances).
893     */
894    map->user_data_in_vertex_header = compact_mue &&
895          (outputs_written & per_vertex_header_bits) ==
896                BITFIELD64_BIT(VARYING_SLOT_POS);
897 
898    if (outputs_written & per_primitive_header_bits) {
899       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
900          map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
901                map->per_primitive_start_dw + 0;
902          map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
903       }
904 
905       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
906          map->start_dw[VARYING_SLOT_LAYER] =
907                map->per_primitive_start_dw + 1; /* RTAIndex */
908          map->len_dw[VARYING_SLOT_LAYER] = 1;
909       }
910 
911       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
912           map->start_dw[VARYING_SLOT_VIEWPORT] =
913                 map->per_primitive_start_dw + 2;
914           map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
915       }
916 
917       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
918          map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
919                map->per_primitive_start_dw + 3;
920          map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
921       }
922 
923       map->per_primitive_header_size_dw = 8;
924       outputs_written &= ~per_primitive_header_bits;
925    } else {
926       map->per_primitive_header_size_dw = 0;
927    }
928 
929    map->per_primitive_data_size_dw = 0;
930 
931    /* For fast linked libraries, we can't pack the MUE, as the fragment shader
932     * will be compiled without access to the MUE map and won't be able to find
933     * out where everything is.
934     * Instead, keep doing things as we did before the packing, just laying out
935     * everything in varying order, which is how the FS will expect them.
936     */
937    if (compact_mue) {
938       brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
939                              &map->user_data_in_primitive_header,
940                              &map->user_data_in_vertex_header);
941 
942       unsigned start_dw = map->per_primitive_start_dw;
943       if (map->user_data_in_primitive_header)
944          start_dw += 4; /* first 4 dwords are used */
945       else
946          start_dw += map->per_primitive_header_size_dw;
947       unsigned header_used_dw = 0;
948 
949       for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
950          int location = (*it).location;
951          if (location < 0) {
952             start_dw += (*it).dwords;
953             if (map->user_data_in_primitive_header && header_used_dw < 4)
954                header_used_dw += (*it).dwords;
955             else
956                map->per_primitive_data_size_dw += (*it).dwords;
957             assert(header_used_dw <= 4);
958             continue;
959          }
960 
961          assert(map->start_dw[location] == -1);
962 
963          assert(location == VARYING_SLOT_PRIMITIVE_ID ||
964                 location >= VARYING_SLOT_VAR0);
965 
966          brw_mue_assign_position(&*it, map, start_dw);
967 
968          start_dw += (*it).dwords;
969          if (map->user_data_in_primitive_header && header_used_dw < 4)
970             header_used_dw += (*it).dwords;
971          else
972             map->per_primitive_data_size_dw += (*it).dwords;
973          assert(header_used_dw <= 4);
974          outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
975       }
976    } else {
977       unsigned start_dw = map->per_primitive_start_dw +
978                           map->per_primitive_header_size_dw;
979 
980       uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
981       while (per_prim_outputs) {
982          uint64_t location = ffsll(per_prim_outputs) - 1;
983 
984          assert(map->start_dw[location] == -1);
985          assert(location == VARYING_SLOT_PRIMITIVE_ID ||
986                 location >= VARYING_SLOT_VAR0);
987 
988          nir_variable *var =
989             brw_nir_find_complete_variable_with_location(nir,
990                                                          nir_var_shader_out,
991                                                          location);
992          struct attr_desc d;
993          d.location = location;
994          d.type     = brw_nir_get_var_type(nir, var);
995          d.dwords   = glsl_count_dword_slots(d.type, false);
996          d.slots    = glsl_count_attribute_slots(d.type, false);
997 
998          brw_mue_assign_position(&d, map, start_dw);
999 
1000          map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
1001          start_dw += ALIGN(d.dwords, 4);
1002 
1003          per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1004       }
1005    }
1006 
1007    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
1008                                        map->per_primitive_data_size_dw, 8);
1009 
1010    map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
1011                                     map->per_primitive_pitch_dw *
1012                                     map->max_primitives, 8);
1013 
1014    /* TODO(mesh): Multiview. */
1015    unsigned fixed_header_size = 8;
1016    map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
1017                                           nir->info.clip_distance_array_size +
1018                                           nir->info.cull_distance_array_size, 8);
1019 
1020    if (outputs_written & per_vertex_header_bits) {
1021       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
1022          map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
1023          map->len_dw[VARYING_SLOT_PSIZ] = 1;
1024       }
1025 
1026       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
1027          map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
1028          map->len_dw[VARYING_SLOT_POS] = 4;
1029       }
1030 
1031       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
1032          map->start_dw[VARYING_SLOT_CLIP_DIST0] =
1033                map->per_vertex_start_dw + fixed_header_size + 0;
1034          map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
1035       }
1036 
1037       if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
1038          map->start_dw[VARYING_SLOT_CLIP_DIST1] =
1039                map->per_vertex_start_dw + fixed_header_size + 4;
1040          map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
1041       }
1042 
1043       outputs_written &= ~per_vertex_header_bits;
1044    }
1045 
1046    /* cull distances should be lowered earlier */
1047    assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
1048    assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
1049 
1050    map->per_vertex_data_size_dw = 0;
1051 
1052    /* For fast linked libraries, we can't pack the MUE, as the fragment shader
1053     * will be compiled without access to the MUE map and won't be able to find
1054     * out where everything is.
1055     * Instead, keep doing things as we did before the packing, just laying out
1056     * everything in varying order, which is how the FS will expect them.
1057     */
1058    if (compact_mue) {
1059       unsigned start_dw = map->per_vertex_start_dw;
1060       if (!map->user_data_in_vertex_header)
1061          start_dw += map->per_vertex_header_size_dw;
1062 
1063       unsigned header_used_dw = 0;
1064       for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
1065          for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
1066             int location = (*it).location;
1067             if (location < 0) {
1068                start_dw += (*it).dwords;
1069                if (map->user_data_in_vertex_header && header_used_dw < 4) {
1070                   header_used_dw += (*it).dwords;
1071                   assert(header_used_dw <= 4);
1072                   if (header_used_dw == 4)
1073                      start_dw += 4; /* jump over gl_position */
1074                } else {
1075                   map->per_vertex_data_size_dw += (*it).dwords;
1076                }
1077                continue;
1078             }
1079 
1080             assert(map->start_dw[location] == -1);
1081 
1082             assert(location >= VARYING_SLOT_VAR0);
1083 
1084             brw_mue_assign_position(&*it, map, start_dw);
1085 
1086             start_dw += (*it).dwords;
1087             if (map->user_data_in_vertex_header && header_used_dw < 4) {
1088                header_used_dw += (*it).dwords;
1089                assert(header_used_dw <= 4);
1090                if (header_used_dw == 4)
1091                   start_dw += 4; /* jump over gl_position */
1092             } else {
1093                map->per_vertex_data_size_dw += (*it).dwords;
1094             }
1095             outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
1096          }
1097       }
1098    } else {
1099       unsigned start_dw = map->per_vertex_start_dw +
1100                           map->per_vertex_header_size_dw;
1101 
1102       uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
1103       while (per_vertex_outputs) {
1104          uint64_t location = ffsll(per_vertex_outputs) - 1;
1105 
1106          assert(map->start_dw[location] == -1);
1107          assert(location >= VARYING_SLOT_VAR0);
1108 
1109          nir_variable *var =
1110             brw_nir_find_complete_variable_with_location(nir,
1111                                                          nir_var_shader_out,
1112                                                          location);
1113          struct attr_desc d;
1114          d.location = location;
1115          d.type     = brw_nir_get_var_type(nir, var);
1116          d.dwords   = glsl_count_dword_slots(d.type, false);
1117          d.slots    = glsl_count_attribute_slots(d.type, false);
1118 
1119          brw_mue_assign_position(&d, map, start_dw);
1120 
1121          map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
1122          start_dw += ALIGN(d.dwords, 4);
1123 
1124          per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1125       }
1126    }
1127 
1128    map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
1129                                     map->per_vertex_data_size_dw, 8);
1130 
1131    map->size_dw =
1132       map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
1133 
1134    assert(map->size_dw % 8 == 0);
1135 }
1136 
1137 static void
1138 brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
1139 {
1140    fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
1141            map->size_dw, map->max_primitives, map->max_vertices);
1142    fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
1143            map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
1144            map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
1145            map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
1146    fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
1147            map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
1148            map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
1149            map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
1150 
1151    fprintf(fp, "  ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
1152            map->per_primitive_start_dw,
1153            map->per_primitive_header_size_dw,
1154            map->per_primitive_data_size_dw,
1155            map->per_primitive_pitch_dw);
1156 
1157    for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1158       if (map->start_dw[i] < 0)
1159          continue;
1160 
1161       const unsigned offset = map->start_dw[i];
1162       const unsigned len = map->len_dw[i];
1163 
1164       if (offset < map->per_primitive_start_dw ||
1165           offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
1166          continue;
1167 
1168       const char *name =
1169             gl_varying_slot_name_for_stage((gl_varying_slot)i,
1170                                            MESA_SHADER_MESH);
1171 
1172       fprintf(fp, "  <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
1173               name, i);
1174    }
1175 
1176    fprintf(fp, "  ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
1177            map->per_vertex_start_dw,
1178            map->per_vertex_header_size_dw,
1179            map->per_vertex_data_size_dw,
1180            map->per_vertex_pitch_dw);
1181 
1182    for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1183       if (map->start_dw[i] < 0)
1184          continue;
1185 
1186       const unsigned offset = map->start_dw[i];
1187       const unsigned len = map->len_dw[i];
1188 
1189       if (offset < map->per_vertex_start_dw ||
1190           offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
1191          continue;
1192 
1193       nir_variable *var =
1194             nir_find_variable_with_location(nir, nir_var_shader_out, i);
1195       bool flat = var->data.interpolation == INTERP_MODE_FLAT;
1196 
1197       const char *name =
1198             gl_varying_slot_name_for_stage((gl_varying_slot)i,
1199                                            MESA_SHADER_MESH);
1200 
1201       fprintf(fp, "  <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
1202               name, i, flat ? " (flat)" : "");
1203    }
1204 
1205    fprintf(fp, "\n");
1206 }
1207 
1208 static void
1209 brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
1210 {
1211    nir_foreach_shader_out_variable(var, nir) {
1212       int location = var->data.location;
1213       assert(location >= 0);
1214       assert(map->start_dw[location] != -1);
1215       var->data.driver_location = map->start_dw[location];
1216    }
1217 
1218    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
1219             type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
1220 }
1221 
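/* Zeroes every MUE Primitive Header cooperatively across the workgroup.
 * Illustrative example: with workgroup_size = 32 and max_primitives = 80,
 * prims_per_inv = 2, so each invocation i zeroes the headers of primitives
 * i and i + 32; the remaining 16 headers (64..79) are zeroed by invocations
 * 0..15 under the trailing bounds check.
 */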
1222 static void
1223 brw_nir_initialize_mue(nir_shader *nir,
1224                        const struct brw_mue_map *map,
1225                        unsigned dispatch_width)
1226 {
1227    assert(map->per_primitive_header_size_dw > 0);
1228 
1229    nir_builder b;
1230    nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
1231    b = nir_builder_at(nir_before_impl(entrypoint));
1232 
1233    nir_def *dw_off = nir_imm_int(&b, 0);
1234    nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);
1235 
1236    /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */
1237 
1238    assert(!nir->info.workgroup_size_variable);
1239    const unsigned workgroup_size = nir->info.workgroup_size[0] *
1240                                    nir->info.workgroup_size[1] *
1241                                    nir->info.workgroup_size[2];
1242 
1243    /* Invocations from a single workgroup will cooperate in zeroing MUE. */
1244 
1245    /* How many prims does each invocation need to cover without checking its index? */
1246    unsigned prims_per_inv = map->max_primitives / workgroup_size;
1247 
1248    /* Zero first 4 dwords of MUE Primitive Header:
1249     * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
1250     */
1251 
1252    nir_def *local_invocation_index = nir_load_local_invocation_index(&b);
1253 
1254    /* Zero primitive headers distanced by workgroup_size, starting from
1255     * invocation index.
1256     */
1257    for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
1258       nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
1259                                            prim_in_inv * workgroup_size);
1260 
1261       nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
1262                                      .base = (int)map->per_primitive_start_dw,
1263                                      .write_mask = WRITEMASK_XYZW,
1264                                      .component = 0,
1265                                      .src_type = nir_type_uint32);
1266    }
1267 
1268    /* How many prims are left? */
1269    unsigned remaining = map->max_primitives % workgroup_size;
1270 
1271    if (remaining) {
1272       /* Zero "remaining" primitive headers starting from the last one covered
1273        * by the loop above + workgroup_size.
1274        */
1275       nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
1276       nir_if *if_stmt = nir_push_if(&b, cmp);
1277       {
1278          nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
1279                                                prims_per_inv * workgroup_size);
1280 
1281          nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
1282                                         .base = (int)map->per_primitive_start_dw,
1283                                         .write_mask = WRITEMASK_XYZW,
1284                                         .component = 0,
1285                                         .src_type = nir_type_uint32);
1286       }
1287       nir_pop_if(&b, if_stmt);
1288    }
1289 
1290    /* If there's more than one subgroup, then we need to wait for all of them
1291     * to finish initialization before we can proceed. Otherwise some subgroups
1292     * may start filling the MUE before others have finished initializing.
1293     */
1294    if (workgroup_size > dispatch_width) {
1295       nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
1296                          NIR_MEMORY_ACQ_REL, nir_var_shader_out);
1297    }
1298 
1299    if (remaining) {
1300       nir_metadata_preserve(entrypoint, nir_metadata_none);
1301    } else {
1302       nir_metadata_preserve(entrypoint, nir_metadata_control_flow);
1303    }
1304 }
1305 
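/* Folds the arrayed (per-vertex / per-primitive) index into the offset using
 * the element pitch.  For example, with an illustrative pitch of 16 dwords,
 * vertex index 3 and offset 4 become a flat offset of 4 + 3 * 16 = 52.
 */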
1306 static void
1307 brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
1308 {
1309    nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
1310    nir_src *offset_src = nir_get_io_offset_src(intrin);
1311 
1312    b->cursor = nir_before_instr(&intrin->instr);
1313    nir_def *offset =
1314       nir_iadd(b,
1315                offset_src->ssa,
1316                nir_imul_imm(b, index_src->ssa, pitch));
1317    nir_src_rewrite(offset_src, offset);
1318 }
1319 
1320 static bool
1321 brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
1322                                                 nir_intrinsic_instr *intrin,
1323                                                 void *data)
1324 {
1325    const struct brw_mue_map *map = (const struct brw_mue_map *) data;
1326 
1327    /* Remap per_vertex and per_primitive offsets using the extra source and
1328     * the pitch.
1329     */
1330    switch (intrin->intrinsic) {
1331    case nir_intrinsic_load_per_vertex_output:
1332    case nir_intrinsic_store_per_vertex_output:
1333       brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);
1334 
1335       return true;
1336 
1337    case nir_intrinsic_load_per_primitive_output:
1338    case nir_intrinsic_store_per_primitive_output: {
1339       struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
1340       uint32_t pitch;
1341       if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
1342          pitch = map->per_primitive_indices_dw;
1343       else
1344          pitch = map->per_primitive_pitch_dw;
1345 
1346       brw_nir_adjust_offset(b, intrin, pitch);
1347 
1348       return true;
1349    }
1350 
1351    default:
1352       return false;
1353    }
1354 }
1355 
1356 static bool
1357 brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
1358 {
1359    return nir_shader_intrinsics_pass(nir,
1360                                        brw_nir_adjust_offset_for_arrayed_indices_instr,
1361                                        nir_metadata_control_flow,
1362                                        (void *)map);
1363 }
1364 
1365 struct index_packing_state {
1366    unsigned vertices_per_primitive;
1367    nir_variable *original_prim_indices;
1368    nir_variable *packed_prim_indices;
1369 };
1370 
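/* With BRW_INDEX_FORMAT_U888X the vertex indices of a primitive (up to three
 * for triangles) are packed into a single dword, one byte per index, so the
 * packing is only legal when every index fits in a byte, i.e.
 * max_vertices_out <= 255.
 */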
1371 static bool
1372 brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
1373 {
1374    /* Can a single index fit into one byte of the U888X format? */
1375    if (nir->info.mesh.max_vertices_out > 255)
1376       return false;
1377 
1378    state->vertices_per_primitive =
1379          mesa_vertices_per_prim(nir->info.mesh.primitive_type);
1380    /* packing point indices doesn't help */
1381    if (state->vertices_per_primitive == 1)
1382       return false;
1383 
1384    state->original_prim_indices =
1385       nir_find_variable_with_location(nir,
1386                                       nir_var_shader_out,
1387                                       VARYING_SLOT_PRIMITIVE_INDICES);
1388    /* no indices = no changes to the shader, but it's still worth it,
1389     * because less URB space will be used
1390     */
1391    if (!state->original_prim_indices)
1392       return true;
1393 
1394    ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
1395    assert(glsl_type_is_array(type));
1396    assert(glsl_type_is_vector(glsl_without_array(type)));
1397    assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);
1398 
1399    nir_foreach_function_impl(impl, nir) {
1400       nir_foreach_block(block, impl) {
1401          nir_foreach_instr(instr, block) {
1402             if (instr->type != nir_instr_type_intrinsic)
1403                continue;
1404 
1405             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1406 
1407             if (intrin->intrinsic != nir_intrinsic_store_deref) {
1408                /* any unknown deref operation on primitive indices -> don't pack */
1409                unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
1410                for (unsigned i = 0; i < num_srcs; i++) {
1411                   nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
1412                   if (!deref)
1413                      continue;
1414                   nir_variable *var = nir_deref_instr_get_variable(deref);
1415 
1416                   if (var == state->original_prim_indices)
1417                      return false;
1418                }
1419 
1420                continue;
1421             }
1422 
1423             nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1424             if (!deref)
1425                continue;
1426 
1427             nir_variable *var = nir_deref_instr_get_variable(deref);
1428             if (var != state->original_prim_indices)
1429                continue;
1430 
1431             if (deref->deref_type != nir_deref_type_array)
1432                return false; /* unknown chain of derefs */
1433 
1434             nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
1435             if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1436                return false; /* unknown chain of derefs */
1437 
1438             assert(var_deref->var == state->original_prim_indices);
1439 
1440             unsigned write_mask = nir_intrinsic_write_mask(intrin);
1441 
1442             /* If only some components are written, then we can't easily pack.
1443              * In theory we could, by loading the current dword, masking in
1444              * the written byte and storing the whole dword back, but that
1445              * would be slow and could actually decrease performance.
1446              * TODO: reevaluate this once something actually hits this path.
1447              */
1448             if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
1449                return false;
1450          }
1451       }
1452    }
1453 
1454    return true;
1455 }
1456 
1457 static bool
1458 brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
1459                                  void *data)
1460 {
1461    if (intrin->intrinsic != nir_intrinsic_store_deref)
1462       return false;
1463 
1464    nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
1465    if (!array_deref || array_deref->deref_type != nir_deref_type_array)
1466       return false;
1467 
1468    nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
1469    if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1470       return false;
1471 
1472    struct index_packing_state *state =
1473          (struct index_packing_state *)data;
1474 
1475    nir_variable *var = var_deref->var;
1476 
1477    if (var != state->original_prim_indices)
1478       return false;
1479 
1480    unsigned vertices_per_primitive = state->vertices_per_primitive;
1481 
1482    b->cursor = nir_before_instr(&intrin->instr);
1483 
1484    nir_deref_instr *new_var_deref =
1485          nir_build_deref_var(b, state->packed_prim_indices);
1486    nir_deref_instr *new_array_deref =
1487          nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
1488 
1489    nir_src *data_src = &intrin->src[1];
1490    nir_def *data_def =
1491          data_src->ssa;
1492 
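   /* Build the packed U888X dword: byte 0 = first index, byte 1 = second,
    * byte 2 = third (for triangles), byte 3 left unused.
    */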
1493    nir_def *new_data =
1494          nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
1495                     nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
1496 
1497    if (vertices_per_primitive >= 3) {
1498       new_data =
1499             nir_ior(b, new_data,
1500                        nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
1501    }
1502 
1503    nir_build_store_deref(b, &new_array_deref->def, new_data);
1504 
1505    nir_instr_remove(&intrin->instr);
1506 
1507    return true;
1508 }
1509 
1510 static bool
1511 brw_pack_primitive_indices(nir_shader *nir, void *data)
1512 {
1513    struct index_packing_state *state = (struct index_packing_state *)data;
1514 
1515    const struct glsl_type *new_type =
1516          glsl_array_type(glsl_uint_type(),
1517                          nir->info.mesh.max_primitives_out,
1518                          0);
1519 
1520    state->packed_prim_indices =
1521          nir_variable_create(nir, nir_var_shader_out,
1522                              new_type, "gl_PrimitiveIndicesPacked");
1523    state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
1524    state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
1525    state->packed_prim_indices->data.per_primitive = 1;
1526 
1527    return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
1528                                        nir_metadata_control_flow,
1529                                        data);
1530 }
1531 
1532 static bool
1533 brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader *nir,
1534                           struct brw_mue_map *map)
1535 {
1536    /* Auto-striping can be enabled when the shader either doesn't write to
1537     * RTA Index and VP Index, or writes the same values for all primitives.
1538     * Since determining whether the shader writes the same value across the
1539     * whole workgroup (not just the subgroup!) is tricky, we do the simplest
1540     * thing - say yes only when the shader writes constants and they all match.
1541     *
1542     * TODO: improve this
1543     */
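   /* For example, a shader storing gl_ViewportIndex = 1 for every primitive
    * still allows auto-striping, while any non-constant store (even one that
    * happens to be uniform at runtime) conservatively disables it.
    */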
1544 
1545    if (compiler->devinfo->ver < 20)
1546       return false;
1547 
1548    if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 &&
1549        map->start_dw[VARYING_SLOT_LAYER] < 0)
1550       return true;
1551 
1552    nir_def *vp = NULL;
1553    nir_def *layer = NULL;
1554 
1555    nir_foreach_function(function, nir) {
1556       if (!function->impl)
1557          continue;
1558 
1559       nir_foreach_block(block, function->impl) {
1560          nir_foreach_instr(instr, block) {
1561             if (instr->type != nir_instr_type_intrinsic)
1562                continue;
1563 
1564             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1565             if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
1566                continue;
1567 
1568             struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
1569             bool is_vp = io.location == VARYING_SLOT_VIEWPORT;
1570             bool is_layer = io.location == VARYING_SLOT_LAYER;
1571             if (!is_vp && !is_layer)
1572                continue;
1573 
1574             nir_src *src = &intrin->src[0];
1575 
1576             if (!nir_src_is_const(*src))
1577                return false;
1578 
1579             nir_def **cmp;
1580             if (is_vp)
1581                cmp = &vp;
1582             else
1583                cmp = &layer;
1584 
1585             if (*cmp == NULL)
1586                *cmp = src->ssa;
1587             else if (*cmp != src->ssa)
1588                return false;
1589          }
1590       }
1591    }
1592 
1593    return true;
1594 }
1595 
1596 const unsigned *
1597 brw_compile_mesh(const struct brw_compiler *compiler,
1598                  struct brw_compile_mesh_params *params)
1599 {
1600    struct nir_shader *nir = params->base.nir;
1601    const struct brw_mesh_prog_key *key = params->key;
1602    struct brw_mesh_prog_data *prog_data = params->prog_data;
1603    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);
1604 
1605    prog_data->base.base.stage = MESA_SHADER_MESH;
1606    prog_data->base.base.total_shared = nir->info.shared_size;
1607    prog_data->base.base.total_scratch = 0;
1608 
1609    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
1610    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
1611    prog_data->base.local_size[2] = nir->info.workgroup_size[2];
1612 
1613    prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
1614    prog_data->cull_distance_mask =
1615          ((1 << nir->info.cull_distance_array_size) - 1) <<
1616           nir->info.clip_distance_array_size;
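   /* Clip and cull distances share one packed array, clip first: e.g. with
    * 2 clip + 3 cull distances, clip_distance_mask = 0b00011 and
    * cull_distance_mask = 0b11100.
    */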
1617    prog_data->primitive_type = nir->info.mesh.primitive_type;
1618 
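   /* Prefer packing primitive indices into U888X (one dword per primitive) to
    * shrink the MUE; otherwise fall back to one dword per index (U32).
    */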
1619    struct index_packing_state index_packing_state = {};
1620    if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
1621       if (index_packing_state.original_prim_indices)
1622          NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
1623       prog_data->index_format = BRW_INDEX_FORMAT_U888X;
1624    } else {
1625       prog_data->index_format = BRW_INDEX_FORMAT_U32;
1626    }
1627 
1628    prog_data->uses_drawid =
1629       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
1630 
1631    brw_nir_lower_tue_inputs(nir, params->tue_map);
1632 
1633    brw_compute_mue_map(compiler, nir, &prog_data->map,
1634                        prog_data->index_format, key->compact_mue);
1635    brw_nir_lower_mue_outputs(nir, &prog_data->map);
1636 
1637    prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
1638 
1639    brw_simd_selection_state simd_state{
1640       .devinfo = compiler->devinfo,
1641       .prog_data = &prog_data->base,
1642       .required_width = brw_required_dispatch_width(&nir->info),
1643    };
1644 
1645    std::unique_ptr<fs_visitor> v[3];
1646 
1647    for (int simd = 0; simd < 3; simd++) {
1648       if (!brw_simd_should_compile(simd_state, simd))
1649          continue;
1650 
1651       const unsigned dispatch_width = 8 << simd;
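      /* simd 0/1/2 correspond to SIMD8/SIMD16/SIMD32 dispatch. */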
1652 
1653       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
1654 
1655       /*
1656        * When the Primitive Header is enabled, we may not generate writes to all
1657        * fields, so let's initialize everything.
1658        */
1659       if (prog_data->map.per_primitive_header_size_dw > 0)
1660          NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
1661 
1662       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
1663 
1664       NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
1665       /* The load_uniform lowering handles constants better, so fold them before it. */
1666       NIR_PASS(_, shader, nir_opt_constant_folding);
1667       NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
1668 
1669       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
1670 
1671       brw_postprocess_nir(shader, compiler, debug_enabled,
1672                           key->base.robust_flags);
1673 
1674       v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
1675                                              &key->base,
1676                                              &prog_data->base.base,
1677                                              shader, dispatch_width,
1678                                              params->base.stats != NULL,
1679                                              debug_enabled);
1680 
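      /* If another SIMD width already compiled, reuse its uniform setup
       * (this is assumed to keep push constant locations consistent across
       * the variants tracked in prog_mask).
       */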
1681       if (prog_data->base.prog_mask) {
1682          unsigned first = ffs(prog_data->base.prog_mask) - 1;
1683          v[simd]->import_uniforms(v[first].get());
1684       }
1685 
1686       const bool allow_spilling = !brw_simd_any_compiled(simd_state);
1687       if (run_task_mesh(*v[simd], allow_spilling))
1688          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
1689       else
1690          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
1691    }
1692 
1693    int selected_simd = brw_simd_select(simd_state);
1694    if (selected_simd < 0) {
1695       params->base.error_str =
1696          ralloc_asprintf(params->base.mem_ctx,
1697                          "Can't compile shader: "
1698                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
1699                          simd_state.error[0], simd_state.error[1],
1700                          simd_state.error[2]);
1701       return NULL;
1702    }
1703 
1704    fs_visitor *selected = v[selected_simd].get();
1705    prog_data->base.prog_mask = 1 << selected_simd;
1706 
1707    if (unlikely(debug_enabled)) {
1708       if (params->tue_map) {
1709          fprintf(stderr, "Mesh Input ");
1710          brw_print_tue_map(stderr, params->tue_map);
1711       }
1712       fprintf(stderr, "Mesh Output ");
1713       brw_print_mue_map(stderr, &prog_data->map, nir);
1714    }
1715 
1716    fs_generator g(compiler, &params->base, &prog_data->base.base,
1717                   MESA_SHADER_MESH);
1718    if (unlikely(debug_enabled)) {
1719       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
1720                                      "%s mesh shader %s",
1721                                      nir->info.label ? nir->info.label
1722                                                      : "unnamed",
1723                                      nir->info.name));
1724    }
1725 
1726    g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
1727                    selected->performance_analysis.require(), params->base.stats);
1728    g.add_const_data(nir->constant_data, nir->constant_data_size);
1729    return g.get_assembly();
1730 }
1731