1 /*
2 * Copyright © 2021 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <list>
25 #include <vector>
26 #include "brw_compiler.h"
27 #include "brw_fs.h"
28 #include "brw_fs_builder.h"
29 #include "brw_nir.h"
30 #include "brw_private.h"
31 #include "compiler/nir/nir_builder.h"
32 #include "dev/intel_debug.h"
33
34 #include <memory>
35
36 using namespace brw;
37
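/* Filter + lowering callback pair for nir_intrinsic_load_uniform: constant,
 * dword-aligned 32-bit scalar offsets within the push constant range are read
 * straight from the inline-data registers, everything else falls back to a
 * load from the global constant buffer whose address is passed in inline data.
 */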
38 static bool
39 brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
40 UNUSED const void *data)
41 {
42 if (instr->type != nir_instr_type_intrinsic)
43 return false;
44 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
45 return intrin->intrinsic == nir_intrinsic_load_uniform;
46 }
47
48 static nir_def *
49 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
50 UNUSED void *data)
51 {
52 assert(instr->type == nir_instr_type_intrinsic);
53 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
54 assert(intrin->intrinsic == nir_intrinsic_load_uniform);
55
56 /* Read the first few 32-bit scalars from InlineData. */
57 if (nir_src_is_const(intrin->src[0]) &&
58 intrin->def.bit_size == 32 &&
59 intrin->def.num_components == 1) {
60 unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
61 unsigned off_dw = off / 4;
62 if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
63 off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
64 return nir_load_mesh_inline_data_intel(b, 32, off_dw);
65 }
66 }
67
68 return brw_nir_load_global_const(b, intrin,
69 nir_load_mesh_inline_data_intel(b, 64, 0), 0);
70 }
71
72 static bool
73 brw_nir_lower_load_uniforms(nir_shader *nir)
74 {
75 return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
76 brw_nir_lower_load_uniforms_impl, NULL);
77 }
78
79 static inline int
80 type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
81 {
82 return glsl_count_dword_slots(type, bindless);
83 }
84
85 /* TODO(mesh): Make this a common function. */
86 static void
87 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
88 {
89 assert(glsl_type_is_vector_or_scalar(type));
90
91 uint32_t comp_size = glsl_type_is_boolean(type)
92 ? 4 : glsl_get_bit_size(type) / 8;
93 unsigned length = glsl_get_vector_elements(type);
94 *size = comp_size * length;
95 *align = comp_size * (length == 3 ? 4 : length);
96 }
97
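/* Lower nir_intrinsic_launch_mesh_workgroups: invocation 0 of the task
 * workgroup computes the task count from the 3D dispatch size and writes the
 * TUE header, then the original intrinsic is removed.
 */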
98 static bool
99 brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
100 nir_intrinsic_instr *intrin,
101 void *data)
102 {
103 if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
104 return false;
105
106 b->cursor = nir_before_instr(&intrin->instr);
107
108 nir_def *local_invocation_index = nir_load_local_invocation_index(b);
109
110 /* Make sure that the mesh workgroup size is taken from the first invocation
111 * (nir_intrinsic_launch_mesh_workgroups requirement)
112 */
113 nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
114 nir_if *if_stmt = nir_push_if(b, cmp);
115 {
116 /* TUE header contains 4 words:
117 *
118 * - Word 0 for Task Count.
119 *
120 * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
121 * 3D dispatch into the 1D dispatch supported by HW.
122 */
123 nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
124 nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
125 nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
126 nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
127 nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
128 nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
129 }
130 nir_pop_if(b, if_stmt);
131
132 nir_instr_remove(&intrin->instr);
133
134 return true;
135 }
136
137 static bool
138 brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
139 {
140 return nir_shader_intrinsics_pass(nir,
141 brw_nir_lower_launch_mesh_workgroups_instr,
142 nir_metadata_none,
143 NULL);
144 }
145
146 static void
147 brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
148 {
149 memset(map, 0, sizeof(*map));
150
151 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
152 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
153
154 /* From bspec: "It is suggested that SW reserve the 16 bytes following the
155 * TUE Header, and therefore start the SW-defined data structure at 32B
156 * alignment. This allows the TUE Header to always be written as 32 bytes
157 * with 32B alignment, the most optimal write performance case."
158 */
159 map->per_task_data_start_dw = 8;
160
161 /* Lowering to explicit types will start offsets from task_payload_size, so
162 * set it to start after the header.
163 */
164 nir->info.task_payload_size = map->per_task_data_start_dw * 4;
165 NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
166 nir_var_mem_task_payload, shared_type_info);
167 NIR_PASS(_, nir, nir_lower_explicit_io,
168 nir_var_mem_task_payload, nir_address_format_32bit_offset);
169
170 map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
171 }
172
173 static void
174 brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
175 {
176 fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
177 }
178
179 static bool
180 brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
181 nir_intrinsic_instr *intrin,
182 void *data)
183 {
184 switch (intrin->intrinsic) {
185 case nir_intrinsic_store_task_payload:
186 case nir_intrinsic_load_task_payload: {
187 nir_src *offset_src = nir_get_io_offset_src(intrin);
188
189 if (nir_src_is_const(*offset_src))
190 assert(nir_src_as_uint(*offset_src) % 4 == 0);
191
192 b->cursor = nir_before_instr(&intrin->instr);
193
194 /* Regular I/O uses dwords while explicit I/O used for task payload uses
195 * bytes. Normalize it to dwords.
196 *
197 * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
198 */
199
200 nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
201 nir_src_rewrite(offset_src, offset);
202
203 unsigned base = nir_intrinsic_base(intrin);
204 assert(base % 4 == 0);
205 nir_intrinsic_set_base(intrin, base / 4);
206
207 return true;
208 }
209
210 default:
211 return false;
212 }
213 }
214
215 static bool
216 brw_nir_adjust_task_payload_offsets(nir_shader *nir)
217 {
218 return nir_shader_intrinsics_pass(nir,
219 brw_nir_adjust_task_payload_offsets_instr,
220 nir_metadata_control_flow,
221 NULL);
222 }
223
224 void
225 brw_nir_adjust_payload(nir_shader *shader)
226 {
227 /* Adjustment of task payload offsets must be performed *after* the last
228 * pass that interprets them as bytes, because it changes their unit.
229 */
230 bool adjusted = false;
231 NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
232 if (adjusted) /* clean up the mess created by offset adjustments */
233 NIR_PASS(_, shader, nir_opt_constant_folding);
234 }
235
236 static bool
237 brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
238 nir_intrinsic_instr *intrin,
239 void *data)
240 {
241 if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
242 return false;
243
244 /* nir_lower_task_shader uses "range" as task payload size. */
245 unsigned range = nir_intrinsic_range(intrin);
246 /* This avoids a special case in nir_lower_task_shader that deals with a
247 * not-vec4-aligned payload when the payload_in_shared workaround is enabled.
248 */
249 nir_intrinsic_set_range(intrin, ALIGN(range, 16));
250
251 return true;
252 }
253
254 static bool
255 brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
256 {
257 return nir_shader_intrinsics_pass(nir,
258 brw_nir_align_launch_mesh_workgroups_instr,
259 nir_metadata_control_flow,
260 NULL);
261 }
262
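/* Emit a URB fence (plus a scheduling fence on its result) so that all URB
 * writes performed by the shader are flushed before the thread terminates.
 */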
263 static void
264 brw_emit_urb_fence(fs_visitor &s)
265 {
266 const fs_builder bld = fs_builder(&s).at_end();
267 brw_reg dst = bld.vgrf(BRW_TYPE_UD);
268 fs_inst *fence = bld.emit(SHADER_OPCODE_MEMORY_FENCE, dst,
269 brw_vec8_grf(0, 0),
270 brw_imm_ud(true),
271 brw_imm_ud(0));
272 fence->sfid = BRW_SFID_URB;
273 /* The logical thing here would likely be a THREADGROUP fence but that's
274 * still failing some tests like in dEQP-VK.mesh_shader.ext.query.*
275 *
276 * Gfx12.5 has a comment about this on BSpec 53533 :
277 *
278 * "If fence scope is Local or Threadgroup, HW ignores the flush type
279 * and operates as if it was set to None (no flush)"
280 *
281 * Software workaround from HSD-22014129519 indicates that a GPU fence
282 * resolves the issue.
283 */
284 fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_GPU,
285 LSC_FLUSH_TYPE_NONE, true);
286
287 bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE,
288 bld.null_reg_ud(),
289 &dst,
290 1);
291 }
292
293 static bool
294 run_task_mesh(fs_visitor &s, bool allow_spilling)
295 {
296 assert(s.stage == MESA_SHADER_TASK ||
297 s.stage == MESA_SHADER_MESH);
298
299 s.payload_ = new task_mesh_thread_payload(s);
300
301 nir_to_brw(&s);
302
303 if (s.failed)
304 return false;
305
306 brw_emit_urb_fence(s);
307
308 s.emit_cs_terminate();
309
310 brw_calculate_cfg(s);
311
312 brw_fs_optimize(s);
313
314 s.assign_curb_setup();
315
316 brw_fs_lower_3src_null_dest(s);
317 brw_fs_workaround_memory_fence_before_eot(s);
318 brw_fs_workaround_emit_dummy_mov_instruction(s);
319
320 brw_allocate_registers(s, allow_spilling);
321
322 return !s.failed;
323 }
324
325 const unsigned *
326 brw_compile_task(const struct brw_compiler *compiler,
327 struct brw_compile_task_params *params)
328 {
329 struct nir_shader *nir = params->base.nir;
330 const struct brw_task_prog_key *key = params->key;
331 struct brw_task_prog_data *prog_data = params->prog_data;
332 const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);
333
334 brw_nir_lower_tue_outputs(nir, &prog_data->map);
335
336 NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);
337
338 nir_lower_task_shader_options lower_ts_opt = {
339 .payload_to_shared_for_atomics = true,
340 .payload_to_shared_for_small_types = true,
341 /* The actual payload data starts after the TUE header and padding,
342 * so skip those when copying.
343 */
344 .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
345 };
346 NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);
347
348 NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);
349
350 prog_data->base.base.stage = MESA_SHADER_TASK;
351 prog_data->base.base.total_shared = nir->info.shared_size;
352 prog_data->base.base.total_scratch = 0;
353
354 prog_data->base.local_size[0] = nir->info.workgroup_size[0];
355 prog_data->base.local_size[1] = nir->info.workgroup_size[1];
356 prog_data->base.local_size[2] = nir->info.workgroup_size[2];
357
358 prog_data->uses_drawid =
359 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
360
361 brw_simd_selection_state simd_state{
362 .devinfo = compiler->devinfo,
363 .prog_data = &prog_data->base,
364 .required_width = brw_required_dispatch_width(&nir->info),
365 };
366
367 std::unique_ptr<fs_visitor> v[3];
368
369 for (unsigned simd = 0; simd < 3; simd++) {
370 if (!brw_simd_should_compile(simd_state, simd))
371 continue;
372
373 const unsigned dispatch_width = 8 << simd;
374
375 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
376 brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
377
378 NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
379 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
380
381 brw_postprocess_nir(shader, compiler, debug_enabled,
382 key->base.robust_flags);
383
384 v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
385 &key->base,
386 &prog_data->base.base,
387 shader, dispatch_width,
388 params->base.stats != NULL,
389 debug_enabled);
390
391 if (prog_data->base.prog_mask) {
392 unsigned first = ffs(prog_data->base.prog_mask) - 1;
393 v[simd]->import_uniforms(v[first].get());
394 }
395
396 const bool allow_spilling = !brw_simd_any_compiled(simd_state);
397 if (run_task_mesh(*v[simd], allow_spilling))
398 brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
399 else
400 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
401 }
402
403 int selected_simd = brw_simd_select(simd_state);
404 if (selected_simd < 0) {
405 params->base.error_str =
406 ralloc_asprintf(params->base.mem_ctx,
407 "Can't compile shader: "
408 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
409 simd_state.error[0], simd_state.error[1],
410 simd_state.error[2]);
411 return NULL;
412 }
413
414 fs_visitor *selected = v[selected_simd].get();
415 prog_data->base.prog_mask = 1 << selected_simd;
416
417 if (unlikely(debug_enabled)) {
418 fprintf(stderr, "Task Output ");
419 brw_print_tue_map(stderr, &prog_data->map);
420 }
421
422 fs_generator g(compiler, &params->base, &prog_data->base.base,
423 MESA_SHADER_TASK);
424 if (unlikely(debug_enabled)) {
425 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
426 "%s task shader %s",
427 nir->info.label ? nir->info.label
428 : "unnamed",
429 nir->info.name));
430 }
431
432 g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
433 selected->performance_analysis.require(), params->base.stats);
434 g.add_const_data(nir->constant_data, nir->constant_data_size);
435 return g.get_assembly();
436 }
437
438 static void
439 brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
440 {
441 if (!map)
442 return;
443
444 nir->info.task_payload_size = map->per_task_data_start_dw * 4;
445
446 bool progress = false;
447
448 NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
449 nir_var_mem_task_payload, shared_type_info);
450
451 if (progress) {
452 /* The types for Task Output and Mesh Input should match, so their sizes
453 * should also match.
454 */
455 assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
456 } else {
457 /* Mesh doesn't read any task payload input; to make that clear, set
458 * task_payload_size to zero instead of keeping an incomplete size that
459 * just includes the header.
460 */
461 nir->info.task_payload_size = 0;
462 }
463
464 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
465 nir_address_format_32bit_offset);
466 }
467
468 /* Attribute types. Flat attributes have to be a separate class because
469 * flat and interpolated attributes can't share the same vec4 slot
470 * (see 3DSTATE_SBE.ConstantInterpolationEnable).
471 */
472 enum {
473 PRIM, /* per primitive */
474 VERT, /* per vertex interpolated */
475 VERT_FLAT, /* per vertex flat */
476 };
477
478 struct attr_desc {
479 int location;
480 const struct glsl_type *type;
481 unsigned dwords;
482 unsigned slots;
483 };
484
485 struct attr_type_info {
486 /* order of attributes, negative values are holes */
487 std::list<struct attr_desc> *order;
488
489 /* attributes after which there's a hole of size equal to the array index */
490 std::list<int> holes[5];
491 };
492
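/* Record start_dw/len_dw for every varying slot covered by this attribute,
 * splitting arrays evenly across their slots and assigning at most 4 dwords
 * per slot otherwise.
 */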
493 static void
494 brw_mue_assign_position(const struct attr_desc *attr,
495 struct brw_mue_map *map,
496 unsigned start_dw)
497 {
498 bool is_array = glsl_type_is_array(attr->type);
499 int location = attr->location;
500 unsigned remaining = attr->dwords;
501
502 for (unsigned slot = 0; slot < attr->slots; ++slot) {
503 map->start_dw[location + slot] = start_dw;
504
505 unsigned sz;
506
507 if (is_array) {
508 assert(attr->dwords % attr->slots == 0);
509 sz = attr->dwords / attr->slots;
510 } else {
511 sz = MIN2(remaining, 4);
512 }
513
514 map->len_dw[location + slot] = sz;
515 start_dw += sz;
516 remaining -= sz;
517 }
518 }
519
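/* Outputs may exist both as complete and as split variables. Return the
 * variable at this location with the largest dword footprint, i.e. the
 * complete one.
 */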
520 static nir_variable *
521 brw_nir_find_complete_variable_with_location(nir_shader *shader,
522 nir_variable_mode mode,
523 int location)
524 {
525 nir_variable *best_var = NULL;
526 unsigned last_size = 0;
527
528 nir_foreach_variable_with_modes(var, shader, mode) {
529 if (var->data.location != location)
530 continue;
531
532 unsigned new_size = glsl_count_dword_slots(var->type, false);
533 if (new_size > last_size) {
534 best_var = var;
535 last_size = new_size;
536 }
537 }
538
539 return best_var;
540 }
541
542 static unsigned
543 brw_sum_size(const std::list<struct attr_desc> &orders)
544 {
545 unsigned sz = 0;
546 for (auto it = orders.cbegin(); it != orders.cend(); ++it)
547 sz += (*it).dwords;
548 return sz;
549 }
550
551 /* Find an ordering of the outputs that requires minimum size, without
552 * splitting URB read/write messages (which operate on vec4-aligned memory).
553 */
554 static void
555 brw_compute_mue_layout(const struct brw_compiler *compiler,
556 std::list<struct attr_desc> *orders,
557 uint64_t outputs_written,
558 struct nir_shader *nir,
559 bool *pack_prim_data_into_header,
560 bool *pack_vert_data_into_header)
561 {
562 const struct shader_info *info = &nir->info;
563
564 struct attr_type_info data[3];
565
566 if ((compiler->mesh.mue_header_packing & 1) == 0)
567 *pack_prim_data_into_header = false;
568 if ((compiler->mesh.mue_header_packing & 2) == 0)
569 *pack_vert_data_into_header = false;
570
571 for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
572 data[i].order = &orders[i];
573
574 /* If packing into header is enabled, add a hole of size 4 and add
575 * a virtual location to keep the algorithm happy (it expects holes
576 * to be preceded by some location). We'll remove those virtual
577 * locations at the end.
578 */
579 const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
580 assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
581
582 struct attr_desc d;
583 d.location = virtual_header_location;
584 d.type = NULL;
585 d.dwords = 0;
586 d.slots = 0;
587
588 struct attr_desc h;
589 h.location = -1;
590 h.type = NULL;
591 h.dwords = 4;
592 h.slots = 0;
593
594 if (*pack_prim_data_into_header) {
595 orders[PRIM].push_back(d);
596 orders[PRIM].push_back(h);
597 data[PRIM].holes[4].push_back(virtual_header_location);
598 }
599
600 if (*pack_vert_data_into_header) {
601 orders[VERT].push_back(d);
602 orders[VERT].push_back(h);
603 data[VERT].holes[4].push_back(virtual_header_location);
604 }
605
606 u_foreach_bit64(location, outputs_written) {
607 if ((BITFIELD64_BIT(location) & outputs_written) == 0)
608 continue;
609
610 /* At this point there are both complete and split variables as
611 * outputs. We need the complete variable to compute the required
612 * size.
613 */
614 nir_variable *var =
615 brw_nir_find_complete_variable_with_location(nir,
616 nir_var_shader_out,
617 location);
618
619 d.location = location;
620 d.type = brw_nir_get_var_type(nir, var);
621 d.dwords = glsl_count_dword_slots(d.type, false);
622 d.slots = glsl_count_attribute_slots(d.type, false);
623
624 struct attr_type_info *type_data;
625
626 if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
627 type_data = &data[PRIM];
628 else if (var->data.interpolation == INTERP_MODE_FLAT)
629 type_data = &data[VERT_FLAT];
630 else
631 type_data = &data[VERT];
632
633 std::list<struct attr_desc> *order = type_data->order;
634 std::list<int> *holes = type_data->holes;
635
636 outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
637
638 /* special case to use hole of size 4 */
639 if (d.dwords == 4 && !holes[4].empty()) {
640 holes[4].pop_back();
641
642 assert(order->front().location == virtual_header_location);
643 order->pop_front();
644
645 assert(order->front().location == -1);
646 assert(order->front().dwords == 4);
647 order->front() = d;
648
649 continue;
650 }
651
652 int mod = d.dwords % 4;
653 if (mod == 0) {
654 order->push_back(d);
655 continue;
656 }
657
658 h.location = -1;
659 h.type = NULL;
660 h.dwords = 4 - mod;
661 h.slots = 0;
662
663 if (!compiler->mesh.mue_compaction) {
664 order->push_back(d);
665 order->push_back(h);
666 continue;
667 }
668
669 if (d.dwords > 4) {
670 order->push_back(d);
671 order->push_back(h);
672 holes[h.dwords].push_back(location);
673 continue;
674 }
675
676 assert(d.dwords < 4);
677
678 unsigned found = 0;
679 /* try to find the smallest hole big enough to hold this attribute */
680 for (unsigned sz = d.dwords; sz <= 4; sz++){
681 if (!holes[sz].empty()) {
682 found = sz;
683 break;
684 }
685 }
686
687 /* append at the end if not found */
688 if (found == 0) {
689 order->push_back(d);
690 order->push_back(h);
691 holes[h.dwords].push_back(location);
692
693 continue;
694 }
695
696 assert(found <= 4);
697 assert(!holes[found].empty());
698 int after_loc = holes[found].back();
699 holes[found].pop_back();
700
701 bool inserted_back = false;
702
703 for (auto it = order->begin(); it != order->end(); ++it) {
704 if ((*it).location != after_loc)
705 continue;
706
707 ++it;
708 /* must be a hole */
709 assert((*it).location < 0);
710 /* and it must be big enough */
711 assert(d.dwords <= (*it).dwords);
712
713 if (d.dwords == (*it).dwords) {
714 /* exact size, just replace */
715 *it = d;
716 } else {
717 /* inexact size, shrink hole */
718 (*it).dwords -= d.dwords;
719 /* and insert new attribute before it */
720 order->insert(it, d);
721
722 /* Insert shrunk hole in a spot so that the order of attributes
723 * is preserved.
724 */
725 std::list<int> &hole_list = holes[(*it).dwords];
726 std::list<int>::iterator insert_before = hole_list.end();
727
728 for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
729 if ((*it2) >= (int)location) {
730 insert_before = it2;
731 break;
732 }
733 }
734
735 hole_list.insert(insert_before, location);
736 }
737
738 inserted_back = true;
739 break;
740 }
741
742 assert(inserted_back);
743 }
744
745 if (*pack_prim_data_into_header) {
746 if (orders[PRIM].front().location == virtual_header_location)
747 orders[PRIM].pop_front();
748
749 if (!data[PRIM].holes[4].empty()) {
750 *pack_prim_data_into_header = false;
751
752 assert(orders[PRIM].front().location == -1);
753 assert(orders[PRIM].front().dwords == 4);
754 orders[PRIM].pop_front();
755 }
756
757 if (*pack_prim_data_into_header) {
758 unsigned sz = brw_sum_size(orders[PRIM]);
759
760 if (sz % 8 == 0 || sz % 8 > 4)
761 *pack_prim_data_into_header = false;
762 }
763 }
764
765 if (*pack_vert_data_into_header) {
766 if (orders[VERT].front().location == virtual_header_location)
767 orders[VERT].pop_front();
768
769 if (!data[VERT].holes[4].empty()) {
770 *pack_vert_data_into_header = false;
771
772 assert(orders[VERT].front().location == -1);
773 assert(orders[VERT].front().dwords == 4);
774 orders[VERT].pop_front();
775 }
776
777 if (*pack_vert_data_into_header) {
778 unsigned sz = brw_sum_size(orders[VERT]) +
779 brw_sum_size(orders[VERT_FLAT]);
780
781 if (sz % 8 == 0 || sz % 8 > 4)
782 *pack_vert_data_into_header = false;
783 }
784 }
785
786
787 if (INTEL_DEBUG(DEBUG_MESH)) {
788 fprintf(stderr, "MUE attribute order:\n");
789 for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
790 if (!orders[i].empty())
791 fprintf(stderr, "%d: ", i);
792 for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
793 fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
794 }
795 if (!orders[i].empty())
796 fprintf(stderr, "\n");
797 }
798 }
799 }
800
801 /* Mesh URB Entry consists of an initial section
802 *
803 * - Primitive Count
804 * - Primitive Indices (from 0 to Max-1)
805 * - Padding to 32B if needed
806 *
807 * optionally followed by a section for per-primitive data,
808 * in which each primitive (from 0 to Max-1) gets
809 *
810 * - Primitive Header (e.g. ViewportIndex)
811 * - Primitive Custom Attributes
812 *
813 * then followed by a section for per-vertex data
814 *
815 * - Vertex Header (e.g. Position)
816 * - Vertex Custom Attributes
817 *
818 * Each per-element section has a pitch and a starting offset. All the
819 * individual attribute offsets in start_dw are relative to the first entry
820 * of the section (i.e. where Position for the first vertex, or ViewportIndex
821 * for the first primitive, lives). Attributes for other elements are
822 * calculated using the pitch.
823 */
824 static void
825 brw_compute_mue_map(const struct brw_compiler *compiler,
826 struct nir_shader *nir, struct brw_mue_map *map,
827 enum brw_mesh_index_format index_format, bool compact_mue)
828 {
829 memset(map, 0, sizeof(*map));
830
831 memset(&map->start_dw[0], -1, sizeof(map->start_dw));
832 memset(&map->len_dw[0], 0, sizeof(map->len_dw));
833
834 unsigned vertices_per_primitive =
835 mesa_vertices_per_prim(nir->info.mesh.primitive_type);
836
837 map->max_primitives = nir->info.mesh.max_primitives_out;
838 map->max_vertices = nir->info.mesh.max_vertices_out;
839
840 uint64_t outputs_written = nir->info.outputs_written;
841
842 /* One dword for primitives count then K extra dwords for each primitive. */
843 switch (index_format) {
844 case BRW_INDEX_FORMAT_U32:
845 map->per_primitive_indices_dw = vertices_per_primitive;
846 break;
847 case BRW_INDEX_FORMAT_U888X:
848 map->per_primitive_indices_dw = 1;
849 break;
850 default:
851 unreachable("invalid index format");
852 }
853
854 map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
855 map->max_primitives + 1, 8);
856
857 /* Assign initial section. */
858 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
859 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
860 map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
861 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
862 }
863 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
864 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
865 map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
866 map->per_primitive_indices_dw * map->max_primitives;
867 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
868 }
869
870 const uint64_t per_primitive_header_bits =
871 BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
872 BITFIELD64_BIT(VARYING_SLOT_LAYER) |
873 BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
874 BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
875
876 const uint64_t per_vertex_header_bits =
877 BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
878 BITFIELD64_BIT(VARYING_SLOT_POS) |
879 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
880 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
881
882 std::list<struct attr_desc> orders[3];
883 uint64_t regular_outputs = outputs_written &
884 ~(per_primitive_header_bits | per_vertex_header_bits);
885
886 /* packing into prim header is possible only if prim header is present */
887 map->user_data_in_primitive_header = compact_mue &&
888 (outputs_written & per_primitive_header_bits) != 0;
889
890 /* Packing into vert header is always possible, but we allow it only
891 * if full vec4 is available (so point size is not used) and there's
892 * nothing between it and normal vertex data (so no clip distances).
893 */
894 map->user_data_in_vertex_header = compact_mue &&
895 (outputs_written & per_vertex_header_bits) ==
896 BITFIELD64_BIT(VARYING_SLOT_POS);
897
898 if (outputs_written & per_primitive_header_bits) {
899 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
900 map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
901 map->per_primitive_start_dw + 0;
902 map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
903 }
904
905 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
906 map->start_dw[VARYING_SLOT_LAYER] =
907 map->per_primitive_start_dw + 1; /* RTAIndex */
908 map->len_dw[VARYING_SLOT_LAYER] = 1;
909 }
910
911 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
912 map->start_dw[VARYING_SLOT_VIEWPORT] =
913 map->per_primitive_start_dw + 2;
914 map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
915 }
916
917 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
918 map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
919 map->per_primitive_start_dw + 3;
920 map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
921 }
922
923 map->per_primitive_header_size_dw = 8;
924 outputs_written &= ~per_primitive_header_bits;
925 } else {
926 map->per_primitive_header_size_dw = 0;
927 }
928
929 map->per_primitive_data_size_dw = 0;
930
931 /* For fast linked libraries, we can't pack the MUE, as the fragment shader
932 * will be compiled without access to the MUE map and won't be able to find
933 * out where everything is.
934 * Instead, keep doing things as we did before the packing, just laying out
935 * everything in varying order, which is how the FS will expect them.
936 */
937 if (compact_mue) {
938 brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
939 &map->user_data_in_primitive_header,
940 &map->user_data_in_vertex_header);
941
942 unsigned start_dw = map->per_primitive_start_dw;
943 if (map->user_data_in_primitive_header)
944 start_dw += 4; /* first 4 dwords are used */
945 else
946 start_dw += map->per_primitive_header_size_dw;
947 unsigned header_used_dw = 0;
948
949 for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
950 int location = (*it).location;
951 if (location < 0) {
952 start_dw += (*it).dwords;
953 if (map->user_data_in_primitive_header && header_used_dw < 4)
954 header_used_dw += (*it).dwords;
955 else
956 map->per_primitive_data_size_dw += (*it).dwords;
957 assert(header_used_dw <= 4);
958 continue;
959 }
960
961 assert(map->start_dw[location] == -1);
962
963 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
964 location >= VARYING_SLOT_VAR0);
965
966 brw_mue_assign_position(&*it, map, start_dw);
967
968 start_dw += (*it).dwords;
969 if (map->user_data_in_primitive_header && header_used_dw < 4)
970 header_used_dw += (*it).dwords;
971 else
972 map->per_primitive_data_size_dw += (*it).dwords;
973 assert(header_used_dw <= 4);
974 outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
975 }
976 } else {
977 unsigned start_dw = map->per_primitive_start_dw +
978 map->per_primitive_header_size_dw;
979
980 uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
981 while (per_prim_outputs) {
982 uint64_t location = ffsll(per_prim_outputs) - 1;
983
984 assert(map->start_dw[location] == -1);
985 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
986 location >= VARYING_SLOT_VAR0);
987
988 nir_variable *var =
989 brw_nir_find_complete_variable_with_location(nir,
990 nir_var_shader_out,
991 location);
992 struct attr_desc d;
993 d.location = location;
994 d.type = brw_nir_get_var_type(nir, var);
995 d.dwords = glsl_count_dword_slots(d.type, false);
996 d.slots = glsl_count_attribute_slots(d.type, false);
997
998 brw_mue_assign_position(&d, map, start_dw);
999
1000 map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
1001 start_dw += ALIGN(d.dwords, 4);
1002
1003 per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1004 }
1005 }
1006
1007 map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
1008 map->per_primitive_data_size_dw, 8);
1009
1010 map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
1011 map->per_primitive_pitch_dw *
1012 map->max_primitives, 8);
1013
1014 /* TODO(mesh): Multiview. */
1015 unsigned fixed_header_size = 8;
1016 map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
1017 nir->info.clip_distance_array_size +
1018 nir->info.cull_distance_array_size, 8);
1019
1020 if (outputs_written & per_vertex_header_bits) {
1021 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
1022 map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
1023 map->len_dw[VARYING_SLOT_PSIZ] = 1;
1024 }
1025
1026 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
1027 map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
1028 map->len_dw[VARYING_SLOT_POS] = 4;
1029 }
1030
1031 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
1032 map->start_dw[VARYING_SLOT_CLIP_DIST0] =
1033 map->per_vertex_start_dw + fixed_header_size + 0;
1034 map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
1035 }
1036
1037 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
1038 map->start_dw[VARYING_SLOT_CLIP_DIST1] =
1039 map->per_vertex_start_dw + fixed_header_size + 4;
1040 map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
1041 }
1042
1043 outputs_written &= ~per_vertex_header_bits;
1044 }
1045
1046 /* cull distances should be lowered earlier */
1047 assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
1048 assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
1049
1050 map->per_vertex_data_size_dw = 0;
1051
1052 /* For fast linked libraries, we can't pack the MUE, as the fragment shader
1053 * will be compiled without access to the MUE map and won't be able to find
1054 * out where everything is.
1055 * Instead, keep doing things as we did before the packing, just laying out
1056 * everything in varying order, which is how the FS will expect them.
1057 */
1058 if (compact_mue) {
1059 unsigned start_dw = map->per_vertex_start_dw;
1060 if (!map->user_data_in_vertex_header)
1061 start_dw += map->per_vertex_header_size_dw;
1062
1063 unsigned header_used_dw = 0;
1064 for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
1065 for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
1066 int location = (*it).location;
1067 if (location < 0) {
1068 start_dw += (*it).dwords;
1069 if (map->user_data_in_vertex_header && header_used_dw < 4) {
1070 header_used_dw += (*it).dwords;
1071 assert(header_used_dw <= 4);
1072 if (header_used_dw == 4)
1073 start_dw += 4; /* jump over gl_position */
1074 } else {
1075 map->per_vertex_data_size_dw += (*it).dwords;
1076 }
1077 continue;
1078 }
1079
1080 assert(map->start_dw[location] == -1);
1081
1082 assert(location >= VARYING_SLOT_VAR0);
1083
1084 brw_mue_assign_position(&*it, map, start_dw);
1085
1086 start_dw += (*it).dwords;
1087 if (map->user_data_in_vertex_header && header_used_dw < 4) {
1088 header_used_dw += (*it).dwords;
1089 assert(header_used_dw <= 4);
1090 if (header_used_dw == 4)
1091 start_dw += 4; /* jump over gl_position */
1092 } else {
1093 map->per_vertex_data_size_dw += (*it).dwords;
1094 }
1095 outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
1096 }
1097 }
1098 } else {
1099 unsigned start_dw = map->per_vertex_start_dw +
1100 map->per_vertex_header_size_dw;
1101
1102 uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
1103 while (per_vertex_outputs) {
1104 uint64_t location = ffsll(per_vertex_outputs) - 1;
1105
1106 assert(map->start_dw[location] == -1);
1107 assert(location >= VARYING_SLOT_VAR0);
1108
1109 nir_variable *var =
1110 brw_nir_find_complete_variable_with_location(nir,
1111 nir_var_shader_out,
1112 location);
1113 struct attr_desc d;
1114 d.location = location;
1115 d.type = brw_nir_get_var_type(nir, var);
1116 d.dwords = glsl_count_dword_slots(d.type, false);
1117 d.slots = glsl_count_attribute_slots(d.type, false);
1118
1119 brw_mue_assign_position(&d, map, start_dw);
1120
1121 map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
1122 start_dw += ALIGN(d.dwords, 4);
1123
1124 per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1125 }
1126 }
1127
1128 map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
1129 map->per_vertex_data_size_dw, 8);
1130
1131 map->size_dw =
1132 map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
1133
1134 assert(map->size_dw % 8 == 0);
1135 }
1136
1137 static void
1138 brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
1139 {
1140 fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
1141 map->size_dw, map->max_primitives, map->max_vertices);
1142 fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
1143 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
1144 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
1145 map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
1146 fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
1147 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
1148 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
1149 map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
1150
1151 fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
1152 map->per_primitive_start_dw,
1153 map->per_primitive_header_size_dw,
1154 map->per_primitive_data_size_dw,
1155 map->per_primitive_pitch_dw);
1156
1157 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1158 if (map->start_dw[i] < 0)
1159 continue;
1160
1161 const unsigned offset = map->start_dw[i];
1162 const unsigned len = map->len_dw[i];
1163
1164 if (offset < map->per_primitive_start_dw ||
1165 offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
1166 continue;
1167
1168 const char *name =
1169 gl_varying_slot_name_for_stage((gl_varying_slot)i,
1170 MESA_SHADER_MESH);
1171
1172 fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
1173 name, i);
1174 }
1175
1176 fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
1177 map->per_vertex_start_dw,
1178 map->per_vertex_header_size_dw,
1179 map->per_vertex_data_size_dw,
1180 map->per_vertex_pitch_dw);
1181
1182 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1183 if (map->start_dw[i] < 0)
1184 continue;
1185
1186 const unsigned offset = map->start_dw[i];
1187 const unsigned len = map->len_dw[i];
1188
1189 if (offset < map->per_vertex_start_dw ||
1190 offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
1191 continue;
1192
1193 nir_variable *var =
1194 nir_find_variable_with_location(nir, nir_var_shader_out, i);
1195 bool flat = var->data.interpolation == INTERP_MODE_FLAT;
1196
1197 const char *name =
1198 gl_varying_slot_name_for_stage((gl_varying_slot)i,
1199 MESA_SHADER_MESH);
1200
1201 fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
1202 name, i, flat ? " (flat)" : "");
1203 }
1204
1205 fprintf(fp, "\n");
1206 }
1207
1208 static void
1209 brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
1210 {
1211 nir_foreach_shader_out_variable(var, nir) {
1212 int location = var->data.location;
1213 assert(location >= 0);
1214 assert(map->start_dw[location] != -1);
1215 var->data.driver_location = map->start_dw[location];
1216 }
1217
1218 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
1219 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
1220 }
1221
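/* Cooperatively zero the first 4 dwords of every MUE Primitive Header
 * (Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask) across the
 * workgroup, so fields the shader never writes have defined values.
 */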
1222 static void
1223 brw_nir_initialize_mue(nir_shader *nir,
1224 const struct brw_mue_map *map,
1225 unsigned dispatch_width)
1226 {
1227 assert(map->per_primitive_header_size_dw > 0);
1228
1229 nir_builder b;
1230 nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
1231 b = nir_builder_at(nir_before_impl(entrypoint));
1232
1233 nir_def *dw_off = nir_imm_int(&b, 0);
1234 nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);
1235
1236 /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */
1237
1238 assert(!nir->info.workgroup_size_variable);
1239 const unsigned workgroup_size = nir->info.workgroup_size[0] *
1240 nir->info.workgroup_size[1] *
1241 nir->info.workgroup_size[2];
1242
1243 /* Invocations from a single workgroup will cooperate in zeroing MUE. */
1244
1245 /* How many prims each invocation needs to cover without checking its index? */
1246 unsigned prims_per_inv = map->max_primitives / workgroup_size;
1247
1248 /* Zero first 4 dwords of MUE Primitive Header:
1249 * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
1250 */
1251
1252 nir_def *local_invocation_index = nir_load_local_invocation_index(&b);
1253
1254 /* Zero primitive headers distanced by workgroup_size, starting from
1255 * invocation index.
1256 */
1257 for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
1258 nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
1259 prim_in_inv * workgroup_size);
1260
1261 nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
1262 .base = (int)map->per_primitive_start_dw,
1263 .write_mask = WRITEMASK_XYZW,
1264 .component = 0,
1265 .src_type = nir_type_uint32);
1266 }
1267
1268 /* How many prims are left? */
1269 unsigned remaining = map->max_primitives % workgroup_size;
1270
1271 if (remaining) {
1272 /* Zero "remaining" primitive headers starting from the last one covered
1273 * by the loop above + workgroup_size.
1274 */
1275 nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
1276 nir_if *if_stmt = nir_push_if(&b, cmp);
1277 {
1278 nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
1279 prims_per_inv * workgroup_size);
1280
1281 nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
1282 .base = (int)map->per_primitive_start_dw,
1283 .write_mask = WRITEMASK_XYZW,
1284 .component = 0,
1285 .src_type = nir_type_uint32);
1286 }
1287 nir_pop_if(&b, if_stmt);
1288 }
1289
1290 /* If there's more than one subgroup, then we need to wait for all of them
1291 * to finish initialization before we can proceed. Otherwise some subgroups
1292 * may start filling the MUE before others have finished initializing.
1293 */
1294 if (workgroup_size > dispatch_width) {
1295 nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
1296 NIR_MEMORY_ACQ_REL, nir_var_shader_out);
1297 }
1298
1299 if (remaining) {
1300 nir_metadata_preserve(entrypoint, nir_metadata_none);
1301 } else {
1302 nir_metadata_preserve(entrypoint, nir_metadata_control_flow);
1303 }
1304 }
1305
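/* Fold the arrayed (per-vertex/per-primitive) index into the offset source
 * using the per-element pitch expressed in dwords.
 */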
1306 static void
1307 brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
1308 {
1309 nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
1310 nir_src *offset_src = nir_get_io_offset_src(intrin);
1311
1312 b->cursor = nir_before_instr(&intrin->instr);
1313 nir_def *offset =
1314 nir_iadd(b,
1315 offset_src->ssa,
1316 nir_imul_imm(b, index_src->ssa, pitch));
1317 nir_src_rewrite(offset_src, offset);
1318 }
1319
1320 static bool
1321 brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
1322 nir_intrinsic_instr *intrin,
1323 void *data)
1324 {
1325 const struct brw_mue_map *map = (const struct brw_mue_map *) data;
1326
1327 /* Remap per_vertex and per_primitive offsets using the extra source and
1328 * the pitch.
1329 */
1330 switch (intrin->intrinsic) {
1331 case nir_intrinsic_load_per_vertex_output:
1332 case nir_intrinsic_store_per_vertex_output:
1333 brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);
1334
1335 return true;
1336
1337 case nir_intrinsic_load_per_primitive_output:
1338 case nir_intrinsic_store_per_primitive_output: {
1339 struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
1340 uint32_t pitch;
1341 if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
1342 pitch = map->per_primitive_indices_dw;
1343 else
1344 pitch = map->per_primitive_pitch_dw;
1345
1346 brw_nir_adjust_offset(b, intrin, pitch);
1347
1348 return true;
1349 }
1350
1351 default:
1352 return false;
1353 }
1354 }
1355
1356 static bool
1357 brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
1358 {
1359 return nir_shader_intrinsics_pass(nir,
1360 brw_nir_adjust_offset_for_arrayed_indices_instr,
1361 nir_metadata_control_flow,
1362 (void *)map);
1363 }
1364
1365 struct index_packing_state {
1366 unsigned vertices_per_primitive;
1367 nir_variable *original_prim_indices;
1368 nir_variable *packed_prim_indices;
1369 };
1370
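/* Packing primitive indices into U888X is only possible if a vertex index
 * fits in one byte, the primitive isn't a point, and every store to the
 * primitive indices output writes a whole index vector via a plain array
 * deref.
 */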
1371 static bool
1372 brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
1373 {
1374 /* can a single index fit into one byte of the U888X format? */
1375 if (nir->info.mesh.max_vertices_out > 255)
1376 return false;
1377
1378 state->vertices_per_primitive =
1379 mesa_vertices_per_prim(nir->info.mesh.primitive_type);
1380 /* packing point indices doesn't help */
1381 if (state->vertices_per_primitive == 1)
1382 return false;
1383
1384 state->original_prim_indices =
1385 nir_find_variable_with_location(nir,
1386 nir_var_shader_out,
1387 VARYING_SLOT_PRIMITIVE_INDICES);
1388 /* No indices means no changes to the shader, but packing is still worth
1389 * it, because less URB space will be used
1390 */
1391 if (!state->original_prim_indices)
1392 return true;
1393
1394 ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
1395 assert(glsl_type_is_array(type));
1396 assert(glsl_type_is_vector(glsl_without_array(type)));
1397 assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);
1398
1399 nir_foreach_function_impl(impl, nir) {
1400 nir_foreach_block(block, impl) {
1401 nir_foreach_instr(instr, block) {
1402 if (instr->type != nir_instr_type_intrinsic)
1403 continue;
1404
1405 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1406
1407 if (intrin->intrinsic != nir_intrinsic_store_deref) {
1408 /* any unknown deref operation on primitive indices -> don't pack */
1409 unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
1410 for (unsigned i = 0; i < num_srcs; i++) {
1411 nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
1412 if (!deref)
1413 continue;
1414 nir_variable *var = nir_deref_instr_get_variable(deref);
1415
1416 if (var == state->original_prim_indices)
1417 return false;
1418 }
1419
1420 continue;
1421 }
1422
1423 nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1424 if (!deref)
1425 continue;
1426
1427 nir_variable *var = nir_deref_instr_get_variable(deref);
1428 if (var != state->original_prim_indices)
1429 continue;
1430
1431 if (deref->deref_type != nir_deref_type_array)
1432 return false; /* unknown chain of derefs */
1433
1434 nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
1435 if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1436 return false; /* unknown chain of derefs */
1437
1438 assert (var_deref->var == state->original_prim_indices);
1439
1440 unsigned write_mask = nir_intrinsic_write_mask(intrin);
1441
1442 /* If only some components are written, then we can't easily pack.
1443 * In theory we could, by loading the current dword value, masking in
1444 * one byte and storing back the whole dword, but it would be slow
1445 * and could actually decrease performance. TODO: reevaluate this
1446 * once something actually hits this path.
1447 */
1448 if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
1449 return false;
1450 }
1451 }
1452 }
1453
1454 return true;
1455 }
1456
1457 static bool
1458 brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
1459 void *data)
1460 {
1461 if (intrin->intrinsic != nir_intrinsic_store_deref)
1462 return false;
1463
1464 nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
1465 if (!array_deref || array_deref->deref_type != nir_deref_type_array)
1466 return false;
1467
1468 nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
1469 if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1470 return false;
1471
1472 struct index_packing_state *state =
1473 (struct index_packing_state *)data;
1474
1475 nir_variable *var = var_deref->var;
1476
1477 if (var != state->original_prim_indices)
1478 return false;
1479
1480 unsigned vertices_per_primitive = state->vertices_per_primitive;
1481
1482 b->cursor = nir_before_instr(&intrin->instr);
1483
1484 nir_deref_instr *new_var_deref =
1485 nir_build_deref_var(b, state->packed_prim_indices);
1486 nir_deref_instr *new_array_deref =
1487 nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
1488
1489 nir_src *data_src = &intrin->src[1];
1490 nir_def *data_def =
1491 data_src->ssa;
1492
1493 nir_def *new_data =
1494 nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
1495 nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
1496
1497 if (vertices_per_primitive >= 3) {
1498 new_data =
1499 nir_ior(b, new_data,
1500 nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
1501 }
1502
1503 nir_build_store_deref(b, &new_array_deref->def, new_data);
1504
1505 nir_instr_remove(&intrin->instr);
1506
1507 return true;
1508 }
1509
1510 static bool
1511 brw_pack_primitive_indices(nir_shader *nir, void *data)
1512 {
1513 struct index_packing_state *state = (struct index_packing_state *)data;
1514
1515 const struct glsl_type *new_type =
1516 glsl_array_type(glsl_uint_type(),
1517 nir->info.mesh.max_primitives_out,
1518 0);
1519
1520 state->packed_prim_indices =
1521 nir_variable_create(nir, nir_var_shader_out,
1522 new_type, "gl_PrimitiveIndicesPacked");
1523 state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
1524 state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
1525 state->packed_prim_indices->data.per_primitive = 1;
1526
1527 return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
1528 nir_metadata_control_flow,
1529 data);
1530 }
1531
1532 static bool
1533 brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader *nir,
1534 struct brw_mue_map *map)
1535 {
1536 /* Auto-striping can be enabled when the shader either doesn't write to
1537 * RTA Index and VP Index or writes the same values for all primitives.
1538 * Since determining whether the shader writes the same value across the
1539 * whole workgroup (not just the subgroup!) is tricky, we do the simplest
1540 * possible thing: say yes only when it writes const values that all match.
1541 *
1542 * TODO: improve this
1543 */
1544
1545 if (compiler->devinfo->ver < 20)
1546 return false;
1547
1548 if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 &&
1549 map->start_dw[VARYING_SLOT_LAYER] < 0)
1550 return true;
1551
1552 nir_def *vp = NULL;
1553 nir_def *layer = NULL;
1554
1555 nir_foreach_function(function, nir) {
1556 if (!function->impl)
1557 continue;
1558
1559 nir_foreach_block(block, function->impl) {
1560 nir_foreach_instr(instr, block) {
1561 if (instr->type != nir_instr_type_intrinsic)
1562 continue;
1563
1564 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1565 if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
1566 continue;
1567
1568 struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
1569 bool is_vp = io.location == VARYING_SLOT_VIEWPORT;
1570 bool is_layer = io.location == VARYING_SLOT_LAYER;
1571 if (!is_vp && !is_layer)
1572 continue;
1573
1574 nir_src *src = &intrin->src[0];
1575
1576 if (!nir_src_is_const(*src))
1577 return false;
1578
1579 nir_def **cmp;
1580 if (is_vp)
1581 cmp = &vp;
1582 else
1583 cmp = &layer;
1584
1585 if (*cmp == NULL)
1586 *cmp = src->ssa;
1587 else if (*cmp != src->ssa)
1588 return false;
1589 }
1590 }
1591 }
1592
1593 return true;
1594 }
1595
1596 const unsigned *
1597 brw_compile_mesh(const struct brw_compiler *compiler,
1598 struct brw_compile_mesh_params *params)
1599 {
1600 struct nir_shader *nir = params->base.nir;
1601 const struct brw_mesh_prog_key *key = params->key;
1602 struct brw_mesh_prog_data *prog_data = params->prog_data;
1603 const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);
1604
1605 prog_data->base.base.stage = MESA_SHADER_MESH;
1606 prog_data->base.base.total_shared = nir->info.shared_size;
1607 prog_data->base.base.total_scratch = 0;
1608
1609 prog_data->base.local_size[0] = nir->info.workgroup_size[0];
1610 prog_data->base.local_size[1] = nir->info.workgroup_size[1];
1611 prog_data->base.local_size[2] = nir->info.workgroup_size[2];
1612
1613 prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
1614 prog_data->cull_distance_mask =
1615 ((1 << nir->info.cull_distance_array_size) - 1) <<
1616 nir->info.clip_distance_array_size;
1617 prog_data->primitive_type = nir->info.mesh.primitive_type;
1618
1619 struct index_packing_state index_packing_state = {};
1620 if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
1621 if (index_packing_state.original_prim_indices)
1622 NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
1623 prog_data->index_format = BRW_INDEX_FORMAT_U888X;
1624 } else {
1625 prog_data->index_format = BRW_INDEX_FORMAT_U32;
1626 }
1627
1628 prog_data->uses_drawid =
1629 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
1630
1631 brw_nir_lower_tue_inputs(nir, params->tue_map);
1632
1633 brw_compute_mue_map(compiler, nir, &prog_data->map,
1634 prog_data->index_format, key->compact_mue);
1635 brw_nir_lower_mue_outputs(nir, &prog_data->map);
1636
1637 prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
1638
1639 brw_simd_selection_state simd_state{
1640 .devinfo = compiler->devinfo,
1641 .prog_data = &prog_data->base,
1642 .required_width = brw_required_dispatch_width(&nir->info),
1643 };
1644
1645 std::unique_ptr<fs_visitor> v[3];
1646
1647 for (int simd = 0; simd < 3; simd++) {
1648 if (!brw_simd_should_compile(simd_state, simd))
1649 continue;
1650
1651 const unsigned dispatch_width = 8 << simd;
1652
1653 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
1654
1655 /*
1656 * When Primitive Header is enabled, we may not generate writes to all
1657 * fields, so let's initialize everything.
1658 */
1659 if (prog_data->map.per_primitive_header_size_dw > 0)
1660 NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
1661
1662 brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
1663
1664 NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
1665 /* Load uniforms can do a better job for constants, so fold before it. */
1666 NIR_PASS(_, shader, nir_opt_constant_folding);
1667 NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
1668
1669 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
1670
1671 brw_postprocess_nir(shader, compiler, debug_enabled,
1672 key->base.robust_flags);
1673
1674 v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
1675 &key->base,
1676 &prog_data->base.base,
1677 shader, dispatch_width,
1678 params->base.stats != NULL,
1679 debug_enabled);
1680
1681 if (prog_data->base.prog_mask) {
1682 unsigned first = ffs(prog_data->base.prog_mask) - 1;
1683 v[simd]->import_uniforms(v[first].get());
1684 }
1685
1686 const bool allow_spilling = !brw_simd_any_compiled(simd_state);
1687 if (run_task_mesh(*v[simd], allow_spilling))
1688 brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
1689 else
1690 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
1691 }
1692
1693 int selected_simd = brw_simd_select(simd_state);
1694 if (selected_simd < 0) {
1695 params->base.error_str =
1696 ralloc_asprintf(params->base.mem_ctx,
1697 "Can't compile shader: "
1698 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
1699 simd_state.error[0], simd_state.error[1],
1700 simd_state.error[2]);
1701 return NULL;
1702 }
1703
1704 fs_visitor *selected = v[selected_simd].get();
1705 prog_data->base.prog_mask = 1 << selected_simd;
1706
1707 if (unlikely(debug_enabled)) {
1708 if (params->tue_map) {
1709 fprintf(stderr, "Mesh Input ");
1710 brw_print_tue_map(stderr, params->tue_map);
1711 }
1712 fprintf(stderr, "Mesh Output ");
1713 brw_print_mue_map(stderr, &prog_data->map, nir);
1714 }
1715
1716 fs_generator g(compiler, &params->base, &prog_data->base.base,
1717 MESA_SHADER_MESH);
1718 if (unlikely(debug_enabled)) {
1719 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
1720 "%s mesh shader %s",
1721 nir->info.label ? nir->info.label
1722 : "unnamed",
1723 nir->info.name));
1724 }
1725
1726 g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
1727 selected->performance_analysis.require(), params->base.stats);
1728 g.add_const_data(nir->constant_data, nir->constant_data_size);
1729 return g.get_assembly();
1730 }
1731