/*
 * Copyright © 2022 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "nak_private.h"
#include "nir_builder.h"
#include "nir_control_flow.h"
#include "nir_xfb_info.h"

#include "util/u_math.h"

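/* Wrappers around NIR_PASS that record whether the wrapped pass made any
 * progress, so the fixed-point loop in optimize_nir() below knows when to
 * stop iterating.
 */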
#define OPT(nir, pass, ...) ({                           \
   bool this_progress = false;                           \
   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);    \
   if (this_progress)                                    \
      progress = true;                                   \
   this_progress;                                        \
})

#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

bool
nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
{
   switch (nir->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_FRAGMENT:
      unreachable("Shader stage does not have workgroups");
      break;

   case MESA_SHADER_TESS_CTRL:
      /* Tessellation only ever has one subgroup per workgroup. The Vulkan
       * limit on the number of tessellation invocations is 32 to allow for
       * this.
       */
      return true;

   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL: {
      if (nir->info.workgroup_size_variable)
         return false;

      uint16_t wg_sz = nir->info.workgroup_size[0] *
                       nir->info.workgroup_size[1] *
                       nir->info.workgroup_size[2];

      return wg_sz <= NAK_SUBGROUP_SIZE;
   }

   default:
      unreachable("Unknown shader stage");
   }
}

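/* Width callback shared by nir_lower_alu_width() and nir_opt_vectorize():
 * returning 2 keeps/builds vec2 ALU ops for 16-bit float math (presumably so
 * NAK can emit the hardware's packed f16 instructions), while returning 1
 * scalarizes everything else.
 */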
static uint8_t
vectorize_filter_cb(const nir_instr *instr, const void *_data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   const unsigned bit_size = nir_alu_instr_is_comparison(alu)
                             ? alu->src[0].src.ssa->bit_size
                             : alu->def.bit_size;

   switch (alu->op) {
   case nir_op_fadd:
   case nir_op_fsub:
   case nir_op_fabs:
   case nir_op_fneg:
   case nir_op_feq:
   case nir_op_fge:
   case nir_op_flt:
   case nir_op_fneu:
   case nir_op_fmul:
   case nir_op_ffma:
   case nir_op_fsign:
   case nir_op_fsat:
   case nir_op_fmax:
   case nir_op_fmin:
      return bit_size == 16 ? 2 : 1;
   default:
      return 1;
   }
}

static void
optimize_nir(nir_shader *nir, const struct nak_compiler *nak, bool allow_copies)
{
   bool progress;

   unsigned lower_flrp =
      (nir->options->lower_flrp16 ? 16 : 0) |
      (nir->options->lower_flrp32 ? 32 : 0) |
      (nir->options->lower_flrp64 ? 64 : 0);

   do {
      progress = false;

      /* This pass is causing problems with types used by OpenCL:
       * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
       *
       * Running with it disabled made no difference in the resulting
       * assembly code.
       */
      if (nir->info.stage != MESA_SHADER_KERNEL)
         OPT(nir, nir_split_array_vars, nir_var_function_temp);

      OPT(nir, nir_shrink_vec_array_vars, nir_var_function_temp);
      OPT(nir, nir_opt_deref);
      if (OPT(nir, nir_opt_memcpy))
         OPT(nir, nir_split_var_copies);

      OPT(nir, nir_lower_vars_to_ssa);

      if (allow_copies) {
         /* Only run this pass in the first call to optimize_nir().  Later
          * calls assume that we've lowered away any copy_deref instructions
          * and we don't want to introduce any more.
          */
         OPT(nir, nir_opt_find_array_copies);
      }
      OPT(nir, nir_opt_copy_prop_vars);
      OPT(nir, nir_opt_dead_write_vars);
      OPT(nir, nir_opt_combine_stores, nir_var_all);

      OPT(nir, nir_lower_alu_width, vectorize_filter_cb, NULL);
      OPT(nir, nir_opt_vectorize, vectorize_filter_cb, NULL);
      OPT(nir, nir_lower_phis_to_scalar, false);
      OPT(nir, nir_lower_frexp);
      OPT(nir, nir_copy_prop);
      OPT(nir, nir_opt_dce);
      OPT(nir, nir_opt_cse);

      OPT(nir, nir_opt_peephole_select, 0, false, false);
      OPT(nir, nir_opt_intrinsics);
      OPT(nir, nir_opt_idiv_const, 32);
      OPT(nir, nir_opt_algebraic);
      OPT(nir, nir_lower_constant_convert_alu_types);
      OPT(nir, nir_opt_constant_folding);

      if (lower_flrp != 0) {
         if (OPT(nir, nir_lower_flrp, lower_flrp, false /* always_precise */))
            OPT(nir, nir_opt_constant_folding);
         /* Nothing should rematerialize any flrps */
         lower_flrp = 0;
      }

      OPT(nir, nir_opt_dead_cf);
      if (OPT(nir, nir_opt_loop)) {
         /* If nir_opt_loop makes progress, then we need to clean things up
          * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make
          * progress.
          */
         OPT(nir, nir_copy_prop);
         OPT(nir, nir_opt_dce);
      }
      OPT(nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      OPT(nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations != 0) {
         OPT(nir, nir_opt_loop_unroll);
      }
      OPT(nir, nir_opt_remove_phis);
      OPT(nir, nir_opt_gcm, false);
      OPT(nir, nir_opt_undef);
      OPT(nir, nir_lower_pack);
   } while (progress);

   OPT(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

void
nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak)
{
   optimize_nir(nir, nak, false);
}

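/* Bit-size lowering callback for nir_lower_bit_size(): return 0 to leave an
 * instruction alone or the bit size it should be widened to.  8- and 16-bit
 * integer ALU is widened to 32 bits; 16-bit float ops are kept where the
 * target SM supports them natively.
 */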
static unsigned
lower_bit_size_cb(const nir_instr *instr, void *data)
{
   const struct nak_compiler *nak = data;

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (nir_op_infos[alu->op].is_conversion)
         return 0;

      const unsigned bit_size = nir_alu_instr_is_comparison(alu)
                                ? alu->src[0].src.ssa->bit_size
                                : alu->def.bit_size;

      switch (alu->op) {
      case nir_op_bit_count:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
         /* These are handled specially because the destination is always
          * 32-bit and so the bit size of the instruction is given by the
          * source.
          */
         return alu->src[0].src.ssa->bit_size == 32 ? 0 : 32;

      case nir_op_fabs:
      case nir_op_fadd:
      case nir_op_fneg:
      case nir_op_feq:
      case nir_op_fge:
      case nir_op_flt:
      case nir_op_fneu:
      case nir_op_fmul:
      case nir_op_ffma:
      case nir_op_ffmaz:
      case nir_op_fsign:
      case nir_op_fsat:
      case nir_op_fceil:
      case nir_op_ffloor:
      case nir_op_fround_even:
      case nir_op_ftrunc:
         if (bit_size == 16 && nak->sm >= 70)
            return 0;
         break;

      case nir_op_fmax:
      case nir_op_fmin:
         if (bit_size == 16 && nak->sm >= 80)
            return 0;
         break;

      default:
         break;
      }

      if (bit_size >= 32)
         return 0;

      if (bit_size & (8 | 16))
         return 32;

      return 0;
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_vote_ieq:
         if (intrin->src[0].ssa->bit_size != 1 &&
             intrin->src[0].ssa->bit_size < 32)
            return 32;
         return 0;

      case nir_intrinsic_vote_feq:
      case nir_intrinsic_read_invocation:
      case nir_intrinsic_read_first_invocation:
      case nir_intrinsic_shuffle:
      case nir_intrinsic_shuffle_xor:
      case nir_intrinsic_shuffle_up:
      case nir_intrinsic_shuffle_down:
      case nir_intrinsic_quad_broadcast:
      case nir_intrinsic_quad_swap_horizontal:
      case nir_intrinsic_quad_swap_vertical:
      case nir_intrinsic_quad_swap_diagonal:
      case nir_intrinsic_reduce:
      case nir_intrinsic_inclusive_scan:
      case nir_intrinsic_exclusive_scan:
         if (intrin->src[0].ssa->bit_size < 32)
            return 32;
         return 0;

      default:
         return 0;
      }
   }

   case nir_instr_type_phi: {
      nir_phi_instr *phi = nir_instr_as_phi(instr);
      if (phi->def.bit_size < 32 && phi->def.bit_size != 1)
         return 32;
      return 0;
   }

   default:
      return 0;
   }
}

void
nak_preprocess_nir(nir_shader *nir, const struct nak_compiler *nak)
{
   UNUSED bool progress = false;

   nir_validate_ssa_dominance(nir, "before nak_preprocess_nir");

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      nir_lower_io_to_temporaries(nir, nir_shader_get_entrypoint(nir),
                                  true /* outputs */, false /* inputs */);
   }

   const nir_lower_tex_options tex_options = {
      .lower_txd_3d = true,
      .lower_txd_cube_map = true,
      .lower_txd_clamp = true,
      .lower_txd_shadow = true,
      .lower_txp = ~0,
      /* TODO: More lowering */
   };
   OPT(nir, nir_lower_tex, &tex_options);
   OPT(nir, nir_normalize_cubemap_coords);

   nir_lower_image_options image_options = {
      .lower_cube_size = true,
   };
   OPT(nir, nir_lower_image, &image_options);

   OPT(nir, nir_lower_global_vars_to_local);

   OPT(nir, nir_split_var_copies);
   OPT(nir, nir_split_struct_vars, nir_var_function_temp);

   /* Optimize but allow copies because we haven't lowered them yet */
   optimize_nir(nir, nak, true /* allow_copies */);

   OPT(nir, nir_lower_load_const_to_scalar);
   OPT(nir, nir_lower_var_copies);
   OPT(nir, nir_lower_system_values);
   OPT(nir, nir_lower_compute_system_values, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      OPT(nir, nir_lower_terminate_to_demote);
}

uint16_t
nak_varying_attr_addr(gl_varying_slot slot)
{
   if (slot >= VARYING_SLOT_PATCH0) {
      return NAK_ATTR_PATCH_START + (slot - VARYING_SLOT_PATCH0) * 0x10;
   } else if (slot >= VARYING_SLOT_VAR0) {
      return NAK_ATTR_GENERIC_START + (slot - VARYING_SLOT_VAR0) * 0x10;
   } else {
      switch (slot) {
      case VARYING_SLOT_TESS_LEVEL_OUTER: return NAK_ATTR_TESS_LOD;
      case VARYING_SLOT_TESS_LEVEL_INNER: return NAK_ATTR_TESS_INTERRIOR;
      case VARYING_SLOT_PRIMITIVE_ID:     return NAK_ATTR_PRIMITIVE_ID;
      case VARYING_SLOT_LAYER:            return NAK_ATTR_RT_ARRAY_INDEX;
      case VARYING_SLOT_VIEWPORT:         return NAK_ATTR_VIEWPORT_INDEX;
      case VARYING_SLOT_PSIZ:             return NAK_ATTR_POINT_SIZE;
      case VARYING_SLOT_POS:              return NAK_ATTR_POSITION;
      case VARYING_SLOT_CLIP_DIST0:       return NAK_ATTR_CLIP_CULL_DIST_0;
      case VARYING_SLOT_CLIP_DIST1:       return NAK_ATTR_CLIP_CULL_DIST_4;
      default: unreachable("Invalid varying slot");
      }
   }
}

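/* Maps a fragment output slot to its NAK output offset.  For color outputs,
 * blend_idx selects the second output of a dual-source blend pair.
 */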
static uint16_t
nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx)
{
   switch (slot) {
   case FRAG_RESULT_DEPTH:
      assert(blend_idx == 0);
      return NAK_FS_OUT_DEPTH;

   case FRAG_RESULT_STENCIL:
      unreachable("EXT_shader_stencil_export not supported");

   case FRAG_RESULT_COLOR:
371 unreachable("Vulkan alway uses explicit locations");

   case FRAG_RESULT_SAMPLE_MASK:
      assert(blend_idx == 0);
      return NAK_FS_OUT_SAMPLE_MASK;

   default:
      assert(blend_idx < 2);
      return NAK_FS_OUT_COLOR((slot - FRAG_RESULT_DATA0) + blend_idx);
   }
}

uint16_t
nak_sysval_attr_addr(gl_system_value sysval)
{
   switch (sysval) {
   case SYSTEM_VALUE_PRIMITIVE_ID:  return NAK_ATTR_PRIMITIVE_ID;
   case SYSTEM_VALUE_FRAG_COORD:    return NAK_ATTR_POSITION;
   case SYSTEM_VALUE_POINT_COORD:   return NAK_ATTR_POINT_SPRITE;
   case SYSTEM_VALUE_TESS_COORD:    return NAK_ATTR_TESS_COORD;
   case SYSTEM_VALUE_INSTANCE_ID:   return NAK_ATTR_INSTANCE_ID;
   case SYSTEM_VALUE_VERTEX_ID:     return NAK_ATTR_VERTEX_ID;
   case SYSTEM_VALUE_FRONT_FACE:    return NAK_ATTR_FRONT_FACE;
   case SYSTEM_VALUE_LAYER_ID:      return NAK_ATTR_RT_ARRAY_INDEX;
   default: unreachable("Invalid system value");
   }
}

static uint8_t
nak_sysval_sysval_idx(gl_system_value sysval)
{
   switch (sysval) {
   case SYSTEM_VALUE_SUBGROUP_INVOCATION: return NAK_SV_LANE_ID;
   case SYSTEM_VALUE_VERTICES_IN:         return NAK_SV_VERTEX_COUNT;
   case SYSTEM_VALUE_INVOCATION_ID:       return NAK_SV_INVOCATION_ID;
   case SYSTEM_VALUE_HELPER_INVOCATION:   return NAK_SV_THREAD_KILL;
   case SYSTEM_VALUE_LOCAL_INVOCATION_ID: return NAK_SV_TID;
   case SYSTEM_VALUE_WORKGROUP_ID:        return NAK_SV_CTAID;
   case SYSTEM_VALUE_SUBGROUP_EQ_MASK:    return NAK_SV_LANEMASK_EQ;
   case SYSTEM_VALUE_SUBGROUP_LT_MASK:    return NAK_SV_LANEMASK_LT;
   case SYSTEM_VALUE_SUBGROUP_LE_MASK:    return NAK_SV_LANEMASK_LE;
   case SYSTEM_VALUE_SUBGROUP_GT_MASK:    return NAK_SV_LANEMASK_GT;
   case SYSTEM_VALUE_SUBGROUP_GE_MASK:    return NAK_SV_LANEMASK_GE;
   default: unreachable("Invalid system value");
   }
}

static bool
nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
                                  void *data)
{
   const struct nak_compiler *nak = data;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *val;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_primitive_id:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_vertex_id: {
      assert(b->shader->info.stage == MESA_SHADER_VERTEX ||
             b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
             b->shader->info.stage == MESA_SHADER_TESS_EVAL ||
             b->shader->info.stage == MESA_SHADER_GEOMETRY);
      const gl_system_value sysval =
         nir_system_value_from_intrinsic(intrin->intrinsic);
      const uint32_t addr = nak_sysval_attr_addr(sysval);
      val = nir_ald_nv(b, 1, nir_imm_int(b, 0), nir_imm_int(b, 0),
                       .base = addr, .flags = 0,
                       .range_base = addr, .range = 4,
                       .access = ACCESS_CAN_REORDER);
      break;
   }

   case nir_intrinsic_load_patch_vertices_in: {
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VERTEX_COUNT,
                               .access = ACCESS_CAN_REORDER);
      val = nir_extract_u8(b, val, nir_imm_int(b, 1));
      break;
   }

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_ge_mask: {
      const gl_system_value sysval =
         nir_system_value_from_intrinsic(intrin->intrinsic);
      const uint32_t idx = nak_sysval_sysval_idx(sysval);
      val = nir_load_sysval_nv(b, 32, .base = idx,
                               .access = ACCESS_CAN_REORDER);

      /* Pad with 0 because all invocations above 31 are off */
      if (intrin->def.bit_size == 64) {
         val = nir_u2u64(b, val);
      } else {
         assert(intrin->def.bit_size == 32);
         val = nir_pad_vector_imm_int(b, val, 0, intrin->def.num_components);
      }
      break;
   }

   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_workgroup_id: {
      const gl_system_value sysval =
         nir_system_value_from_intrinsic(intrin->intrinsic);
      const uint32_t idx = nak_sysval_sysval_idx(sysval);
      nir_def *comps[3];
      assert(intrin->def.num_components <= 3);
      for (unsigned c = 0; c < intrin->def.num_components; c++) {
         comps[c] = nir_load_sysval_nv(b, 32, .base = idx + c,
                                       .access = ACCESS_CAN_REORDER);
      }
      val = nir_vec(b, comps, intrin->def.num_components);
      break;
   }

   case nir_intrinsic_load_local_invocation_id: {
      nir_def *x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
                                      .access = ACCESS_CAN_REORDER);
      nir_def *y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
                                      .access = ACCESS_CAN_REORDER);
      nir_def *z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
                                      .access = ACCESS_CAN_REORDER);

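      /* With quad derivative groups, nak_postprocess_nir() re-packs the
       * workgroup as (2X, Y/2, Z) so that each 2x2 API quad is contiguous
       * along the HW X dimension:
       *
       *    hw_x = (api_x & 1) | ((api_y & 1) << 1) | ((api_x & ~1) << 1)
       *    hw_y = api_y >> 1
       *
       * Invert that mapping here so the shader still sees the API local
       * invocation ID.  For example, hw (x=6, y=1) maps back to api (2, 3).
       */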
      if (b->shader->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_def *x_lo = nir_iand_imm(b, x, 0x1);
         nir_def *y_lo = nir_ushr_imm(b, nir_iand_imm(b, x, 0x2), 1);
         nir_def *x_hi = nir_ushr_imm(b, nir_iand_imm(b, x, ~0x3), 1);
         nir_def *y_hi = nir_ishl_imm(b, y, 1);

         x = nir_ior(b, x_lo, x_hi);
         y = nir_ior(b, y_lo, y_hi);
      }

      val = nir_vec3(b, x, y, z);
      break;
   }

   case nir_intrinsic_load_num_subgroups: {
      assert(!b->shader->info.workgroup_size_variable);
      uint16_t wg_size = b->shader->info.workgroup_size[0] *
                         b->shader->info.workgroup_size[1] *
                         b->shader->info.workgroup_size[2];
      val = nir_imm_int(b, DIV_ROUND_UP(wg_size, 32));
      break;
   }

   case nir_intrinsic_load_subgroup_id:
      if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
         val = nir_imm_int(b, 0);
      } else {
         assert(!b->shader->info.workgroup_size_variable);
         nir_def *tid_x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
                                             .access = ACCESS_CAN_REORDER);
         nir_def *tid_y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
                                             .access = ACCESS_CAN_REORDER);
         nir_def *tid_z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
                                             .access = ACCESS_CAN_REORDER);

         const uint16_t *wg_size = b->shader->info.workgroup_size;
         nir_def *tid =
            nir_iadd(b, tid_x,
                     nir_iadd(b, nir_imul_imm(b, tid_y, wg_size[0]),
                              nir_imul_imm(b, tid_z, wg_size[0] * wg_size[1])));

         val = nir_udiv_imm(b, tid, 32);
      }
      break;

   case nir_intrinsic_is_helper_invocation: {
      /* Unlike load_helper_invocation, this one isn't re-orderable */
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_THREAD_KILL);
      break;
   }

   case nir_intrinsic_shader_clock: {
      /* The CS2R opcode can load 64 bits worth of sysval data at a time but
       * it's not actually atomic.  In order to get correct shader clocks, we
       * need to do a loop where we do
       *
       *    CS2R SV_CLOCK_HI
       *    CS2R SV_CLOCK_LO
       *    CS2R SV_CLOCK_HI
       *    CS2R SV_CLOCK_LO
       *    CS2R SV_CLOCK_HI
       *    ...
       *
       * The moment two high values are the same, we take the low value
       * between them and that gives us our clock.
       *
       * To make sure we don't run into any weird races, we also insert a
       * barrier after every load so that one load completes before we kick
       * off the next one.  That way, even if one load happens to be faster
       * than the other (they are variable latency, after all), we're still
       * guaranteed that the loads happen in the order we want.
       */
      nir_variable *clock =
         nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);

      nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
      nir_ssa_bar_nv(b, clock_hi);

      nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);

      nir_push_loop(b);
      {
         nir_def *last_clock = nir_load_var(b, clock);

         nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
         nir_ssa_bar_nv(b, clock_lo);

         clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK + 1);
         nir_ssa_bar_nv(b, clock_hi);

         nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);

         nir_break_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
      }
      nir_pop_loop(b, NULL);

      val = nir_load_var(b, clock);
      if (intrin->def.bit_size == 64)
         val = nir_pack_64_2x32(b, val);
      break;
   }

   case nir_intrinsic_load_warps_per_sm_nv:
      val = nir_imm_int(b, nak->warps_per_sm);
      break;

   case nir_intrinsic_load_sm_count_nv:
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTCFG);
      val = nir_ubitfield_extract_imm(b, val, 20, 9);
      break;

   case nir_intrinsic_load_warp_id_nv:
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
      val = nir_ubitfield_extract_imm(b, val, 8, 7);
      break;

   case nir_intrinsic_load_sm_id_nv:
      val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
      val = nir_ubitfield_extract_imm(b, val, 20, 9);
      break;

   default:
      return false;
   }

   if (intrin->def.bit_size == 1)
      val = nir_i2b(b, val);

   nir_def_rewrite_uses(&intrin->def, val);

   return true;
}

static bool
nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
{
   return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
                                     nir_metadata_none,
                                     (void *)nak);
}

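/* Translates NIR's transform feedback info into NAK's fixed-size tables.
 * For each XFB buffer b, attr_index[b][i] names the output attribute dword
 * (nak_varying_attr_addr() / 4) that is captured into dword i of the buffer
 * record; 0xff means the dword is skipped.
 */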
struct nak_xfb_info
nak_xfb_from_nir(const struct nir_xfb_info *nir_xfb)
{
   if (nir_xfb == NULL)
      return (struct nak_xfb_info) { };

   struct nak_xfb_info nak_xfb = { };

   u_foreach_bit(b, nir_xfb->buffers_written) {
      nak_xfb.stride[b] = nir_xfb->buffers[b].stride;
      nak_xfb.stream[b] = nir_xfb->buffer_to_stream[b];
   }
   memset(nak_xfb.attr_index, 0xff, sizeof(nak_xfb.attr_index)); /* = skip */

   for (unsigned o = 0; o < nir_xfb->output_count; o++) {
      const nir_xfb_output_info *out = &nir_xfb->outputs[o];
      const uint8_t b = out->buffer;
      assert(nir_xfb->buffers_written & BITFIELD_BIT(b));

      const uint16_t attr_addr = nak_varying_attr_addr(out->location);
      assert(attr_addr % 4 == 0);
      const uint16_t attr_idx = attr_addr / 4;

      assert(out->offset % 4 == 0);
      uint8_t out_idx = out->offset / 4;

      u_foreach_bit(c, out->component_mask)
         nak_xfb.attr_index[b][out_idx++] = attr_idx + c;

      nak_xfb.attr_count[b] = MAX2(nak_xfb.attr_count[b], out_idx);
   }

   return nak_xfb;
}

static bool
lower_fs_output_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *_data)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   uint16_t addr = nak_fs_out_addr(sem.location, sem.dual_source_blend_index) +
                   nir_src_as_uint(intrin->src[1]) * 16 +
                   nir_intrinsic_component(intrin) * 4;

   nir_def *data = intrin->src[0].ssa;

   /* The fs_out_nv intrinsic is always scalar */
   u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) {
      if (nir_scalar_is_undef(nir_scalar_resolved(data, c)))
         continue;

      nir_fs_out_nv(b, nir_channel(b, data, c), .base = addr + c * 4);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
nak_nir_lower_fs_outputs(nir_shader *nir)
{
   if (nir->info.outputs_written == 0)
      return false;

   bool progress = nir_shader_intrinsics_pass(nir, lower_fs_output_intrin,
                                              nir_metadata_control_flow,
                                              NULL);

   if (progress) {
      /* We need a copy_fs_outputs_nv intrinsic so NAK knows where to place
       * the final copy.  This needs to be in the last block, after all
       * store_output intrinsics.
       */
      nir_function_impl *impl = nir_shader_get_entrypoint(nir);
      nir_builder b = nir_builder_at(nir_after_impl(impl));
      nir_copy_fs_outputs_nv(&b);
   }

   return progress;
}

static bool
nak_nir_remove_barrier_intrin(nir_builder *b, nir_intrinsic_instr *barrier,
                              UNUSED void *_data)
{
   if (barrier->intrinsic != nir_intrinsic_barrier)
      return false;

   mesa_scope exec_scope = nir_intrinsic_execution_scope(barrier);
   assert(exec_scope <= SCOPE_WORKGROUP &&
          "Control barrier with scope > WORKGROUP");

   if (exec_scope == SCOPE_WORKGROUP &&
       nak_nir_workgroup_has_one_subgroup(b->shader))
      exec_scope = SCOPE_SUBGROUP;

   /* Because we're guaranteeing maximal convergence via warp barriers,
    * subgroup barriers do nothing.
    */
   if (exec_scope <= SCOPE_SUBGROUP)
      exec_scope = SCOPE_NONE;

   const nir_variable_mode mem_modes = nir_intrinsic_memory_modes(barrier);
   if (exec_scope == SCOPE_NONE && mem_modes == 0) {
      nir_instr_remove(&barrier->instr);
      return true;
   }

   /* In this case, we're leaving the barrier there */
   b->shader->info.uses_control_barrier = true;

   bool progress = false;
   if (exec_scope != nir_intrinsic_execution_scope(barrier)) {
      nir_intrinsic_set_execution_scope(barrier, exec_scope);
      progress = true;
   }

   return progress;
}

static bool
nak_nir_remove_barriers(nir_shader *nir)
{
   /* We'll set this back to true if we leave any barriers in place */
   nir->info.uses_control_barrier = false;

   return nir_shader_intrinsics_pass(nir, nak_nir_remove_barrier_intrin,
                                     nir_metadata_control_flow,
                                     NULL);
}

static bool
nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                     unsigned bit_size, unsigned num_components,
                     nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                     void *cb_data)
{
   /*
    * Since we legalize these later with nir_lower_mem_access_bit_sizes,
    * we can optimistically combine anything that might be profitable
    */
   assert(util_is_power_of_two_nonzero(align_mul));

   unsigned max_bytes = 128u / 8u;
   if (low->intrinsic == nir_intrinsic_ldc_nv ||
       low->intrinsic == nir_intrinsic_ldcx_nv)
      max_bytes = 64u / 8u;

   align_mul = MIN2(align_mul, max_bytes);
   align_offset = align_offset % align_mul;
   return align_offset + num_components * (bit_size / 8) <= align_mul;
}

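/* Size/alignment callback for nir_lower_mem_access_bit_sizes(): picks the
 * widest power-of-two access allowed by the size and alignment, capped at
 * 16B (8B for ldc), and expresses anything 4B or larger as a vector of
 * 32-bit components.  For example, a 16-byte load with 16-byte alignment
 * stays a single 32-bit x4 access, while the same load with only 4-byte
 * alignment gets split into 32-bit x1 chunks.
 */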
static nir_mem_access_size_align
nak_mem_access_size_align(nir_intrinsic_op intrin,
                          uint8_t bytes, uint8_t bit_size,
                          uint32_t align_mul, uint32_t align_offset,
                          bool offset_is_const, const void *cb_data)
{
   const uint32_t align = nir_combined_align(align_mul, align_offset);
   assert(util_is_power_of_two_nonzero(align));

   unsigned bytes_pow2;
   if (nir_intrinsic_infos[intrin].has_dest) {
      /* Reads can over-fetch a bit if the alignment is okay. */
      bytes_pow2 = util_next_power_of_two(bytes);
   } else {
      bytes_pow2 = 1 << (util_last_bit(bytes) - 1);
   }

   unsigned chunk_bytes = MIN3(bytes_pow2, align, 16);
   assert(util_is_power_of_two_nonzero(chunk_bytes));
   if (intrin == nir_intrinsic_ldc_nv ||
       intrin == nir_intrinsic_ldcx_nv)
      chunk_bytes = MIN2(chunk_bytes, 8);

   if ((intrin == nir_intrinsic_ldc_nv ||
        intrin == nir_intrinsic_ldcx_nv) && align < 4) {
      /* CBufs require 4B alignment unless we're doing a ldc.u8 or ldc.i8.
       * In particular, this applies to ldc.u16 which means we either have to
       * fall back to two ldc.u8 or use ldc.u32 and shift stuff around to get
       * the 16bit value out.  Fortunately, nir_lower_mem_access_bit_sizes()
       * can handle over-alignment for reads.
       */
      if (align == 2 || offset_is_const) {
         return (nir_mem_access_size_align) {
            .bit_size = 32,
            .num_components = 1,
            .align = 4,
         };
      } else {
         assert(align == 1);
         return (nir_mem_access_size_align) {
            .bit_size = 8,
            .num_components = 1,
            .align = 1,
         };
      }
   } else if (chunk_bytes < 4) {
      return (nir_mem_access_size_align) {
         .bit_size = chunk_bytes * 8,
         .num_components = 1,
         .align = chunk_bytes,
      };
   } else {
      return (nir_mem_access_size_align) {
         .bit_size = 32,
         .num_components = chunk_bytes / 4,
         .align = chunk_bytes,
      };
   }
}

static bool
nir_shader_has_local_variables(const nir_shader *nir)
{
   nir_foreach_function(func, nir) {
      if (func->impl && !exec_list_is_empty(&func->impl->locals))
         return true;
   }

   return false;
}

static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return glsl_count_vec4_slots(type, false, bindless);
}

void
nak_postprocess_nir(nir_shader *nir,
                    const struct nak_compiler *nak,
                    nir_variable_mode robust2_modes,
                    const struct nak_fs_key *fs_key)
{
   UNUSED bool progress = false;

   nak_optimize_nir(nir, nak);

   const nir_lower_subgroups_options subgroups_options = {
      .subgroup_size = NAK_SUBGROUP_SIZE,
      .ballot_bit_size = 32,
      .ballot_components = 1,
      .lower_to_scalar = true,
      .lower_vote_eq = true,
      .lower_first_invocation_to_ballot = true,
      .lower_read_first_invocation = true,
      .lower_elect = true,
      .lower_inverse_ballot = true,
      .lower_rotate_to_shuffle = true
   };
   OPT(nir, nir_lower_subgroups, &subgroups_options);
   OPT(nir, nak_nir_lower_scan_reduce);

   if (nir_shader_has_local_variables(nir)) {
      OPT(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
          glsl_get_natural_size_align_bytes);
      OPT(nir, nir_lower_explicit_io, nir_var_function_temp,
          nir_address_format_32bit_offset);
      nak_optimize_nir(nir, nak);
   }

   OPT(nir, nir_opt_shrink_vectors, true);

   nir_load_store_vectorize_options vectorize_opts = {};
   vectorize_opts.modes = nir_var_mem_global |
                          nir_var_mem_ssbo |
                          nir_var_mem_shared |
                          nir_var_shader_temp;
   vectorize_opts.callback = nak_mem_vectorize_cb;
   vectorize_opts.robust_modes = robust2_modes;
   OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);

   nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
      .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
      .callback = nak_mem_access_size_align,
   };
   OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
   OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);

   OPT(nir, nir_opt_combine_barriers, NULL, NULL);

   nak_optimize_nir(nir, nak);

   OPT(nir, nak_nir_lower_tex, nak);
   OPT(nir, nir_lower_idiv, NULL);

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   OPT(nir, nir_lower_indirect_derefs, 0, UINT32_MAX);

   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      OPT(nir, nir_lower_tess_coord_z,
          nir->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
   }

   /* We need to do this before nak_nir_lower_system_values() because it
    * relies on the workgroup size being the actual HW workgroup size in
    * nir_intrinsic_load_subgroup_id.
    */
   if (gl_shader_stage_uses_workgroup(nir->info.stage) &&
       nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
      assert(nir->info.workgroup_size[0] % 2 == 0);
      assert(nir->info.workgroup_size[1] % 2 == 0);
      nir->info.workgroup_size[0] *= 2;
      nir->info.workgroup_size[1] /= 2;
   }
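
   /* From here on, the NIR-visible workgroup size is the HW workgroup size:
    * e.g. an 8x8x1 API workgroup is launched as 16x4x1 so that each 2x2 API
    * quad lands in four consecutive HW X invocations.  The quad remap in
    * nak_nir_lower_system_value_intrin() undoes this re-packing for
    * load_local_invocation_id.
    */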

   OPT(nir, nak_nir_lower_system_values, nak);

   switch (nir->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_GEOMETRY:
      OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
          type_size_vec4, nir_lower_io_lower_64bit_to_32_new);
      OPT(nir, nir_opt_constant_folding);
      OPT(nir, nak_nir_lower_vtg_io, nak);
      if (nir->info.stage == MESA_SHADER_GEOMETRY)
         OPT(nir, nak_nir_lower_gs_intrinsics);
      break;

   case MESA_SHADER_FRAGMENT:
      OPT(nir, nir_lower_indirect_derefs,
          nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
      OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
          type_size_vec4, nir_lower_io_lower_64bit_to_32_new);
      OPT(nir, nir_opt_constant_folding);
      OPT(nir, nak_nir_lower_fs_inputs, nak, fs_key);
      OPT(nir, nak_nir_lower_fs_outputs);
      break;

   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL:
      break;

   default:
      unreachable("Unsupported shader stage");
   }

   OPT(nir, nir_lower_doubles, NULL, nak->nir_options.lower_doubles_options);
   OPT(nir, nir_lower_int64);

   nak_optimize_nir(nir, nak);

   do {
      progress = false;
      OPT(nir, nir_opt_algebraic_late);
      OPT(nir, nak_nir_lower_algebraic_late, nak);

      /* If we're lowering fp64 sat but not min/max, the sat lowering may have
       * been undone by nir_opt_algebraic.  Lower sat again just to be sure.
       */
      if ((nak->nir_options.lower_doubles_options & nir_lower_dsat) &&
          !(nak->nir_options.lower_doubles_options & nir_lower_dminmax))
         OPT(nir, nir_lower_doubles, NULL, nir_lower_dsat);

      if (progress) {
         OPT(nir, nir_opt_constant_folding);
         OPT(nir, nir_copy_prop);
         OPT(nir, nir_opt_dce);
         OPT(nir, nir_opt_cse);
      }
   } while (progress);

   if (nak->sm < 70)
      OPT(nir, nak_nir_split_64bit_conversions);

   nir_convert_to_lcssa(nir, true, true);
   nir_divergence_analysis(nir);

   if (nak->sm >= 75) {
      if (OPT(nir, nak_nir_lower_non_uniform_ldcx)) {
         OPT(nir, nir_copy_prop);
         OPT(nir, nir_opt_dce);
         nir_divergence_analysis(nir);
      }
   }

   OPT(nir, nak_nir_remove_barriers);

   if (nak->sm >= 70) {
      if (nak_should_print_nir()) {
         fprintf(stderr, "Structured NIR for %s shader:\n",
                 _mesa_shader_stage_to_string(nir->info.stage));
         nir_print_shader(nir, stderr);
      }
      OPT(nir, nak_nir_lower_cf);
   }

   /* Re-index blocks and compact SSA defs because we'll use them to index
    * arrays
    */
   nir_foreach_function(func, nir) {
      if (func->impl) {
         nir_index_blocks(func->impl);
         nir_index_ssa_defs(func->impl);
      }
   }

   if (nak_should_print_nir()) {
      fprintf(stderr, "NIR for %s shader:\n",
              _mesa_shader_stage_to_string(nir->info.stage));
      nir_print_shader(nir, stderr);
   }
}

static bool
scalar_is_imm_int(nir_scalar x, unsigned bits)
{
   if (!nir_scalar_is_const(x))
      return false;

   int64_t imm = nir_scalar_as_int(x);
   return u_intN_min(bits) <= imm && imm <= u_intN_max(bits);
}

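/* Splits an I/O address into a non-constant base and a small constant offset
 * that fits in imm_bits, so the backend can fold the constant into the
 * instruction's immediate offset field.  For example, given iadd(ssa_5, 16)
 * and imm_bits >= 6, this returns { .base = ssa_5, .offset = 16 }.
 */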
struct nak_io_addr_offset
nak_get_io_addr_offset(nir_def *addr, uint8_t imm_bits)
{
   nir_scalar addr_s = {
      .def = addr,
      .comp = 0,
   };
   if (scalar_is_imm_int(addr_s, imm_bits)) {
      /* Base is a dumb name for this.  It should be offset */
      return (struct nak_io_addr_offset) {
         .offset = nir_scalar_as_int(addr_s),
      };
   }

   addr_s = nir_scalar_chase_movs(addr_s);
   if (!nir_scalar_is_alu(addr_s) ||
       nir_scalar_alu_op(addr_s) != nir_op_iadd) {
      return (struct nak_io_addr_offset) {
         .base = addr_s,
      };
   }

   for (unsigned i = 0; i < 2; i++) {
      nir_scalar off_s = nir_scalar_chase_alu_src(addr_s, i);
      off_s = nir_scalar_chase_movs(off_s);
      if (scalar_is_imm_int(off_s, imm_bits)) {
         return (struct nak_io_addr_offset) {
            .base = nir_scalar_chase_alu_src(addr_s, 1 - i),
            .offset = nir_scalar_as_int(off_s),
         };
      }
   }

   return (struct nak_io_addr_offset) {
      .base = addr_s,
   };
}