xref: /aosp_15_r20/external/mesa3d/src/nouveau/compiler/nak_nir.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Collabora, Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "nak_private.h"
7 #include "nir_builder.h"
8 #include "nir_control_flow.h"
9 #include "nir_xfb_info.h"
10 
11 #include "util/u_math.h"
12 
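/* OPT() runs a NIR pass with NIR_PASS(), folds the result into the caller's
 * local `progress` flag, and evaluates to whether that particular pass made
 * progress so follow-up passes can be chained on it.  OPT_V() just runs the
 * pass without tracking progress.
 */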
13 #define OPT(nir, pass, ...) ({                           \
14    bool this_progress = false;                           \
15    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);    \
16    if (this_progress)                                    \
17       progress = true;                                   \
18    this_progress;                                        \
19 })
20 
21 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
22 
23 bool
24 nak_nir_workgroup_has_one_subgroup(const nir_shader *nir)
25 {
26    switch (nir->info.stage) {
27    case MESA_SHADER_VERTEX:
28    case MESA_SHADER_TESS_EVAL:
29    case MESA_SHADER_GEOMETRY:
30    case MESA_SHADER_FRAGMENT:
31       unreachable("Shader stage does not have workgroups");
32       break;
33 
34    case MESA_SHADER_TESS_CTRL:
35       /* Tessellation only ever has one subgroup per workgroup.  The Vulkan
36        * limit on the number of tessellation invocations is 32 to allow for
37        * this.
38        */
39       return true;
40 
41    case MESA_SHADER_COMPUTE:
42    case MESA_SHADER_KERNEL: {
43       if (nir->info.workgroup_size_variable)
44          return false;
45 
46       uint16_t wg_sz = nir->info.workgroup_size[0] *
47                        nir->info.workgroup_size[1] *
48                        nir->info.workgroup_size[2];
49 
50       return wg_sz <= NAK_SUBGROUP_SIZE;
51    }
52 
53    default:
54       unreachable("Unknown shader stage");
55    }
56 }
57 
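/* Width callback shared by nir_lower_alu_width() and nir_opt_vectorize()
 * below: keep 16-bit float ALU as 2-wide vectors (which the backend is
 * assumed to turn into paired-fp16 instructions) and scalarize everything
 * else.
 */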
58 static uint8_t
59 vectorize_filter_cb(const nir_instr *instr, const void *_data)
60 {
61    if (instr->type != nir_instr_type_alu)
62       return 0;
63 
64    const nir_alu_instr *alu = nir_instr_as_alu(instr);
65 
66    const unsigned bit_size = nir_alu_instr_is_comparison(alu)
67                              ? alu->src[0].src.ssa->bit_size
68                              : alu->def.bit_size;
69 
70    switch (alu->op) {
71    case nir_op_fadd:
72    case nir_op_fsub:
73    case nir_op_fabs:
74    case nir_op_fneg:
75    case nir_op_feq:
76    case nir_op_fge:
77    case nir_op_flt:
78    case nir_op_fneu:
79    case nir_op_fmul:
80    case nir_op_ffma:
81    case nir_op_fsign:
82    case nir_op_fsat:
83    case nir_op_fmax:
84    case nir_op_fmin:
85       return bit_size == 16 ? 2 : 1;
86    default:
87       return 1;
88    }
89 }
90 
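/* Standard NIR optimization loop, run until no pass makes progress.
 * allow_copies is only true for the first call, from nak_preprocess_nir(),
 * before copy_deref instructions have been lowered away.
 */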
91 static void
92 optimize_nir(nir_shader *nir, const struct nak_compiler *nak, bool allow_copies)
93 {
94    bool progress;
95 
96    unsigned lower_flrp =
97       (nir->options->lower_flrp16 ? 16 : 0) |
98       (nir->options->lower_flrp32 ? 32 : 0) |
99       (nir->options->lower_flrp64 ? 64 : 0);
100 
101    do {
102       progress = false;
103 
104       /* This pass is causing problems with types used by OpenCL:
105        *    https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
106        *
107        * Running with it disabled made no difference in the resulting assembly
108        * code.
109        */
110       if (nir->info.stage != MESA_SHADER_KERNEL)
111          OPT(nir, nir_split_array_vars, nir_var_function_temp);
112 
113       OPT(nir, nir_shrink_vec_array_vars, nir_var_function_temp);
114       OPT(nir, nir_opt_deref);
115       if (OPT(nir, nir_opt_memcpy))
116          OPT(nir, nir_split_var_copies);
117 
118       OPT(nir, nir_lower_vars_to_ssa);
119 
120       if (allow_copies) {
121          /* Only run this pass in the first call to optimize_nir.  Later
122           * calls assume that we've lowered away any copy_deref instructions
123           * and we don't want to introduce any more.
124           */
125          OPT(nir, nir_opt_find_array_copies);
126       }
127       OPT(nir, nir_opt_copy_prop_vars);
128       OPT(nir, nir_opt_dead_write_vars);
129       OPT(nir, nir_opt_combine_stores, nir_var_all);
130 
131       OPT(nir, nir_lower_alu_width, vectorize_filter_cb, NULL);
132       OPT(nir, nir_opt_vectorize, vectorize_filter_cb, NULL);
133       OPT(nir, nir_lower_phis_to_scalar, false);
134       OPT(nir, nir_lower_frexp);
135       OPT(nir, nir_copy_prop);
136       OPT(nir, nir_opt_dce);
137       OPT(nir, nir_opt_cse);
138 
139       OPT(nir, nir_opt_peephole_select, 0, false, false);
140       OPT(nir, nir_opt_intrinsics);
141       OPT(nir, nir_opt_idiv_const, 32);
142       OPT(nir, nir_opt_algebraic);
143       OPT(nir, nir_lower_constant_convert_alu_types);
144       OPT(nir, nir_opt_constant_folding);
145 
146       if (lower_flrp != 0) {
147          if (OPT(nir, nir_lower_flrp, lower_flrp, false /* always_precise */))
148             OPT(nir, nir_opt_constant_folding);
149          /* Nothing should rematerialize any flrps */
150          lower_flrp = 0;
151       }
152 
153       OPT(nir, nir_opt_dead_cf);
154       if (OPT(nir, nir_opt_loop)) {
155          /* If nir_opt_loop makes progress, then we need to clean things up
156           * if we want any hope of nir_opt_if or nir_opt_loop_unroll to make
157           * progress.
158           */
159          OPT(nir, nir_copy_prop);
160          OPT(nir, nir_opt_dce);
161       }
162       OPT(nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
163       OPT(nir, nir_opt_conditional_discard);
164       if (nir->options->max_unroll_iterations != 0) {
165          OPT(nir, nir_opt_loop_unroll);
166       }
167       OPT(nir, nir_opt_remove_phis);
168       OPT(nir, nir_opt_gcm, false);
169       OPT(nir, nir_opt_undef);
170       OPT(nir, nir_lower_pack);
171    } while (progress);
172 
173    OPT(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
174 }
175 
176 void
177 nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak)
178 {
179    optimize_nir(nir, nak, false);
180 }
181 
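/* Callback for nir_lower_bit_size() in nak_postprocess_nir().  Returning 0
 * keeps the instruction's bit size; returning N lowers it to N bits.  The
 * general rule is to promote 8/16-bit integer ALU, small subgroup operations,
 * and small phis to 32 bits, while keeping fp16 ALU that the target SM
 * supports natively.
 */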
182 static unsigned
183 lower_bit_size_cb(const nir_instr *instr, void *data)
184 {
185    const struct nak_compiler *nak = data;
186 
187    switch (instr->type) {
188    case nir_instr_type_alu: {
189       nir_alu_instr *alu = nir_instr_as_alu(instr);
190       if (nir_op_infos[alu->op].is_conversion)
191          return 0;
192 
193       const unsigned bit_size = nir_alu_instr_is_comparison(alu)
194                                 ? alu->src[0].src.ssa->bit_size
195                                 : alu->def.bit_size;
196 
197       switch (alu->op) {
198       case nir_op_bit_count:
199       case nir_op_ufind_msb:
200       case nir_op_ifind_msb:
201       case nir_op_find_lsb:
202          /* These are handled specially because the destination is always
203           * 32-bit and so the bit size of the instruction is given by the
204           * source.
205           */
206          return alu->src[0].src.ssa->bit_size == 32 ? 0 : 32;
207 
208       case nir_op_fabs:
209       case nir_op_fadd:
210       case nir_op_fneg:
211       case nir_op_feq:
212       case nir_op_fge:
213       case nir_op_flt:
214       case nir_op_fneu:
215       case nir_op_fmul:
216       case nir_op_ffma:
217       case nir_op_ffmaz:
218       case nir_op_fsign:
219       case nir_op_fsat:
220       case nir_op_fceil:
221       case nir_op_ffloor:
222       case nir_op_fround_even:
223       case nir_op_ftrunc:
224          if (bit_size == 16  && nak->sm >= 70)
225             return 0;
226          break;
227 
228       case nir_op_fmax:
229       case nir_op_fmin:
230          if (bit_size == 16 && nak->sm >= 80)
231             return 0;
232          break;
233 
234       default:
235          break;
236       }
237 
238       if (bit_size >= 32)
239          return 0;
240 
241       if (bit_size & (8 | 16))
242          return 32;
243 
244       return 0;
245    }
246 
247    case nir_instr_type_intrinsic: {
248       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
249       switch (intrin->intrinsic) {
250       case nir_intrinsic_vote_ieq:
251          if (intrin->src[0].ssa->bit_size != 1 &&
252              intrin->src[0].ssa->bit_size < 32)
253             return 32;
254          return 0;
255 
256       case nir_intrinsic_vote_feq:
257       case nir_intrinsic_read_invocation:
258       case nir_intrinsic_read_first_invocation:
259       case nir_intrinsic_shuffle:
260       case nir_intrinsic_shuffle_xor:
261       case nir_intrinsic_shuffle_up:
262       case nir_intrinsic_shuffle_down:
263       case nir_intrinsic_quad_broadcast:
264       case nir_intrinsic_quad_swap_horizontal:
265       case nir_intrinsic_quad_swap_vertical:
266       case nir_intrinsic_quad_swap_diagonal:
267       case nir_intrinsic_reduce:
268       case nir_intrinsic_inclusive_scan:
269       case nir_intrinsic_exclusive_scan:
270          if (intrin->src[0].ssa->bit_size < 32)
271             return 32;
272          return 0;
273 
274       default:
275          return 0;
276       }
277    }
278 
279    case nir_instr_type_phi: {
280       nir_phi_instr *phi = nir_instr_as_phi(instr);
281       if (phi->def.bit_size < 32 && phi->def.bit_size != 1)
282          return 32;
283       return 0;
284    }
285 
286    default:
287       return 0;
288    }
289 }
290 
291 void
292 nak_preprocess_nir(nir_shader *nir, const struct nak_compiler *nak)
293 {
294    UNUSED bool progress = false;
295 
296    nir_validate_ssa_dominance(nir, "before nak_preprocess_nir");
297 
298    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
299       nir_lower_io_to_temporaries(nir, nir_shader_get_entrypoint(nir),
300                                   true /* outputs */, false /* inputs */);
301    }
302 
303    const nir_lower_tex_options tex_options = {
304       .lower_txd_3d = true,
305       .lower_txd_cube_map = true,
306       .lower_txd_clamp = true,
307       .lower_txd_shadow = true,
308       .lower_txp = ~0,
309       /* TODO: More lowering */
310    };
311    OPT(nir, nir_lower_tex, &tex_options);
312    OPT(nir, nir_normalize_cubemap_coords);
313 
314    nir_lower_image_options image_options = {
315       .lower_cube_size = true,
316    };
317    OPT(nir, nir_lower_image, &image_options);
318 
319    OPT(nir, nir_lower_global_vars_to_local);
320 
321    OPT(nir, nir_split_var_copies);
322    OPT(nir, nir_split_struct_vars, nir_var_function_temp);
323 
324    /* Optimize but allow copies because we haven't lowered them yet */
325    optimize_nir(nir, nak, true /* allow_copies */);
326 
327    OPT(nir, nir_lower_load_const_to_scalar);
328    OPT(nir, nir_lower_var_copies);
329    OPT(nir, nir_lower_system_values);
330    OPT(nir, nir_lower_compute_system_values, NULL);
331 
332    if (nir->info.stage == MESA_SHADER_FRAGMENT)
333       OPT(nir, nir_lower_terminate_to_demote);
334 }
335 
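/* Returns the attribute address for a varying slot in the hardware's I/O
 * layout.  Generic and per-patch varyings are laid out as vec4s (0x10 bytes
 * per slot) starting at NAK_ATTR_GENERIC_START / NAK_ATTR_PATCH_START, so for
 * example VARYING_SLOT_VAR3 maps to NAK_ATTR_GENERIC_START + 0x30; the
 * remaining slots have fixed addresses.
 */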
336 uint16_t
337 nak_varying_attr_addr(gl_varying_slot slot)
338 {
339    if (slot >= VARYING_SLOT_PATCH0) {
340       return NAK_ATTR_PATCH_START + (slot - VARYING_SLOT_PATCH0) * 0x10;
341    } else if (slot >= VARYING_SLOT_VAR0) {
342       return NAK_ATTR_GENERIC_START + (slot - VARYING_SLOT_VAR0) * 0x10;
343    } else {
344       switch (slot) {
345       case VARYING_SLOT_TESS_LEVEL_OUTER: return NAK_ATTR_TESS_LOD;
346       case VARYING_SLOT_TESS_LEVEL_INNER: return NAK_ATTR_TESS_INTERRIOR;
347       case VARYING_SLOT_PRIMITIVE_ID:     return NAK_ATTR_PRIMITIVE_ID;
348       case VARYING_SLOT_LAYER:            return NAK_ATTR_RT_ARRAY_INDEX;
349       case VARYING_SLOT_VIEWPORT:         return NAK_ATTR_VIEWPORT_INDEX;
350       case VARYING_SLOT_PSIZ:             return NAK_ATTR_POINT_SIZE;
351       case VARYING_SLOT_POS:              return NAK_ATTR_POSITION;
352       case VARYING_SLOT_CLIP_DIST0:       return NAK_ATTR_CLIP_CULL_DIST_0;
353       case VARYING_SLOT_CLIP_DIST1:       return NAK_ATTR_CLIP_CULL_DIST_4;
354       default: unreachable("Invalid varying slot");
355       }
356    }
357 }
358 
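/* Maps a fragment output slot and dual-source blend index to a NAK FS output
 * address.  Color outputs land in NAK_FS_OUT_COLOR() slots, with the
 * dual-source blend index added to the render-target index; depth and sample
 * mask have dedicated addresses.
 */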
359 static uint16_t
360 nak_fs_out_addr(gl_frag_result slot, uint32_t blend_idx)
361 {
362    switch (slot) {
363    case FRAG_RESULT_DEPTH:
364       assert(blend_idx == 0);
365       return NAK_FS_OUT_DEPTH;
366 
367    case FRAG_RESULT_STENCIL:
368       unreachable("EXT_shader_stencil_export not supported");
369 
370    case FRAG_RESULT_COLOR:
371       unreachable("Vulkan always uses explicit locations");
372 
373    case FRAG_RESULT_SAMPLE_MASK:
374       assert(blend_idx == 0);
375       return NAK_FS_OUT_SAMPLE_MASK;
376 
377    default:
378       assert(blend_idx < 2);
379       return NAK_FS_OUT_COLOR((slot - FRAG_RESULT_DATA0) + blend_idx);
380    }
381 }
382 
383 uint16_t
384 nak_sysval_attr_addr(gl_system_value sysval)
385 {
386    switch (sysval) {
387    case SYSTEM_VALUE_PRIMITIVE_ID:  return NAK_ATTR_PRIMITIVE_ID;
388    case SYSTEM_VALUE_FRAG_COORD:    return NAK_ATTR_POSITION;
389    case SYSTEM_VALUE_POINT_COORD:   return NAK_ATTR_POINT_SPRITE;
390    case SYSTEM_VALUE_TESS_COORD:    return NAK_ATTR_TESS_COORD;
391    case SYSTEM_VALUE_INSTANCE_ID:   return NAK_ATTR_INSTANCE_ID;
392    case SYSTEM_VALUE_VERTEX_ID:     return NAK_ATTR_VERTEX_ID;
393    case SYSTEM_VALUE_FRONT_FACE:    return NAK_ATTR_FRONT_FACE;
394    case SYSTEM_VALUE_LAYER_ID:      return NAK_ATTR_RT_ARRAY_INDEX;
395    default: unreachable("Invalid system value");
396    }
397 }
398 
399 static uint8_t
400 nak_sysval_sysval_idx(gl_system_value sysval)
401 {
402    switch (sysval) {
403    case SYSTEM_VALUE_SUBGROUP_INVOCATION:    return NAK_SV_LANE_ID;
404    case SYSTEM_VALUE_VERTICES_IN:            return NAK_SV_VERTEX_COUNT;
405    case SYSTEM_VALUE_INVOCATION_ID:          return NAK_SV_INVOCATION_ID;
406    case SYSTEM_VALUE_HELPER_INVOCATION:      return NAK_SV_THREAD_KILL;
407    case SYSTEM_VALUE_LOCAL_INVOCATION_ID:    return NAK_SV_TID;
408    case SYSTEM_VALUE_WORKGROUP_ID:           return NAK_SV_CTAID;
409    case SYSTEM_VALUE_SUBGROUP_EQ_MASK:       return NAK_SV_LANEMASK_EQ;
410    case SYSTEM_VALUE_SUBGROUP_LT_MASK:       return NAK_SV_LANEMASK_LT;
411    case SYSTEM_VALUE_SUBGROUP_LE_MASK:       return NAK_SV_LANEMASK_LE;
412    case SYSTEM_VALUE_SUBGROUP_GT_MASK:       return NAK_SV_LANEMASK_GT;
413    case SYSTEM_VALUE_SUBGROUP_GE_MASK:       return NAK_SV_LANEMASK_GE;
414    default: unreachable("Invalid system value");
415    }
416 }
417 
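/* nir_shader_intrinsics_pass() callback that rewrites system-value intrinsics
 * into NAK's low-level nir_load_sysval_nv / nir_ald_nv intrinsics, where
 * .base selects the hardware special register or attribute address.
 */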
418 static bool
419 nak_nir_lower_system_value_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
420                                   void *data)
421 {
422    const struct nak_compiler *nak = data;
423 
424    b->cursor = nir_before_instr(&intrin->instr);
425 
426    nir_def *val;
427    switch (intrin->intrinsic) {
428    case nir_intrinsic_load_primitive_id:
429    case nir_intrinsic_load_instance_id:
430    case nir_intrinsic_load_vertex_id: {
431       assert(b->shader->info.stage == MESA_SHADER_VERTEX ||
432              b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
433              b->shader->info.stage == MESA_SHADER_TESS_EVAL ||
434              b->shader->info.stage == MESA_SHADER_GEOMETRY);
435       const gl_system_value sysval =
436          nir_system_value_from_intrinsic(intrin->intrinsic);
437       const uint32_t addr = nak_sysval_attr_addr(sysval);
438       val = nir_ald_nv(b, 1, nir_imm_int(b, 0), nir_imm_int(b, 0),
439                        .base = addr, .flags = 0,
440                        .range_base = addr, .range = 4,
441                        .access = ACCESS_CAN_REORDER);
442       break;
443    }
444 
445    case nir_intrinsic_load_patch_vertices_in: {
446       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VERTEX_COUNT,
447                                .access = ACCESS_CAN_REORDER);
448       val = nir_extract_u8(b, val, nir_imm_int(b, 1));
449       break;
450    }
451 
452    case nir_intrinsic_load_subgroup_eq_mask:
453    case nir_intrinsic_load_subgroup_lt_mask:
454    case nir_intrinsic_load_subgroup_le_mask:
455    case nir_intrinsic_load_subgroup_gt_mask:
456    case nir_intrinsic_load_subgroup_ge_mask: {
457       const gl_system_value sysval =
458          nir_system_value_from_intrinsic(intrin->intrinsic);
459       const uint32_t idx = nak_sysval_sysval_idx(sysval);
460       val = nir_load_sysval_nv(b, 32, .base = idx,
461                                .access = ACCESS_CAN_REORDER);
462 
463       /* Pad with 0 because all invocations above 31 are off */
464       if (intrin->def.bit_size == 64) {
465          val = nir_u2u64(b, val);
466       } else {
467          assert(intrin->def.bit_size == 32);
468          val = nir_pad_vector_imm_int(b, val, 0, intrin->def.num_components);
469       }
470       break;
471    }
472 
473    case nir_intrinsic_load_subgroup_invocation:
474    case nir_intrinsic_load_helper_invocation:
475    case nir_intrinsic_load_invocation_id:
476    case nir_intrinsic_load_workgroup_id: {
477       const gl_system_value sysval =
478          nir_system_value_from_intrinsic(intrin->intrinsic);
479       const uint32_t idx = nak_sysval_sysval_idx(sysval);
480       nir_def *comps[3];
481       assert(intrin->def.num_components <= 3);
482       for (unsigned c = 0; c < intrin->def.num_components; c++) {
483          comps[c] = nir_load_sysval_nv(b, 32, .base = idx + c,
484                                        .access = ACCESS_CAN_REORDER);
485       }
486       val = nir_vec(b, comps, intrin->def.num_components);
487       break;
488    }
489 
490    case nir_intrinsic_load_local_invocation_id: {
491       nir_def *x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
492                                       .access = ACCESS_CAN_REORDER);
493       nir_def *y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
494                                       .access = ACCESS_CAN_REORDER);
495       nir_def *z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
496                                       .access = ACCESS_CAN_REORDER);
497 
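      /* When the derivative group is per-quad, nak_postprocess_nir() doubled
       * the workgroup's X dimension and halved Y so that each 2x2 quad
       * occupies four consecutive hardware invocations.  Undo that swizzle
       * here: bit 0 of hardware X stays in X, bit 1 becomes bit 0 of Y, and
       * the remaining bits shift down/up to rebuild the API's local ID.
       */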
498       if (b->shader->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
499          nir_def *x_lo = nir_iand_imm(b, x, 0x1);
500          nir_def *y_lo = nir_ushr_imm(b, nir_iand_imm(b, x, 0x2), 1);
501          nir_def *x_hi = nir_ushr_imm(b, nir_iand_imm(b, x, ~0x3), 1);
502          nir_def *y_hi = nir_ishl_imm(b, y, 1);
503 
504          x = nir_ior(b, x_lo, x_hi);
505          y = nir_ior(b, y_lo, y_hi);
506       }
507 
508       val = nir_vec3(b, x, y, z);
509       break;
510    }
511 
512    case nir_intrinsic_load_num_subgroups: {
513       assert(!b->shader->info.workgroup_size_variable);
514       uint16_t wg_size = b->shader->info.workgroup_size[0] *
515                          b->shader->info.workgroup_size[1] *
516                          b->shader->info.workgroup_size[2];
517       val = nir_imm_int(b, DIV_ROUND_UP(wg_size, 32));
518       break;
519    }
520 
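   /* subgroup_id is simply the linear local invocation index divided by the
    * 32-invocation subgroup size; with a single subgroup per workgroup it is
    * always 0.
    */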
521    case nir_intrinsic_load_subgroup_id:
522       if (nak_nir_workgroup_has_one_subgroup(b->shader)) {
523          val = nir_imm_int(b, 0);
524       } else {
525          assert(!b->shader->info.workgroup_size_variable);
526          nir_def *tid_x = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_X,
527                                              .access = ACCESS_CAN_REORDER);
528          nir_def *tid_y = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Y,
529                                              .access = ACCESS_CAN_REORDER);
530          nir_def *tid_z = nir_load_sysval_nv(b, 32, .base = NAK_SV_TID_Z,
531                                              .access = ACCESS_CAN_REORDER);
532 
533          const uint16_t *wg_size = b->shader->info.workgroup_size;
534          nir_def *tid =
535             nir_iadd(b, tid_x,
536             nir_iadd(b, nir_imul_imm(b, tid_y, wg_size[0]),
537                         nir_imul_imm(b, tid_z, wg_size[0] * wg_size[1])));
538 
539          val = nir_udiv_imm(b, tid, 32);
540       }
541       break;
542 
543    case nir_intrinsic_is_helper_invocation: {
544       /* Unlike load_helper_invocation, this one isn't re-orderable */
545       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_THREAD_KILL);
546       break;
547    }
548 
549    case nir_intrinsic_shader_clock: {
550       /* The CS2R opcode can load 64 bits worth of sysval data at a time but
551        * it's not actually atomic.  In order to get correct shader clocks, we
552        * need to do a loop where we do
553        *
554        *    CS2R SV_CLOCK_HI
555        *    CS2R SV_CLOCK_LO
556        *    CS2R SV_CLOCK_HI
557        *    CS2R SV_CLOCK_LO
558        *    CS2R SV_CLOCK_HI
559        *    ...
560        *
561        * The moment two high values are the same, we take the low value
562        * between them and that gives us our clock.
563        *
564        * In order to make sure we don't run into any weird races, we also need
565        * to insert a barrier after every load to ensure one load completes
566        * before we kick off the next load.  That way, even if one load happens
567        * to be faster than the other (they are variable latency, after all),
568        * we're still guaranteed that the loads happen in the order we want.
569        */
570       nir_variable *clock =
571          nir_local_variable_create(b->impl, glsl_uvec2_type(), NULL);
572 
573       nir_def *clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
574       nir_ssa_bar_nv(b, clock_hi);
575 
576       nir_store_var(b, clock, nir_vec2(b, nir_imm_int(b, 0), clock_hi), 0x3);
577 
578       nir_push_loop(b);
579       {
580          nir_def *last_clock = nir_load_var(b, clock);
581 
582          nir_def *clock_lo = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_LO);
583          nir_ssa_bar_nv(b, clock_lo);
584 
585          clock_hi = nir_load_sysval_nv(b, 32, .base = NAK_SV_CLOCK_HI);
586          nir_ssa_bar_nv(b, clock_hi);
587 
588          nir_store_var(b, clock, nir_vec2(b, clock_lo, clock_hi), 0x3);
589 
590          nir_break_if(b, nir_ieq(b, clock_hi, nir_channel(b, last_clock, 1)));
591       }
592       nir_pop_loop(b, NULL);
593 
594       val = nir_load_var(b, clock);
595       if (intrin->def.bit_size == 64)
596          val = nir_pack_64_2x32(b, val);
597       break;
598    }
599 
600    case nir_intrinsic_load_warps_per_sm_nv:
601       val = nir_imm_int(b, nak->warps_per_sm);
602       break;
603 
604    case nir_intrinsic_load_sm_count_nv:
605       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTCFG);
606       val = nir_ubitfield_extract_imm(b, val, 20, 9);
607       break;
608 
609    case nir_intrinsic_load_warp_id_nv:
610       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
611       val = nir_ubitfield_extract_imm(b, val, 8, 7);
612       break;
613 
614    case nir_intrinsic_load_sm_id_nv:
615       val = nir_load_sysval_nv(b, 32, .base = NAK_SV_VIRTID);
616       val = nir_ubitfield_extract_imm(b, val, 20, 9);
617       break;
618 
619    default:
620       return false;
621    }
622 
623    if (intrin->def.bit_size == 1)
624       val = nir_i2b(b, val);
625 
626    nir_def_rewrite_uses(&intrin->def, val);
627 
628    return true;
629 }
630 
631 static bool
632 nak_nir_lower_system_values(nir_shader *nir, const struct nak_compiler *nak)
633 {
634    return nir_shader_intrinsics_pass(nir, nak_nir_lower_system_value_intrin,
635                                      nir_metadata_none,
636                                      (void *)nak);
637 }
638 
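/* Translates NIR transform feedback info into NAK's fixed-size tables: a
 * per-buffer stride and stream plus an attr_index table with one entry per
 * 32-bit output dword, where 0xff means "skip this dword".
 */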
639 struct nak_xfb_info
640 nak_xfb_from_nir(const struct nir_xfb_info *nir_xfb)
641 {
642    if (nir_xfb == NULL)
643       return (struct nak_xfb_info) { };
644 
645    struct nak_xfb_info nak_xfb = { };
646 
647    u_foreach_bit(b, nir_xfb->buffers_written) {
648       nak_xfb.stride[b] = nir_xfb->buffers[b].stride;
649       nak_xfb.stream[b] = nir_xfb->buffer_to_stream[b];
650    }
651    memset(nak_xfb.attr_index, 0xff, sizeof(nak_xfb.attr_index)); /* = skip */
652 
653    for (unsigned o = 0; o < nir_xfb->output_count; o++) {
654       const nir_xfb_output_info *out = &nir_xfb->outputs[o];
655       const uint8_t b = out->buffer;
656       assert(nir_xfb->buffers_written & BITFIELD_BIT(b));
657 
658       const uint16_t attr_addr = nak_varying_attr_addr(out->location);
659       assert(attr_addr % 4 == 0);
660       const uint16_t attr_idx = attr_addr / 4;
661 
662       assert(out->offset % 4 == 0);
663       uint8_t out_idx = out->offset / 4;
664 
665       u_foreach_bit(c, out->component_mask)
666          nak_xfb.attr_index[b][out_idx++] = attr_idx + c;
667 
668       nak_xfb.attr_count[b] = MAX2(nak_xfb.attr_count[b], out_idx);
669    }
670 
671    return nak_xfb;
672 }
673 
674 static bool
675 lower_fs_output_intrin(nir_builder *b, nir_intrinsic_instr *intrin, void *_data)
676 {
677    if (intrin->intrinsic != nir_intrinsic_store_output)
678       return false;
679 
680    b->cursor = nir_before_instr(&intrin->instr);
681 
682    const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
683    uint16_t addr = nak_fs_out_addr(sem.location, sem.dual_source_blend_index) +
684                    nir_src_as_uint(intrin->src[1]) * 16 +
685                    nir_intrinsic_component(intrin) * 4;
686 
687    nir_def *data = intrin->src[0].ssa;
688 
689    /* The fs_out_nv intrinsic is always scalar */
690    u_foreach_bit(c, nir_intrinsic_write_mask(intrin)) {
691       if (nir_scalar_is_undef(nir_scalar_resolved(data, c)))
692          continue;
693 
694       nir_fs_out_nv(b, nir_channel(b, data, c), .base = addr + c * 4);
695    }
696 
697    nir_instr_remove(&intrin->instr);
698 
699    return true;
700 }
701 
702 static bool
703 nak_nir_lower_fs_outputs(nir_shader *nir)
704 {
705    if (nir->info.outputs_written == 0)
706       return false;
707 
708    bool progress = nir_shader_intrinsics_pass(nir, lower_fs_output_intrin,
709                                               nir_metadata_control_flow,
710                                               NULL);
711 
712    if (progress) {
713       /* We need a copy_fs_outputs_nv intrinsic so NAK knows where to place
714        * the final copy.  This needs to be in the last block, after all
715        * store_output intrinsics.
716        */
717       nir_function_impl *impl = nir_shader_get_entrypoint(nir);
718       nir_builder b = nir_builder_at(nir_after_impl(impl));
719       nir_copy_fs_outputs_nv(&b);
720    }
721 
722    return progress;
723 }
724 
725 static bool
726 nak_nir_remove_barrier_intrin(nir_builder *b, nir_intrinsic_instr *barrier,
727                               UNUSED void *_data)
728 {
729    if (barrier->intrinsic != nir_intrinsic_barrier)
730       return false;
731 
732    mesa_scope exec_scope = nir_intrinsic_execution_scope(barrier);
733    assert(exec_scope <= SCOPE_WORKGROUP &&
734           "Control barrier with scope > WORKGROUP");
735 
736    if (exec_scope == SCOPE_WORKGROUP &&
737        nak_nir_workgroup_has_one_subgroup(b->shader))
738       exec_scope = SCOPE_SUBGROUP;
739 
740    /* Because we're guaranteeing maximal convergence via warp barriers,
741     * subgroup barriers do nothing.
742     */
743    if (exec_scope <= SCOPE_SUBGROUP)
744       exec_scope = SCOPE_NONE;
745 
746    const nir_variable_mode mem_modes = nir_intrinsic_memory_modes(barrier);
747    if (exec_scope == SCOPE_NONE && mem_modes == 0) {
748       nir_instr_remove(&barrier->instr);
749       return true;
750    }
751 
752    /* In this case, we're leaving the barrier there */
753    b->shader->info.uses_control_barrier = true;
754 
755    bool progress = false;
756    if (exec_scope != nir_intrinsic_execution_scope(barrier)) {
757       nir_intrinsic_set_execution_scope(barrier, exec_scope);
758       progress = true;
759    }
760 
761    return progress;
762 }
763 
764 static bool
765 nak_nir_remove_barriers(nir_shader *nir)
766 {
767    /* We'll set this back to true if we leave any barriers in place */
768    nir->info.uses_control_barrier = false;
769 
770    return nir_shader_intrinsics_pass(nir, nak_nir_remove_barrier_intrin,
771                                      nir_metadata_control_flow,
772                                      NULL);
773 }
774 
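/* nir_opt_load_store_vectorize() callback: optimistically allow combining
 * accesses up to 128 bits (64 bits for constant-buffer loads), counting on
 * nir_lower_mem_access_bit_sizes() below to split anything that is not
 * actually legal for the hardware.
 */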
775 static bool
776 nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
777                      unsigned bit_size, unsigned num_components,
778                      nir_intrinsic_instr *low, nir_intrinsic_instr *high,
779                      void *cb_data)
780 {
781    /*
782     * Since we legalize these later with nir_lower_mem_access_bit_sizes,
783     * we can optimistically combine anything that might be profitable
784     */
785    assert(util_is_power_of_two_nonzero(align_mul));
786 
787    unsigned max_bytes = 128u / 8u;
788    if (low->intrinsic == nir_intrinsic_ldc_nv ||
789        low->intrinsic == nir_intrinsic_ldcx_nv)
790       max_bytes = 64u / 8u;
791 
792    align_mul = MIN2(align_mul, max_bytes);
793    align_offset = align_offset % align_mul;
794    return align_offset + num_components * (bit_size / 8) <= align_mul;
795 }
796 
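/* nir_lower_mem_access_bit_sizes() callback: pick a power-of-two access size
 * limited by the known alignment and by 16 bytes (8 bytes for ldc), emitted
 * as 8/16-bit scalars below 4 bytes and as 32-bit vectors otherwise, with
 * under-aligned constant-buffer loads rounded up to an aligned 32-bit read
 * when possible.
 */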
797 static nir_mem_access_size_align
798 nak_mem_access_size_align(nir_intrinsic_op intrin,
799                           uint8_t bytes, uint8_t bit_size,
800                           uint32_t align_mul, uint32_t align_offset,
801                           bool offset_is_const, const void *cb_data)
802 {
803    const uint32_t align = nir_combined_align(align_mul, align_offset);
804    assert(util_is_power_of_two_nonzero(align));
805 
806    unsigned bytes_pow2;
807    if (nir_intrinsic_infos[intrin].has_dest) {
808       /* Reads can over-fetch a bit if the alignment is okay. */
809       bytes_pow2 = util_next_power_of_two(bytes);
810    } else {
811       bytes_pow2 = 1 << (util_last_bit(bytes) - 1);
812    }
813 
814    unsigned chunk_bytes = MIN3(bytes_pow2, align, 16);
815    assert(util_is_power_of_two_nonzero(chunk_bytes));
816    if (intrin == nir_intrinsic_ldc_nv ||
817        intrin == nir_intrinsic_ldcx_nv)
818       chunk_bytes = MIN2(chunk_bytes, 8);
819 
820    if ((intrin == nir_intrinsic_ldc_nv ||
821         intrin == nir_intrinsic_ldcx_nv) && align < 4) {
822       /* CBufs require 4B alignment unless we're doing a ldc.u8 or ldc.i8.
823        * In particular, this applies to ldc.u16 which means we either have to
824        * fall back to two ldc.u8 or use ldc.u32 and shift stuff around to get
825        * the 16bit value out.  Fortunately, nir_lower_mem_access_bit_sizes()
826        * can handle over-alignment for reads.
827        */
828       if (align == 2 || offset_is_const) {
829          return (nir_mem_access_size_align) {
830             .bit_size = 32,
831             .num_components = 1,
832             .align = 4,
833          };
834       } else {
835          assert(align == 1);
836          return (nir_mem_access_size_align) {
837             .bit_size = 8,
838             .num_components = 1,
839             .align = 1,
840          };
841       }
842    } else if (chunk_bytes < 4) {
843       return (nir_mem_access_size_align) {
844          .bit_size = chunk_bytes * 8,
845          .num_components = 1,
846          .align = chunk_bytes,
847       };
848    } else {
849       return (nir_mem_access_size_align) {
850          .bit_size = 32,
851          .num_components = chunk_bytes / 4,
852          .align = chunk_bytes,
853       };
854    }
855 }
856 
857 static bool
858 nir_shader_has_local_variables(const nir_shader *nir)
859 {
860    nir_foreach_function(func, nir) {
861       if (func->impl && !exec_list_is_empty(&func->impl->locals))
862          return true;
863    }
864 
865    return false;
866 }
867 
868 static int
869 type_size_vec4(const struct glsl_type *type, bool bindless)
870 {
871    return glsl_count_vec4_slots(type, false, bindless);
872 }
873 
874 void
875 nak_postprocess_nir(nir_shader *nir,
876                     const struct nak_compiler *nak,
877                     nir_variable_mode robust2_modes,
878                     const struct nak_fs_key *fs_key)
879 {
880    UNUSED bool progress = false;
881 
882    nak_optimize_nir(nir, nak);
883 
884    const nir_lower_subgroups_options subgroups_options = {
885       .subgroup_size = NAK_SUBGROUP_SIZE,
886       .ballot_bit_size = 32,
887       .ballot_components = 1,
888       .lower_to_scalar = true,
889       .lower_vote_eq = true,
890       .lower_first_invocation_to_ballot = true,
891       .lower_read_first_invocation = true,
892       .lower_elect = true,
893       .lower_inverse_ballot = true,
894       .lower_rotate_to_shuffle = true
895    };
896    OPT(nir, nir_lower_subgroups, &subgroups_options);
897    OPT(nir, nak_nir_lower_scan_reduce);
898 
899    if (nir_shader_has_local_variables(nir)) {
900       OPT(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
901           glsl_get_natural_size_align_bytes);
902       OPT(nir, nir_lower_explicit_io, nir_var_function_temp,
903           nir_address_format_32bit_offset);
904       nak_optimize_nir(nir, nak);
905    }
906 
907    OPT(nir, nir_opt_shrink_vectors, true);
908 
909    nir_load_store_vectorize_options vectorize_opts = {};
910    vectorize_opts.modes = nir_var_mem_global |
911                           nir_var_mem_ssbo |
912                           nir_var_mem_shared |
913                           nir_var_shader_temp;
914    vectorize_opts.callback = nak_mem_vectorize_cb;
915    vectorize_opts.robust_modes = robust2_modes;
916    OPT(nir, nir_opt_load_store_vectorize, &vectorize_opts);
917 
918    nir_lower_mem_access_bit_sizes_options mem_bit_size_options = {
919       .modes = nir_var_mem_constant | nir_var_mem_ubo | nir_var_mem_generic,
920       .callback = nak_mem_access_size_align,
921    };
922    OPT(nir, nir_lower_mem_access_bit_sizes, &mem_bit_size_options);
923    OPT(nir, nir_lower_bit_size, lower_bit_size_cb, (void *)nak);
924 
925    OPT(nir, nir_opt_combine_barriers, NULL, NULL);
926 
927    nak_optimize_nir(nir, nak);
928 
929    OPT(nir, nak_nir_lower_tex, nak);
930    OPT(nir, nir_lower_idiv, NULL);
931 
932    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
933 
934    OPT(nir, nir_lower_indirect_derefs, 0, UINT32_MAX);
935 
936    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
937       OPT(nir, nir_lower_tess_coord_z,
938           nir->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
939    }
940 
941    /* We need to do this before nak_nir_lower_system_values() because it
942     * relies on the workgroup size being the actual HW workgroup size in
943     * nir_intrinsic_load_subgroup_id.
944     */
945    if (gl_shader_stage_uses_workgroup(nir->info.stage) &&
946        nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
947       assert(nir->info.workgroup_size[0] % 2 == 0);
948       assert(nir->info.workgroup_size[1] % 2 == 0);
949       nir->info.workgroup_size[0] *= 2;
950       nir->info.workgroup_size[1] /= 2;
951    }
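   /* For example, a hypothetical 8x8x1 workgroup with quad derivatives is
    * launched as 16x4x1 on the hardware so that each 2x2 quad maps to four
    * consecutive invocations; load_local_invocation_id in
    * nak_nir_lower_system_values() undoes the swizzle.
    */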
952 
953    OPT(nir, nak_nir_lower_system_values, nak);
954 
955    switch (nir->info.stage) {
956    case MESA_SHADER_VERTEX:
957    case MESA_SHADER_TESS_CTRL:
958    case MESA_SHADER_TESS_EVAL:
959    case MESA_SHADER_GEOMETRY:
960       OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
961           type_size_vec4, nir_lower_io_lower_64bit_to_32_new);
962       OPT(nir, nir_opt_constant_folding);
963       OPT(nir, nak_nir_lower_vtg_io, nak);
964       if (nir->info.stage == MESA_SHADER_GEOMETRY)
965          OPT(nir, nak_nir_lower_gs_intrinsics);
966       break;
967 
968    case MESA_SHADER_FRAGMENT:
969       OPT(nir, nir_lower_indirect_derefs,
970           nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
971       OPT(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
972           type_size_vec4, nir_lower_io_lower_64bit_to_32_new);
973       OPT(nir, nir_opt_constant_folding);
974       OPT(nir, nak_nir_lower_fs_inputs, nak, fs_key);
975       OPT(nir, nak_nir_lower_fs_outputs);
976       break;
977 
978    case MESA_SHADER_COMPUTE:
979    case MESA_SHADER_KERNEL:
980       break;
981 
982    default:
983       unreachable("Unsupported shader stage");
984    }
985 
986    OPT(nir, nir_lower_doubles, NULL, nak->nir_options.lower_doubles_options);
987    OPT(nir, nir_lower_int64);
988 
989    nak_optimize_nir(nir, nak);
990 
991    do {
992       progress = false;
993       OPT(nir, nir_opt_algebraic_late);
994       OPT(nir, nak_nir_lower_algebraic_late, nak);
995 
996       /* If we're lowering fp64 sat but not min/max, the sat lowering may have
997        * been undone by nir_opt_algebraic.  Lower sat again just to be sure.
998        */
999       if ((nak->nir_options.lower_doubles_options & nir_lower_dsat) &&
1000           !(nak->nir_options.lower_doubles_options & nir_lower_dminmax))
1001          OPT(nir, nir_lower_doubles, NULL, nir_lower_dsat);
1002 
1003       if (progress) {
1004          OPT(nir, nir_opt_constant_folding);
1005          OPT(nir, nir_copy_prop);
1006          OPT(nir, nir_opt_dce);
1007          OPT(nir, nir_opt_cse);
1008       }
1009    } while (progress);
1010 
1011    if (nak->sm < 70)
1012       OPT(nir, nak_nir_split_64bit_conversions);
1013 
1014    nir_convert_to_lcssa(nir, true, true);
1015    nir_divergence_analysis(nir);
1016 
1017    if (nak->sm >= 75) {
1018       if (OPT(nir, nak_nir_lower_non_uniform_ldcx)) {
1019          OPT(nir, nir_copy_prop);
1020          OPT(nir, nir_opt_dce);
1021          nir_divergence_analysis(nir);
1022       }
1023    }
1024 
1025    OPT(nir, nak_nir_remove_barriers);
1026 
1027    if (nak->sm >= 70) {
1028       if (nak_should_print_nir()) {
1029          fprintf(stderr, "Structured NIR for %s shader:\n",
1030                  _mesa_shader_stage_to_string(nir->info.stage));
1031          nir_print_shader(nir, stderr);
1032       }
1033       OPT(nir, nak_nir_lower_cf);
1034    }
1035 
1036    /* Re-index blocks and compact SSA defs because we'll use them to index
1037     * arrays
1038     */
1039    nir_foreach_function(func, nir) {
1040       if (func->impl) {
1041          nir_index_blocks(func->impl);
1042          nir_index_ssa_defs(func->impl);
1043       }
1044    }
1045 
1046    if (nak_should_print_nir()) {
1047       fprintf(stderr, "NIR for %s shader:\n",
1048               _mesa_shader_stage_to_string(nir->info.stage));
1049       nir_print_shader(nir, stderr);
1050    }
1051 }
1052 
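/* Returns true if x is a compile-time constant that fits in a signed
 * `bits`-bit immediate field.
 */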
1053 static bool
1054 scalar_is_imm_int(nir_scalar x, unsigned bits)
1055 {
1056    if (!nir_scalar_is_const(x))
1057       return false;
1058 
1059    int64_t imm = nir_scalar_as_int(x);
1060    return u_intN_min(bits) <= imm && imm <= u_intN_max(bits);
1061 }
1062 
1063 struct nak_io_addr_offset
1064 nak_get_io_addr_offset(nir_def *addr, uint8_t imm_bits)
1065 {
1066    nir_scalar addr_s = {
1067       .def = addr,
1068       .comp = 0,
1069    };
1070    if (scalar_is_imm_int(addr_s, imm_bits)) {
1071       /* Base is a dumb name for this.  It should be offset */
1072       return (struct nak_io_addr_offset) {
1073          .offset = nir_scalar_as_int(addr_s),
1074       };
1075    }
1076 
1077    addr_s = nir_scalar_chase_movs(addr_s);
1078    if (!nir_scalar_is_alu(addr_s) ||
1079        nir_scalar_alu_op(addr_s) != nir_op_iadd) {
1080       return (struct nak_io_addr_offset) {
1081          .base = addr_s,
1082       };
1083    }
1084 
1085    for (unsigned i = 0; i < 2; i++) {
1086       nir_scalar off_s = nir_scalar_chase_alu_src(addr_s, i);
1087       off_s = nir_scalar_chase_movs(off_s);
1088       if (scalar_is_imm_int(off_s, imm_bits)) {
1089          return (struct nak_io_addr_offset) {
1090             .base = nir_scalar_chase_alu_src(addr_s, 1 - i),
1091             .offset = nir_scalar_as_int(off_s),
1092          };
1093       }
1094    }
1095 
1096    return (struct nak_io_addr_offset) {
1097       .base = addr_s,
1098    };
1099 }
1100