/*
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_gpu_info.h"
#include "ac_nir.h"
#include "ac_nir_helpers.h"
#include "sid.h"
#include "nir_builder.h"
#include "nir_xfb_info.h"

/* Sleep for the given number of clock cycles. */
void
ac_nir_sleep(nir_builder *b, unsigned num_cycles)
{
   /* s_sleep can only sleep for N*64 cycles. */
   if (num_cycles >= 64) {
      nir_sleep_amd(b, num_cycles / 64);
      num_cycles &= 63;
   }

   /* Use s_nop to sleep for the remaining cycles. */
   while (num_cycles) {
      unsigned nop_cycles = MIN2(num_cycles, 16);

      nir_nop_amd(b, nop_cycles - 1);
      num_cycles -= nop_cycles;
   }
}

/* Load the shader argument at index arg.arg_index + relative_index. */
nir_def *
ac_nir_load_arg_at_offset(nir_builder *b, const struct ac_shader_args *ac_args,
                          struct ac_arg arg, unsigned relative_index)
{
   unsigned arg_index = arg.arg_index + relative_index;
   unsigned num_components = ac_args->args[arg_index].size;

   if (ac_args->args[arg_index].file == AC_ARG_SGPR)
      return nir_load_scalar_arg_amd(b, num_components, .base = arg_index);
   else
      return nir_load_vector_arg_amd(b, num_components, .base = arg_index);
}

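/* Overwrite the value of the given shader argument. The assert below ensures this is only
 * emitted at top-level control flow.
 */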
void
ac_nir_store_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg,
                 nir_def *val)
{
   assert(nir_cursor_current_block(b->cursor)->cf_node.parent->type == nir_cf_node_function);

   if (ac_args->args[arg.arg_index].file == AC_ARG_SGPR)
      nir_store_scalar_arg_amd(b, val, .base = arg.arg_index);
   else
      nir_store_vector_arg_amd(b, val, .base = arg.arg_index);
}

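/* Load the given 32-bit scalar argument and extract the bitfield starting at rshift with the
 * given bit width, picking the cheapest opcode (plain load, AND, shift, or bitfield extract)
 * for the requested range.
 */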
nir_def *
ac_nir_unpack_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg,
                  unsigned rshift, unsigned bitwidth)
{
   nir_def *value = ac_nir_load_arg(b, ac_args, arg);
   if (rshift == 0 && bitwidth == 32)
      return value;
   else if (rshift == 0)
      return nir_iand_imm(b, value, BITFIELD_MASK(bitwidth));
   else if ((32 - rshift) <= bitwidth)
      return nir_ushr_imm(b, value, rshift);
   else
      return nir_ubfe_imm(b, value, rshift, bitwidth);
}

static bool
is_sin_cos(const nir_instr *instr, UNUSED const void *_)
{
   return instr->type == nir_instr_type_alu && (nir_instr_as_alu(instr)->op == nir_op_fsin ||
                                                nir_instr_as_alu(instr)->op == nir_op_fcos);
}

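/* The AMD sin/cos instructions expect their input in revolutions rather than radians, so
 * scale the source by ~1/(2*pi) (0.15915493667125702 is that constant rounded to float).
 */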
static nir_def *
lower_sin_cos(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
{
   nir_alu_instr *sincos = nir_instr_as_alu(instr);
   nir_def *src = nir_fmul_imm(b, nir_ssa_for_alu_src(b, sincos, 0), 0.15915493667125702);
   return sincos->op == nir_op_fsin ? nir_fsin_amd(b, src) : nir_fcos_amd(b, src);
}

bool
ac_nir_lower_sin_cos(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, is_sin_cos, lower_sin_cos, NULL);
}

typedef struct {
   const struct ac_shader_args *const args;
   const enum amd_gfx_level gfx_level;
   const enum ac_hw_stage hw_stage;
} lower_intrinsics_to_args_state;

static bool
lower_intrinsic_to_arg(nir_builder *b, nir_instr *instr, void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   lower_intrinsics_to_args_state *s = (lower_intrinsics_to_args_state *)state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   nir_def *replacement = NULL;
   b->cursor = nir_after_instr(&intrin->instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_subgroup_id: {
      if (s->hw_stage == AC_HW_COMPUTE_SHADER) {
         if (s->gfx_level >= GFX12)
            return false;

         assert(s->args->tg_size.used);

         if (s->gfx_level >= GFX10_3) {
            replacement = ac_nir_unpack_arg(b, s->args, s->args->tg_size, 20, 5);
         } else {
            /* GFX6-10 don't actually support a wave id, but we can
             * use the ordered id because ORDERED_APPEND_* is set to
             * zero in the compute dispatch initiator.
             */
            replacement = ac_nir_unpack_arg(b, s->args, s->args->tg_size, 6, 6);
         }
      } else if (s->hw_stage == AC_HW_HULL_SHADER && s->gfx_level >= GFX11) {
         assert(s->args->tcs_wave_id.used);
         replacement = ac_nir_unpack_arg(b, s->args, s->args->tcs_wave_id, 0, 3);
      } else if (s->hw_stage == AC_HW_LEGACY_GEOMETRY_SHADER ||
                 s->hw_stage == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
         assert(s->args->merged_wave_info.used);
         replacement = ac_nir_unpack_arg(b, s->args, s->args->merged_wave_info, 24, 4);
      } else {
         replacement = nir_imm_int(b, 0);
      }

      break;
   }
   case nir_intrinsic_load_num_subgroups: {
      if (s->hw_stage == AC_HW_COMPUTE_SHADER) {
         assert(s->args->tg_size.used);
         replacement = ac_nir_unpack_arg(b, s->args, s->args->tg_size, 0, 6);
      } else if (s->hw_stage == AC_HW_LEGACY_GEOMETRY_SHADER ||
                 s->hw_stage == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
         assert(s->args->merged_wave_info.used);
         replacement = ac_nir_unpack_arg(b, s->args, s->args->merged_wave_info, 28, 4);
      } else {
         replacement = nir_imm_int(b, 1);
      }

      break;
   }
   case nir_intrinsic_load_workgroup_id:
      if (b->shader->info.stage == MESA_SHADER_MESH) {
         /* This lowering is only valid with fast_launch = 2, otherwise we assume that
          * lower_workgroup_id_to_index removed any uses of the workgroup id by this point.
          */
         assert(s->gfx_level >= GFX11);
         nir_def *xy = ac_nir_load_arg(b, s->args, s->args->tess_offchip_offset);
         nir_def *z = ac_nir_load_arg(b, s->args, s->args->gs_attr_offset);
         replacement = nir_vec3(b, nir_extract_u16(b, xy, nir_imm_int(b, 0)),
                                nir_extract_u16(b, xy, nir_imm_int(b, 1)),
                                nir_extract_u16(b, z, nir_imm_int(b, 1)));
      } else {
         return false;
      }

      break;
   default:
      return false;
   }

   assert(replacement);
   nir_def_replace(&intrin->def, replacement);
   return true;
}

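/* Lower subgroup ID, subgroup count and (for mesh shaders) workgroup ID intrinsics to values
 * unpacked from the hardware shader arguments, depending on the gfx level and HW stage.
 */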
bool
ac_nir_lower_intrinsics_to_args(nir_shader *shader, const enum amd_gfx_level gfx_level,
                                const enum ac_hw_stage hw_stage,
                                const struct ac_shader_args *ac_args)
{
   lower_intrinsics_to_args_state state = {
      .gfx_level = gfx_level,
      .hw_stage = hw_stage,
      .args = ac_args,
   };

   return nir_shader_instructions_pass(shader, lower_intrinsic_to_arg,
                                       nir_metadata_control_flow, &state);
}

void
ac_nir_store_var_components(nir_builder *b, nir_variable *var, nir_def *value,
                            unsigned component, unsigned writemask)
{
   /* component store */
   if (value->num_components != 4) {
      nir_def *undef = nir_undef(b, 1, value->bit_size);

      /* add undef components before and after the value to form a vec4 */
      nir_def *comp[4];
      for (int i = 0; i < 4; i++) {
         comp[i] = (i >= component && i < component + value->num_components) ?
                   nir_channel(b, value, i - component) : undef;
      }

      value = nir_vec(b, comp, 4);
      writemask <<= component;
   } else {
      /* if num_components == 4, there should be no component offset */
      assert(component == 0);
   }

   nir_store_var(b, var, value, writemask);
}

/* Process the given store_output intrinsic and gather its information.
 * Meant to be used for VS/TES/GS when they are the last pre-rasterization stage.
 *
 * Assumptions:
 * - We called nir_lower_io_to_temporaries on the shader
 * - 64-bit outputs are lowered
 * - no indirect indexing is present
 */
void ac_nir_gather_prerast_store_output_info(nir_builder *b, nir_intrinsic_instr *intrin, ac_nir_prerast_out *out)
{
   assert(intrin->intrinsic == nir_intrinsic_store_output);
   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));

   const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
   const unsigned slot = io_sem.location;

   nir_def *store_val = intrin->src[0].ssa;
   assert(store_val->bit_size == 16 || store_val->bit_size == 32);

   nir_def **output;
   nir_alu_type *type;
   ac_nir_prerast_per_output_info *info;

   if (slot >= VARYING_SLOT_VAR0_16BIT) {
      const unsigned index = slot - VARYING_SLOT_VAR0_16BIT;

      if (io_sem.high_16bits) {
         output = out->outputs_16bit_hi[index];
         type = out->types_16bit_hi[index];
         info = &out->infos_16bit_hi[index];
      } else {
         output = out->outputs_16bit_lo[index];
         type = out->types_16bit_lo[index];
         info = &out->infos_16bit_lo[index];
      }
   } else {
      output = out->outputs[slot];
      type = out->types[slot];
      info = &out->infos[slot];
   }

   unsigned component_offset = nir_intrinsic_component(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   nir_alu_type src_type = nir_intrinsic_src_type(intrin);
   assert(nir_alu_type_get_type_size(src_type) == store_val->bit_size);

   b->cursor = nir_before_instr(&intrin->instr);

   /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
   const bool non_dedicated_16bit = slot < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;

   u_foreach_bit (i, write_mask) {
      const unsigned stream = (io_sem.gs_streams >> (i * 2)) & 0x3;

      if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
         if (!(b->shader->info.gs.active_stream_mask & (1 << stream)))
            continue;
      }

      const unsigned c = component_offset + i;

      /* The same output component should always belong to the same stream. */
      assert(!(info->components_mask & (1 << c)) ||
             ((info->stream >> (c * 2)) & 3) == stream);

      /* Components of the same output slot may belong to different streams. */
      info->stream |= stream << (c * 2);
      info->components_mask |= BITFIELD_BIT(c);

      nir_def *store_component = nir_channel(b, intrin->src[0].ssa, i);

      if (non_dedicated_16bit) {
         if (io_sem.high_16bits) {
            nir_def *lo = output[c] ? nir_unpack_32_2x16_split_x(b, output[c]) : nir_imm_intN_t(b, 0, 16);
            output[c] = nir_pack_32_2x16_split(b, lo, store_component);
         } else {
            nir_def *hi = output[c] ? nir_unpack_32_2x16_split_y(b, output[c]) : nir_imm_intN_t(b, 0, 16);
            output[c] = nir_pack_32_2x16_split(b, store_component, hi);
         }
         type[c] = nir_type_uint32;
      } else {
         output[c] = store_component;
         type[c] = src_type;
      }
   }
}

static nir_intrinsic_instr *
export(nir_builder *b, nir_def *val, nir_def *row, unsigned base, unsigned flags,
       unsigned write_mask)
{
   if (row) {
      return nir_export_row_amd(b, val, row, .base = base, .flags = flags,
                                .write_mask = write_mask);
   } else {
      return nir_export_amd(b, val, .base = base, .flags = flags,
                            .write_mask = write_mask);
   }
}

void
ac_nir_export_primitive(nir_builder *b, nir_def *prim, nir_def *row)
{
   unsigned write_mask = BITFIELD_MASK(prim->num_components);

   export(b, nir_pad_vec4(b, prim), row, V_008DFC_SQ_EXP_PRIM, AC_EXP_FLAG_DONE,
          write_mask);
}

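/* Assemble a vec4 export value from the per-component array, zero-extending each written
 * component to 32 bits and using undef for components that were never written.
 */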
static nir_def *
get_export_output(nir_builder *b, nir_def **output)
{
   nir_def *vec[4];
   for (int i = 0; i < 4; i++) {
      if (output[i])
         vec[i] = nir_u2uN(b, output[i], 32);
      else
         vec[i] = nir_undef(b, 1, 32);
   }

   return nir_vec(b, vec, 4);
}

static nir_def *
get_pos0_output(nir_builder *b, nir_def **output)
{
   /* Some applications don't write position but expect (0, 0, 0, 1)
    * so use that value instead of undef when it isn't written.
    */
   nir_def *vec[4] = {0};

   for (int i = 0; i < 4; i++) {
      if (output[i])
         vec[i] = nir_u2u32(b, output[i]);
      else
         vec[i] = nir_imm_float(b, i == 3 ? 1.0 : 0.0);
   }

   return nir_vec(b, vec, 4);
}

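/* Emit all position exports for the last pre-rasterization stage: the position itself, the
 * "miscellaneous" vec4 holding point size / edge flag / VRS rate / layer / viewport index,
 * and the clip/cull distance exports (including distances derived from a clip vertex).
 */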
void
ac_nir_export_position(nir_builder *b,
                       enum amd_gfx_level gfx_level,
                       uint32_t clip_cull_mask,
                       bool no_param_export,
                       bool force_vrs,
                       bool done,
                       uint64_t outputs_written,
                       nir_def *(*outputs)[4],
                       nir_def *row)
{
   nir_intrinsic_instr *exp[4];
   unsigned exp_num = 0;
   unsigned exp_pos_offset = 0;

   if (outputs_written & VARYING_BIT_POS) {
      /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
       * Setting valid_mask=1 prevents it and has no other effect.
       */
      const unsigned pos_flags = gfx_level == GFX10 ? AC_EXP_FLAG_VALID_MASK : 0;
      nir_def *pos = get_pos0_output(b, outputs[VARYING_SLOT_POS]);

      exp[exp_num] = export(b, pos, row, V_008DFC_SQ_EXP_POS + exp_num, pos_flags, 0xf);
      exp_num++;
   } else {
      exp_pos_offset++;
   }

   uint64_t mask =
      VARYING_BIT_PSIZ |
      VARYING_BIT_EDGE |
      VARYING_BIT_LAYER |
      VARYING_BIT_VIEWPORT |
      VARYING_BIT_PRIMITIVE_SHADING_RATE;

   /* Clear the bits of outputs that were never written. */
   if (!outputs[VARYING_SLOT_PSIZ][0])
      outputs_written &= ~VARYING_BIT_PSIZ;
   if (!outputs[VARYING_SLOT_EDGE][0])
      outputs_written &= ~VARYING_BIT_EDGE;
   if (!outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE][0])
      outputs_written &= ~VARYING_BIT_PRIMITIVE_SHADING_RATE;
   if (!outputs[VARYING_SLOT_LAYER][0])
      outputs_written &= ~VARYING_BIT_LAYER;
   if (!outputs[VARYING_SLOT_VIEWPORT][0])
      outputs_written &= ~VARYING_BIT_VIEWPORT;

   if ((outputs_written & mask) || force_vrs) {
      nir_def *zero = nir_imm_float(b, 0);
      nir_def *vec[4] = { zero, zero, zero, zero };
      unsigned write_mask = 0;

      if (outputs_written & VARYING_BIT_PSIZ) {
         vec[0] = outputs[VARYING_SLOT_PSIZ][0];
         write_mask |= BITFIELD_BIT(0);
      }

      if (outputs_written & VARYING_BIT_EDGE) {
         vec[1] = nir_umin(b, outputs[VARYING_SLOT_EDGE][0], nir_imm_int(b, 1));
         write_mask |= BITFIELD_BIT(1);
      }

      nir_def *rates = NULL;
      if (outputs_written & VARYING_BIT_PRIMITIVE_SHADING_RATE) {
         rates = outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE][0];
      } else if (force_vrs) {
         /* If Pos.W != 1 (typical for non-GUI elements), use coarse shading. */
         nir_def *pos_w = outputs[VARYING_SLOT_POS][3];
         pos_w = pos_w ? nir_u2u32(b, pos_w) : nir_imm_float(b, 1.0);
         nir_def *cond = nir_fneu_imm(b, pos_w, 1);
         rates = nir_bcsel(b, cond, nir_load_force_vrs_rates_amd(b), nir_imm_int(b, 0));
      }

      if (rates) {
         vec[1] = nir_ior(b, vec[1], rates);
         write_mask |= BITFIELD_BIT(1);
      }

      if (outputs_written & VARYING_BIT_LAYER) {
         vec[2] = outputs[VARYING_SLOT_LAYER][0];
         write_mask |= BITFIELD_BIT(2);
      }

      if (outputs_written & VARYING_BIT_VIEWPORT) {
         if (gfx_level >= GFX9) {
            /* GFX9 has the layer in [10:0] and the viewport index in [19:16]. */
            nir_def *v = nir_ishl_imm(b, outputs[VARYING_SLOT_VIEWPORT][0], 16);
            vec[2] = nir_ior(b, vec[2], v);
            write_mask |= BITFIELD_BIT(2);
         } else {
            vec[3] = outputs[VARYING_SLOT_VIEWPORT][0];
            write_mask |= BITFIELD_BIT(3);
         }
      }

      exp[exp_num] = export(b, nir_vec(b, vec, 4), row,
                            V_008DFC_SQ_EXP_POS + exp_num + exp_pos_offset,
                            0, write_mask);
      exp_num++;
   }

   for (int i = 0; i < 2; i++) {
      if ((outputs_written & (VARYING_BIT_CLIP_DIST0 << i)) &&
          (clip_cull_mask & BITFIELD_RANGE(i * 4, 4))) {
         exp[exp_num] = export(
            b, get_export_output(b, outputs[VARYING_SLOT_CLIP_DIST0 + i]), row,
            V_008DFC_SQ_EXP_POS + exp_num + exp_pos_offset, 0,
            (clip_cull_mask >> (i * 4)) & 0xf);
         exp_num++;
      }
   }

   if (outputs_written & VARYING_BIT_CLIP_VERTEX) {
      nir_def *vtx = get_export_output(b, outputs[VARYING_SLOT_CLIP_VERTEX]);

      /* Compute the clip distance from the clip vertex to each user clip plane. */
      nir_def *clip_dist[8] = {0};
      u_foreach_bit (i, clip_cull_mask) {
         nir_def *ucp = nir_load_user_clip_plane(b, .ucp_id = i);
         clip_dist[i] = nir_fdot4(b, vtx, ucp);
      }

      for (int i = 0; i < 2; i++) {
         if (clip_cull_mask & BITFIELD_RANGE(i * 4, 4)) {
            exp[exp_num] = export(
               b, get_export_output(b, clip_dist + i * 4), row,
               V_008DFC_SQ_EXP_POS + exp_num + exp_pos_offset, 0,
               (clip_cull_mask >> (i * 4)) & 0xf);
            exp_num++;
         }
      }
   }

   if (!exp_num)
      return;

   nir_intrinsic_instr *final_exp = exp[exp_num - 1];

   if (done) {
      /* Specify that this is the last export */
      const unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
      nir_intrinsic_set_flags(final_exp, final_exp_flags | AC_EXP_FLAG_DONE);
   }

   /* If a shader has no param exports, rasterization can start before
    * the shader finishes and thus memory stores might not finish before
    * the pixel shader starts.
    */
   if (gfx_level >= GFX10 && no_param_export && b->shader->info.writes_memory) {
      nir_cursor cursor = b->cursor;
      b->cursor = nir_before_instr(&final_exp->instr);
      nir_scoped_memory_barrier(b, SCOPE_DEVICE, NIR_MEMORY_RELEASE,
                                nir_var_mem_ssbo | nir_var_mem_global | nir_var_image);
      b->cursor = cursor;
   }
}

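/* Emit parameter exports for FS inputs. Slots that map to the same param export index
 * (possible with radeonsi's param_offsets[]) are only exported once, and 16-bit outputs
 * are packed into 32-bit exports.
 */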
void
ac_nir_export_parameters(nir_builder *b,
                         const uint8_t *param_offsets,
                         uint64_t outputs_written,
                         uint16_t outputs_written_16bit,
                         nir_def *(*outputs)[4],
                         nir_def *(*outputs_16bit_lo)[4],
                         nir_def *(*outputs_16bit_hi)[4])
{
   uint32_t exported_params = 0;

   u_foreach_bit64 (slot, outputs_written) {
      unsigned offset = param_offsets[slot];
      if (offset > AC_EXP_PARAM_OFFSET_31)
         continue;

      uint32_t write_mask = 0;
      for (int i = 0; i < 4; i++) {
         if (outputs[slot][i])
            write_mask |= BITFIELD_BIT(i);
      }

      /* No channel of this output slot was written, so skip the param export. */
      if (!write_mask)
         continue;

      /* Since param_offsets[] can map multiple varying slots to the same
       * param export index (that's radeonsi-specific behavior), we need to
       * do this so as not to emit duplicated exports.
       */
      if (exported_params & BITFIELD_BIT(offset))
         continue;

      nir_export_amd(
         b, get_export_output(b, outputs[slot]),
         .base = V_008DFC_SQ_EXP_PARAM + offset,
         .write_mask = write_mask);
      exported_params |= BITFIELD_BIT(offset);
   }

   u_foreach_bit (slot, outputs_written_16bit) {
      unsigned offset = param_offsets[VARYING_SLOT_VAR0_16BIT + slot];
      if (offset > AC_EXP_PARAM_OFFSET_31)
         continue;

      uint32_t write_mask = 0;
      for (int i = 0; i < 4; i++) {
         if (outputs_16bit_lo[slot][i] || outputs_16bit_hi[slot][i])
            write_mask |= BITFIELD_BIT(i);
      }

      /* No channel of this output slot was written, so skip the param export. */
      if (!write_mask)
         continue;

      /* Since param_offsets[] can map multiple varying slots to the same
       * param export index (that's radeonsi-specific behavior), we need to
       * do this so as not to emit duplicated exports.
       */
      if (exported_params & BITFIELD_BIT(offset))
         continue;

      nir_def *vec[4];
      nir_def *undef = nir_undef(b, 1, 16);
      for (int i = 0; i < 4; i++) {
         nir_def *lo = outputs_16bit_lo[slot][i] ? outputs_16bit_lo[slot][i] : undef;
         nir_def *hi = outputs_16bit_hi[slot][i] ? outputs_16bit_hi[slot][i] : undef;
         vec[i] = nir_pack_32_2x16_split(b, lo, hi);
      }

      nir_export_amd(
         b, nir_vec(b, vec, 4),
         .base = V_008DFC_SQ_EXP_PARAM + offset,
         .write_mask = write_mask);
      exported_params |= BITFIELD_BIT(offset);
   }
}

unsigned
ac_nir_map_io_location(unsigned location,
                       uint64_t mask,
                       ac_nir_map_io_driver_location map_io)
{
   /* Unlinked shaders:
    * We are unaware of the inputs of the next stage while lowering outputs.
    * The driver needs to pass a callback to map varyings to a fixed location.
    */
   if (map_io)
      return map_io(location);

   /* Linked shaders:
    * Take advantage of knowledge of the inputs of the next stage when lowering outputs.
    * Map varyings to a prefix sum of the IO mask to save space in LDS or VRAM.
    */
   assert(mask & BITFIELD64_BIT(location));
   return util_bitcount64(mask & BITFIELD64_MASK(location));
}

/**
 * This function takes an I/O intrinsic like load/store_input,
 * and emits a sequence that calculates the full offset of that instruction,
 * including a stride to the base and component offsets.
 */
nir_def *
ac_nir_calc_io_off(nir_builder *b,
                   nir_intrinsic_instr *intrin,
                   nir_def *base_stride,
                   unsigned component_stride,
                   unsigned mapped_driver_location)
{
   /* base is the driver_location, which is in slots (1 slot = 4x4 bytes) */
   nir_def *base_op = nir_imul_imm(b, base_stride, mapped_driver_location);

   /* offset should be interpreted in relation to the base,
    * so the instruction effectively reads/writes another input/output
    * when it has an offset
    */
   nir_def *offset_op = nir_imul(b, base_stride,
                                 nir_get_io_offset_src(intrin)->ssa);

   /* component is in bytes */
   unsigned const_op = nir_intrinsic_component(intrin) * component_stride;

   return nir_iadd_imm_nuw(b, nir_iadd_nuw(b, base_op, offset_op), const_op);
}

bool
ac_nir_lower_indirect_derefs(nir_shader *shader,
                             enum amd_gfx_level gfx_level)
{
   bool progress = false;

   /* Lower large variables to scratch first so that we won't bloat the
    * shader by generating large if ladders for them. We later lower
    * scratch to alloca's, assuming LLVM won't generate VGPR indexing.
    */
   NIR_PASS(progress, shader, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
            glsl_get_natural_size_align_bytes, glsl_get_natural_size_align_bytes);

   /* LLVM doesn't support VGPR indexing on GFX9. */
   bool llvm_has_working_vgpr_indexing = gfx_level != GFX9;

   /* TODO: Indirect indexing of GS inputs is unimplemented.
    *
    * TCS and TES load inputs directly from LDS or offchip memory, so
    * indirect indexing is trivial.
    */
   nir_variable_mode indirect_mask = 0;
   if (shader->info.stage == MESA_SHADER_GEOMETRY ||
       (shader->info.stage != MESA_SHADER_TESS_CTRL && shader->info.stage != MESA_SHADER_TESS_EVAL &&
        !llvm_has_working_vgpr_indexing)) {
      indirect_mask |= nir_var_shader_in;
   }
   if (!llvm_has_working_vgpr_indexing && shader->info.stage != MESA_SHADER_TESS_CTRL)
      indirect_mask |= nir_var_shader_out;

   /* TODO: We shouldn't need to do this, however LLVM isn't currently
    * smart enough to handle indirects without causing excess spilling
    * causing the gpu to hang.
    *
    * See the following thread for more details of the problem:
    * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html
    */
   indirect_mask |= nir_var_function_temp;

   NIR_PASS(progress, shader, nir_lower_indirect_derefs, indirect_mask, UINT32_MAX);
   return progress;
}

static nir_def **
get_output_and_type(ac_nir_prerast_out *out, unsigned slot, bool high_16bits,
                    nir_alu_type **types)
{
   nir_def **data;
   nir_alu_type *type;

   /* Only the VARYING_SLOT_VARn_16BIT slots need the output type in order to convert
    * 16-bit outputs to 32 bits. Vulkan doesn't allow streamout of outputs smaller than 32 bits.
    */
   if (slot < VARYING_SLOT_VAR0_16BIT) {
      data = out->outputs[slot];
      type = NULL;
   } else {
      unsigned index = slot - VARYING_SLOT_VAR0_16BIT;

      if (high_16bits) {
         data = out->outputs_16bit_hi[index];
         type = out->types_16bit_hi[index];
      } else {
         data = out->outputs_16bit_lo[index];
         type = out->types_16bit_lo[index];
      }
   }

   *types = type;
   return data;
}

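/* Emit the buffer stores for transform feedback: each thread that holds one of the vertices
 * selected for streamout writes the gathered outputs of the given stream to the streamout
 * buffers at its computed write offset.
 */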
static void
emit_streamout(nir_builder *b, unsigned stream, nir_xfb_info *info, ac_nir_prerast_out *out)
{
   nir_def *so_vtx_count = nir_ubfe_imm(b, nir_load_streamout_config_amd(b), 16, 7);
   nir_def *tid = nir_load_subgroup_invocation(b);

   nir_push_if(b, nir_ilt(b, tid, so_vtx_count));
   nir_def *so_write_index = nir_load_streamout_write_index_amd(b);

   nir_def *so_buffers[NIR_MAX_XFB_BUFFERS];
   nir_def *so_write_offset[NIR_MAX_XFB_BUFFERS];
   u_foreach_bit(i, info->buffers_written) {
      so_buffers[i] = nir_load_streamout_buffer_amd(b, i);

      unsigned stride = info->buffers[i].stride;
      nir_def *offset = nir_load_streamout_offset_amd(b, i);
      offset = nir_iadd(b, nir_imul_imm(b, nir_iadd(b, so_write_index, tid), stride),
                        nir_imul_imm(b, offset, 4));
      so_write_offset[i] = offset;
   }

   nir_def *undef = nir_undef(b, 1, 32);
   for (unsigned i = 0; i < info->output_count; i++) {
      const nir_xfb_output_info *output = info->outputs + i;
      if (stream != info->buffer_to_stream[output->buffer])
         continue;

      nir_alu_type *output_type;
      nir_def **output_data =
         get_output_and_type(out, output->location, output->high_16bits, &output_type);

      nir_def *vec[4] = {undef, undef, undef, undef};
      uint8_t mask = 0;
      u_foreach_bit(j, output->component_mask) {
         nir_def *data = output_data[j];

         if (data) {
            if (data->bit_size < 32) {
               /* we need output type to convert non-32bit output to 32bit */
               assert(output_type);

               nir_alu_type base_type = nir_alu_type_get_base_type(output_type[j]);
               data = nir_convert_to_bit_size(b, data, base_type, 32);
            }

            unsigned comp = j - output->component_offset;
            vec[comp] = data;
            mask |= 1 << comp;
         }
      }

      if (!mask)
         continue;

      unsigned buffer = output->buffer;
      nir_def *data = nir_vec(b, vec, util_last_bit(mask));
      nir_def *zero = nir_imm_int(b, 0);
      nir_store_buffer_amd(b, data, so_buffers[buffer], so_write_offset[buffer], zero, zero,
                           .base = output->offset, .write_mask = mask,
                           .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL);
   }

   nir_pop_if(b, NULL);
}

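/* Create a "GS copy shader", a small VS that runs after a legacy (non-NGG) GS. It reads the
 * GS outputs back from the GSVS ring buffer, performs streamout, and does the position and
 * parameter exports that the hardware GS stage cannot do itself.
 */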
nir_shader *
ac_nir_create_gs_copy_shader(const nir_shader *gs_nir,
                             enum amd_gfx_level gfx_level,
                             uint32_t clip_cull_mask,
                             const uint8_t *param_offsets,
                             bool has_param_exports,
                             bool disable_streamout,
                             bool kill_pointsize,
                             bool kill_layer,
                             bool force_vrs,
                             ac_nir_gs_output_info *output_info)
{
   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_VERTEX, gs_nir->options, "gs_copy");

   nir_foreach_shader_out_variable(var, gs_nir)
      nir_shader_add_variable(b.shader, nir_variable_clone(var, b.shader));

   b.shader->info.outputs_written = gs_nir->info.outputs_written;
   b.shader->info.outputs_written_16bit = gs_nir->info.outputs_written_16bit;

   nir_def *gsvs_ring = nir_load_ring_gsvs_amd(&b);

   nir_xfb_info *info = gs_nir->xfb_info;
   nir_def *stream_id = NULL;
   if (!disable_streamout && info)
      stream_id = nir_ubfe_imm(&b, nir_load_streamout_config_amd(&b), 24, 2);

   nir_def *vtx_offset = nir_imul_imm(&b, nir_load_vertex_id_zero_base(&b), 4);
   nir_def *zero = nir_imm_zero(&b, 1, 32);

   for (unsigned stream = 0; stream < 4; stream++) {
      if (stream > 0 && (!stream_id || !(info->streams_written & BITFIELD_BIT(stream))))
         continue;

      if (stream_id)
         nir_push_if(&b, nir_ieq_imm(&b, stream_id, stream));

      uint32_t offset = 0;
      ac_nir_prerast_out out = {0};
      if (output_info->types_16bit_lo)
         memcpy(&out.types_16bit_lo, output_info->types_16bit_lo, sizeof(out.types_16bit_lo));
      if (output_info->types_16bit_hi)
         memcpy(&out.types_16bit_hi, output_info->types_16bit_hi, sizeof(out.types_16bit_hi));

      u_foreach_bit64 (i, gs_nir->info.outputs_written) {
         u_foreach_bit (j, output_info->usage_mask[i]) {
            if (((output_info->streams[i] >> (j * 2)) & 0x3) != stream)
               continue;

            out.outputs[i][j] =
               nir_load_buffer_amd(&b, 1, 32, gsvs_ring, vtx_offset, zero, zero,
                                   .base = offset,
                                   .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL);

            /* clamp legacy color output */
            if (i == VARYING_SLOT_COL0 || i == VARYING_SLOT_COL1 ||
                i == VARYING_SLOT_BFC0 || i == VARYING_SLOT_BFC1) {
               nir_def *color = out.outputs[i][j];
               nir_def *clamp = nir_load_clamp_vertex_color_amd(&b);
               out.outputs[i][j] = nir_bcsel(&b, clamp, nir_fsat(&b, color), color);
            }

            offset += gs_nir->info.gs.vertices_out * 16 * 4;
         }
      }

      u_foreach_bit (i, gs_nir->info.outputs_written_16bit) {
         for (unsigned j = 0; j < 4; j++) {
            bool has_lo_16bit = (output_info->usage_mask_16bit_lo[i] & (1 << j)) &&
                                ((output_info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
            bool has_hi_16bit = (output_info->usage_mask_16bit_hi[i] & (1 << j)) &&
                                ((output_info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
            if (!has_lo_16bit && !has_hi_16bit)
               continue;

            nir_def *data =
               nir_load_buffer_amd(&b, 1, 32, gsvs_ring, vtx_offset, zero, zero,
                                   .base = offset,
                                   .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL);

            if (has_lo_16bit)
               out.outputs_16bit_lo[i][j] = nir_unpack_32_2x16_split_x(&b, data);

            if (has_hi_16bit)
               out.outputs_16bit_hi[i][j] = nir_unpack_32_2x16_split_y(&b, data);

            offset += gs_nir->info.gs.vertices_out * 16 * 4;
         }
      }

      if (stream_id)
         emit_streamout(&b, stream, info, &out);

      if (stream == 0) {
         uint64_t export_outputs = b.shader->info.outputs_written | VARYING_BIT_POS;
         if (kill_pointsize)
            export_outputs &= ~VARYING_BIT_PSIZ;
         if (kill_layer)
            export_outputs &= ~VARYING_BIT_LAYER;

         ac_nir_export_position(&b, gfx_level, clip_cull_mask, !has_param_exports,
                                force_vrs, true, export_outputs, out.outputs, NULL);

         if (has_param_exports) {
            ac_nir_export_parameters(&b, param_offsets,
                                     b.shader->info.outputs_written,
                                     b.shader->info.outputs_written_16bit,
                                     out.outputs,
                                     out.outputs_16bit_lo,
                                     out.outputs_16bit_hi);
         }
      }

      if (stream_id)
         nir_push_else(&b, NULL);
   }

   b.shader->info.clip_distance_array_size = gs_nir->info.clip_distance_array_size;
   b.shader->info.cull_distance_array_size = gs_nir->info.cull_distance_array_size;

   return b.shader;
}

static void
gather_outputs(nir_builder *b, nir_function_impl *impl, ac_nir_prerast_out *out)
{
   /* Assume:
    * - the shader used nir_lower_io_to_temporaries
    * - 64-bit outputs are lowered
    * - no indirect indexing is present
    */
   nir_foreach_block (block, impl) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_output)
            continue;

         ac_nir_gather_prerast_store_output_info(b, intrin, out);
         nir_instr_remove(instr);
      }
   }
}

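/* Lower the outputs of a shader running as a legacy (non-NGG) hardware VS (typically the API
 * VS or TES): gather all store_output intrinsics, emit streamout if needed, and emit the
 * position/parameter exports at the end of the shader.
 */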
void
ac_nir_lower_legacy_vs(nir_shader *nir,
                       enum amd_gfx_level gfx_level,
                       uint32_t clip_cull_mask,
                       const uint8_t *param_offsets,
                       bool has_param_exports,
                       bool export_primitive_id,
                       bool disable_streamout,
                       bool kill_pointsize,
                       bool kill_layer,
                       bool force_vrs)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_metadata preserved = nir_metadata_control_flow;

   nir_builder b = nir_builder_at(nir_after_impl(impl));

   ac_nir_prerast_out out = {0};
   gather_outputs(&b, impl, &out);
   b.cursor = nir_after_impl(impl);

   if (export_primitive_id) {
      /* When the primitive ID is read by FS, we must ensure that it's exported by the previous
       * vertex stage because it's implicit for VS or TES (but required by the Vulkan spec for GS
       * or MS).
       */
      out.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = nir_load_primitive_id(&b);

      /* Update outputs_written to reflect that the pass added a new output. */
      nir->info.outputs_written |= BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID);
   }

   if (!disable_streamout && nir->xfb_info) {
      emit_streamout(&b, 0, nir->xfb_info, &out);
      preserved = nir_metadata_none;
   }

   uint64_t export_outputs = nir->info.outputs_written | VARYING_BIT_POS;
   if (kill_pointsize)
      export_outputs &= ~VARYING_BIT_PSIZ;
   if (kill_layer)
      export_outputs &= ~VARYING_BIT_LAYER;

   ac_nir_export_position(&b, gfx_level, clip_cull_mask, !has_param_exports,
                          force_vrs, true, export_outputs, out.outputs, NULL);

   if (has_param_exports) {
      ac_nir_export_parameters(&b, param_offsets,
                               nir->info.outputs_written,
                               nir->info.outputs_written_16bit,
                               out.outputs,
                               out.outputs_16bit_lo,
                               out.outputs_16bit_hi);
   }

   nir_metadata_preserve(impl, preserved);
}

static nir_def *
ac_nir_accum_ior(nir_builder *b, nir_def *accum_result, nir_def *new_term)
{
   return accum_result ? nir_ior(b, accum_result, new_term) : new_term;
}

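/* Emit the code that accumulates the GS pipeline statistics and generated-primitive queries
 * by atomically adding the per-wave counts when any of those queries is enabled.
 * Returns true if any query code was emitted.
 */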
bool
ac_nir_gs_shader_query(nir_builder *b,
                       bool has_gen_prim_query,
                       bool has_gs_invocations_query,
                       bool has_gs_primitives_query,
                       unsigned num_vertices_per_primitive,
                       unsigned wave_size,
                       nir_def *vertex_count[4],
                       nir_def *primitive_count[4])
{
   nir_def *pipeline_query_enabled = NULL;
   nir_def *prim_gen_query_enabled = NULL;
   nir_def *any_query_enabled = NULL;

   if (has_gen_prim_query) {
      prim_gen_query_enabled = nir_load_prim_gen_query_enabled_amd(b);
      any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, prim_gen_query_enabled);
   }

   if (has_gs_invocations_query || has_gs_primitives_query) {
      pipeline_query_enabled = nir_load_pipeline_stat_query_enabled_amd(b);
      any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, pipeline_query_enabled);
   }

   if (!any_query_enabled) {
      /* No queries are used. */
      return false;
   }

   nir_if *if_shader_query = nir_push_if(b, any_query_enabled);

   nir_def *active_threads_mask = nir_ballot(b, 1, wave_size, nir_imm_true(b));
   nir_def *num_active_threads = nir_bit_count(b, active_threads_mask);

   /* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives.
    * GS emits points, line strips or triangle strips.
    * Real primitives are points, lines or triangles.
    */
   nir_def *num_prims_in_wave[4] = {0};
   u_foreach_bit (i, b->shader->info.gs.active_stream_mask) {
      assert(vertex_count[i] && primitive_count[i]);

      nir_scalar vtx_cnt = nir_get_scalar(vertex_count[i], 0);
      nir_scalar prm_cnt = nir_get_scalar(primitive_count[i], 0);

      if (nir_scalar_is_const(vtx_cnt) && nir_scalar_is_const(prm_cnt)) {
         unsigned gs_vtx_cnt = nir_scalar_as_uint(vtx_cnt);
         unsigned gs_prm_cnt = nir_scalar_as_uint(prm_cnt);
         unsigned total_prm_cnt = gs_vtx_cnt - gs_prm_cnt * (num_vertices_per_primitive - 1u);
         if (total_prm_cnt == 0)
            continue;

         num_prims_in_wave[i] = nir_imul_imm(b, num_active_threads, total_prm_cnt);
      } else {
         nir_def *gs_vtx_cnt = vtx_cnt.def;
         nir_def *gs_prm_cnt = prm_cnt.def;
         if (num_vertices_per_primitive > 1)
            gs_prm_cnt = nir_iadd(b, nir_imul_imm(b, gs_prm_cnt, -1u * (num_vertices_per_primitive - 1)), gs_vtx_cnt);
         num_prims_in_wave[i] = nir_reduce(b, gs_prm_cnt, .reduction_op = nir_op_iadd);
      }
   }

   /* Store the query results using atomic adds, from only the first active lane. */
   nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
   {
      if (has_gs_invocations_query || has_gs_primitives_query) {
         nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled);
         {
            nir_def *count = NULL;

            /* Add the primitive counts of all streams to the same counter. */
            for (int i = 0; i < 4; i++) {
               if (num_prims_in_wave[i]) {
                  if (count)
                     count = nir_iadd(b, count, num_prims_in_wave[i]);
                  else
                     count = num_prims_in_wave[i];
               }
            }

            if (has_gs_primitives_query && count)
               nir_atomic_add_gs_emit_prim_count_amd(b, count);

            if (has_gs_invocations_query)
               nir_atomic_add_shader_invocation_count_amd(b, num_active_threads);
         }
         nir_pop_if(b, if_pipeline_query);
      }

      if (has_gen_prim_query) {
         nir_if *if_prim_gen_query = nir_push_if(b, prim_gen_query_enabled);
         {
            /* Add to the counter of the corresponding stream. */
            for (int i = 0; i < 4; i++) {
               if (num_prims_in_wave[i])
                  nir_atomic_add_gen_prim_count_amd(b, num_prims_in_wave[i], .stream_id = i);
            }
         }
         nir_pop_if(b, if_prim_gen_query);
      }
   }
   nir_pop_if(b, if_first_lane);

   nir_pop_if(b, if_shader_query);
   return true;
}

typedef struct {
   nir_def *outputs[64][4];
   nir_def *outputs_16bit_lo[16][4];
   nir_def *outputs_16bit_hi[16][4];

   ac_nir_gs_output_info *info;

   nir_def *vertex_count[4];
   nir_def *primitive_count[4];
} lower_legacy_gs_state;

static bool
lower_legacy_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin,
                             lower_legacy_gs_state *s)
{
   /* Assume:
    * - the shader used nir_lower_io_to_temporaries
    * - 64-bit outputs are lowered
    * - no indirect indexing is present
    */
   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));

   b->cursor = nir_before_instr(&intrin->instr);

   unsigned component = nir_intrinsic_component(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);

   nir_def **outputs;
   if (sem.location < VARYING_SLOT_VAR0_16BIT) {
      outputs = s->outputs[sem.location];
   } else {
      unsigned index = sem.location - VARYING_SLOT_VAR0_16BIT;
      if (sem.high_16bits)
         outputs = s->outputs_16bit_hi[index];
      else
         outputs = s->outputs_16bit_lo[index];
   }

   nir_def *store_val = intrin->src[0].ssa;
   /* 64-bit outputs have already been lowered to 32 bits. */
   assert(store_val->bit_size <= 32);

   /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
   const bool non_dedicated_16bit = sem.location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;

   u_foreach_bit (i, write_mask) {
      unsigned comp = component + i;
      nir_def *store_component = nir_channel(b, store_val, i);

      if (non_dedicated_16bit) {
         if (sem.high_16bits) {
            nir_def *lo = outputs[comp] ? nir_unpack_32_2x16_split_x(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
            outputs[comp] = nir_pack_32_2x16_split(b, lo, store_component);
         } else {
            nir_def *hi = outputs[comp] ? nir_unpack_32_2x16_split_y(b, outputs[comp]) : nir_imm_intN_t(b, 0, 16);
            outputs[comp] = nir_pack_32_2x16_split(b, store_component, hi);
         }
      } else {
         outputs[comp] = store_component;
      }
   }

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
                                         lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);

   unsigned stream = nir_intrinsic_stream_id(intrin);
   nir_def *vtxidx = intrin->src[0].ssa;

   nir_def *gsvs_ring = nir_load_ring_gsvs_amd(b, .stream_id = stream);
   nir_def *soffset = nir_load_ring_gs2vs_offset_amd(b);

   unsigned offset = 0;
   u_foreach_bit64 (i, b->shader->info.outputs_written) {
      for (unsigned j = 0; j < 4; j++) {
         nir_def *output = s->outputs[i][j];
         /* The next emitted vertex needs new values, so reset all outputs. */
         s->outputs[i][j] = NULL;

         if (!(s->info->usage_mask[i] & (1 << j)) ||
             ((s->info->streams[i] >> (j * 2)) & 0x3) != stream)
            continue;

         unsigned base = offset * b->shader->info.gs.vertices_out * 4;
         offset++;

         /* This output wasn't written, so skip the buffer store. */
         if (!output)
            continue;

         nir_def *voffset = nir_ishl_imm(b, vtxidx, 2);

         /* extend 8/16 bit to 32 bit, 64 bit has been lowered */
         nir_def *data = nir_u2uN(b, output, 32);

         nir_store_buffer_amd(b, data, gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
                                        ACCESS_IS_SWIZZLED_AMD,
                              .base = base,
                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive */
                              .memory_modes = nir_var_shader_out);
      }
   }

   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
      for (unsigned j = 0; j < 4; j++) {
         nir_def *output_lo = s->outputs_16bit_lo[i][j];
         nir_def *output_hi = s->outputs_16bit_hi[i][j];
         /* The next emitted vertex needs new values, so reset all outputs. */
         s->outputs_16bit_lo[i][j] = NULL;
         s->outputs_16bit_hi[i][j] = NULL;

         bool has_lo_16bit = (s->info->usage_mask_16bit_lo[i] & (1 << j)) &&
                             ((s->info->streams_16bit_lo[i] >> (j * 2)) & 0x3) == stream;
         bool has_hi_16bit = (s->info->usage_mask_16bit_hi[i] & (1 << j)) &&
                             ((s->info->streams_16bit_hi[i] >> (j * 2)) & 0x3) == stream;
         if (!has_lo_16bit && !has_hi_16bit)
            continue;

         unsigned base = offset * b->shader->info.gs.vertices_out;
         offset++;

         bool has_lo_16bit_out = has_lo_16bit && output_lo;
         bool has_hi_16bit_out = has_hi_16bit && output_hi;

         /* None of the needed outputs were written, so skip the buffer store. */
         if (!has_lo_16bit_out && !has_hi_16bit_out)
            continue;

         if (!has_lo_16bit_out)
            output_lo = nir_undef(b, 1, 16);

         if (!has_hi_16bit_out)
            output_hi = nir_undef(b, 1, 16);

         nir_def *voffset = nir_iadd_imm(b, vtxidx, base);
         voffset = nir_ishl_imm(b, voffset, 2);

         nir_store_buffer_amd(b, nir_pack_32_2x16_split(b, output_lo, output_hi),
                              gsvs_ring, voffset, soffset, nir_imm_int(b, 0),
                              .access = ACCESS_COHERENT | ACCESS_NON_TEMPORAL |
                                        ACCESS_IS_SWIZZLED_AMD,
                              /* For ACO to not reorder this store around EmitVertex/EndPrimitive */
                              .memory_modes = nir_var_shader_out);
      }
   }

   /* Signal vertex emission. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8));

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_set_vertex_and_primitive_count(nir_builder *b, nir_intrinsic_instr *intrin,
                                               lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);

   unsigned stream = nir_intrinsic_stream_id(intrin);

   s->vertex_count[stream] = intrin->src[0].ssa;
   s->primitive_count[stream] = intrin->src[1].ssa;

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_end_primitive_with_counter(nir_builder *b, nir_intrinsic_instr *intrin,
                                           lower_legacy_gs_state *s)
{
   b->cursor = nir_before_instr(&intrin->instr);
   const unsigned stream = nir_intrinsic_stream_id(intrin);

   /* Signal primitive emission. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8));

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_legacy_gs_intrinsic(nir_builder *b, nir_instr *instr, void *state)
{
   lower_legacy_gs_state *s = (lower_legacy_gs_state *) state;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic == nir_intrinsic_store_output)
      return lower_legacy_gs_store_output(b, intrin, s);
   else if (intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter)
      return lower_legacy_gs_emit_vertex_with_counter(b, intrin, s);
   else if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter)
      return lower_legacy_gs_end_primitive_with_counter(b, intrin, s);
   else if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count)
      return lower_legacy_gs_set_vertex_and_primitive_count(b, intrin, s);

   return false;
}

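/* Lower a legacy (non-NGG) GS: turn store_output/emit_vertex/end_primitive into GSVS ring
 * stores and GS sendmsg messages, emit the shader queries, and signal GS completion.
 */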
void
ac_nir_lower_legacy_gs(nir_shader *nir,
                       bool has_gen_prim_query,
                       bool has_pipeline_stats_query,
                       ac_nir_gs_output_info *output_info)
{
   lower_legacy_gs_state s = {
      .info = output_info,
   };

   unsigned num_vertices_per_primitive = 0;
   switch (nir->info.gs.output_primitive) {
   case MESA_PRIM_POINTS:
      num_vertices_per_primitive = 1;
      break;
   case MESA_PRIM_LINE_STRIP:
      num_vertices_per_primitive = 2;
      break;
   case MESA_PRIM_TRIANGLE_STRIP:
      num_vertices_per_primitive = 3;
      break;
   default:
      unreachable("Invalid GS output primitive.");
      break;
   }

   nir_shader_instructions_pass(nir, lower_legacy_gs_intrinsic,
                                nir_metadata_control_flow, &s);

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_after_impl(impl));
   nir_builder *b = &builder;

   /* Emit the shader query code, which is shared between legacy and NGG GS. */
   bool progress = ac_nir_gs_shader_query(b,
                                          has_gen_prim_query,
                                          has_pipeline_stats_query,
                                          has_pipeline_stats_query,
                                          num_vertices_per_primitive,
                                          64,
                                          s.vertex_count,
                                          s.primitive_count);

   /* Wait for all stores to finish. */
   nir_barrier(b, .execution_scope = SCOPE_INVOCATION,
               .memory_scope = SCOPE_DEVICE,
               .memory_semantics = NIR_MEMORY_RELEASE,
               .memory_modes = nir_var_shader_out | nir_var_mem_ssbo |
                               nir_var_mem_global | nir_var_image);

   /* Signal that the GS is done. */
   nir_sendmsg_amd(b, nir_load_gs_wave_id_amd(b),
                   .base = AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE);

   if (progress)
      nir_metadata_preserve(impl, nir_metadata_none);
}

/* Shader logging function for printing nir_def values. The driver prints this after
 * command submission.
 *
 * Ring buffer layout: {uint32_t num_dwords; vec4; vec4; vec4; ... }
 * - The buffer size must be 2^N * 16 + 4
 * - num_dwords is incremented atomically and the ring wraps around, removing
 *   the oldest entries.
 */
void
ac_nir_store_debug_log_amd(nir_builder *b, nir_def *uvec4)
{
   nir_def *buf = nir_load_debug_log_desc_amd(b);
   nir_def *zero = nir_imm_int(b, 0);

   nir_def *max_index =
      nir_iadd_imm(b, nir_ushr_imm(b, nir_iadd_imm(b, nir_channel(b, buf, 2), -4), 4), -1);
   nir_def *index = nir_ssbo_atomic(b, 32, buf, zero, nir_imm_int(b, 1),
                                    .atomic_op = nir_atomic_op_iadd);
   index = nir_iand(b, index, max_index);
   nir_def *offset = nir_iadd_imm(b, nir_imul_imm(b, index, 16), 4);
   nir_store_buffer_amd(b, uvec4, buf, offset, zero, zero);
}

static bool
needs_rounding_mode_16_64(nir_instr *instr)
{
   if (instr->type != nir_instr_type_alu)
      return false;
   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->op == nir_op_fquantize2f16)
      return true;
   if (alu->def.bit_size != 16 && alu->def.bit_size != 64)
      return false;
   if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) != nir_type_float)
      return false;

   switch (alu->op) {
   case nir_op_f2f64:
   case nir_op_b2f64:
   case nir_op_f2f16_rtz:
   case nir_op_b2f16:
   case nir_op_fsat:
   case nir_op_fabs:
   case nir_op_fneg:
   case nir_op_fsign:
   case nir_op_ftrunc:
   case nir_op_fceil:
   case nir_op_ffloor:
   case nir_op_ffract:
   case nir_op_fround_even:
   case nir_op_fmin:
   case nir_op_fmax:
      return false;
   default:
      return true;
   }
}

static bool
can_use_fmamix(nir_scalar s, enum amd_gfx_level gfx_level)
{
   s = nir_scalar_chase_movs(s);
   if (!list_is_singular(&s.def->uses))
      return false;

   if (nir_scalar_is_intrinsic(s) &&
       nir_scalar_intrinsic_op(s) == nir_intrinsic_load_interpolated_input)
      return gfx_level >= GFX11;

   if (!nir_scalar_is_alu(s))
      return false;

   switch (nir_scalar_alu_op(s)) {
   case nir_op_fmul:
   case nir_op_ffma:
   case nir_op_fadd:
   case nir_op_fsub:
      return true;
   case nir_op_fsat:
      return can_use_fmamix(nir_scalar_chase_alu_src(s, 0), gfx_level);
   default:
      return false;
   }
}

static bool
split_pack_half(nir_builder *b, nir_instr *instr, void *param)
{
   enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)param;

   if (instr->type != nir_instr_type_alu)
      return false;
   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->op != nir_op_pack_half_2x16_rtz_split && alu->op != nir_op_pack_half_2x16_split)
      return false;

   nir_scalar s = nir_get_scalar(&alu->def, 0);

   if (!can_use_fmamix(nir_scalar_chase_alu_src(s, 0), gfx_level) ||
       !can_use_fmamix(nir_scalar_chase_alu_src(s, 1), gfx_level))
      return false;

   b->cursor = nir_before_instr(instr);

   /* Split pack_half into two f2f16 to create v_fma_mix{lo,hi}_f16
    * in the backend.
    */
   nir_def *lo = nir_f2f16(b, nir_ssa_for_alu_src(b, alu, 0));
   nir_def *hi = nir_f2f16(b, nir_ssa_for_alu_src(b, alu, 1));
   nir_def_replace(&alu->def, nir_pack_32_2x16_split(b, lo, hi));
   return true;
}

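/* Split pack_half_2x16(_rtz) into f2f16 conversions that the backend can fuse into
 * v_fma_mix{lo,hi}_f16 (GFX10+). The split conversions follow the FP16 rounding mode, so this
 * is only done when RTZ is already in effect, or when no instruction in the shader depends on
 * the FP16/FP64 rounding mode, in which case RTZ is forced below.
 */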
bool
ac_nir_opt_pack_half(nir_shader *shader, enum amd_gfx_level gfx_level)
{
   if (gfx_level < GFX10)
      return false;

   unsigned exec_mode = shader->info.float_controls_execution_mode;
   bool set_mode = false;
   if (!nir_is_rounding_mode_rtz(exec_mode, 16)) {
      nir_foreach_function_impl(impl, shader) {
         nir_foreach_block(block, impl) {
            nir_foreach_instr(instr, block) {
               if (needs_rounding_mode_16_64(instr))
                  return false;
            }
         }
      }
      set_mode = true;
   }

   bool progress = nir_shader_instructions_pass(shader, split_pack_half,
                                                nir_metadata_control_flow,
                                                &gfx_level);

   if (set_mode && progress) {
      exec_mode &= ~(FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
      exec_mode |= FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64;
      shader->info.float_controls_execution_mode = exec_mode;
   }
   return progress;
}

nir_def *
ac_average_samples(nir_builder *b, nir_def **samples, unsigned num_samples)
{
   /* This works like add-reduce by computing the sum of each pair independently, and then
    * computing the sum of each pair of sums, and so on, to get better instruction-level
    * parallelism.
    */
   if (num_samples == 16) {
      for (unsigned i = 0; i < 8; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 8) {
      for (unsigned i = 0; i < 4; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 4) {
      for (unsigned i = 0; i < 2; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 2)
      samples[0] = nir_fadd(b, samples[0], samples[1]);

   return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */
}

void
ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder *b,
                                   nir_def **array, unsigned num_elements,
                                   unsigned num_components)
{
   /* We use the optimization barrier to force LLVM to form VMEM clauses by constraining its
    * instruction scheduling options.
    *
    * VMEM clauses are supported since GFX10. It's not recommended to use the optimization
    * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization
    * barriers would unnecessarily increase VGPR usage for MSAA resources.
    */
   if (!b->shader->info.use_aco_amd && info->gfx_level >= GFX10) {
      for (unsigned i = 0; i < num_elements; i++) {
         unsigned prev_num = array[i]->num_components;
         array[i] = nir_trim_vector(b, array[i], num_components);
         array[i] = nir_optimization_barrier_vgpr_amd(b, array[i]->bit_size, array[i]);
         array[i] = nir_pad_vector(b, array[i], prev_num);
      }
   }
}

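/* Compute the global invocation ID for the first num_components dimensions:
 * workgroup_id * workgroup_size + local_invocation_id, optionally in 16 bits.
 */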
nir_def *
ac_get_global_ids(nir_builder *b, unsigned num_components, unsigned bit_size)
{
   unsigned mask = BITFIELD_MASK(num_components);

   nir_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask);
   nir_def *block_ids = nir_channels(b, nir_load_workgroup_id(b), mask);
   nir_def *block_size = nir_channels(b, nir_load_workgroup_size(b), mask);

   assert(bit_size == 32 || bit_size == 16);
   if (bit_size == 16) {
      local_ids = nir_i2iN(b, local_ids, bit_size);
      block_ids = nir_i2iN(b, block_ids, bit_size);
      block_size = nir_i2iN(b, block_size, bit_size);
   }

   return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids);
}

unsigned
ac_nir_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer)
{
   switch (consumer->info.stage) {
   case MESA_SHADER_TESS_CTRL:
      /* VS->TCS
       * Non-amplifying shaders can always have their varying expressions
       * moved into later shaders.
       */
      return UINT_MAX;

   case MESA_SHADER_GEOMETRY:
      /* VS->GS, TES->GS */
      return consumer->info.gs.vertices_in == 1 ? UINT_MAX :
             consumer->info.gs.vertices_in == 2 ? 20 : 14;

   case MESA_SHADER_TESS_EVAL:
      /* TCS->TES and VS->TES (OpenGL only) */
   case MESA_SHADER_FRAGMENT:
      /* Up to 3 uniforms and 5 ALUs. */
      return 14;

   default:
      unreachable("unexpected shader stage");
   }
}

unsigned
ac_nir_varying_estimate_instr_cost(nir_instr *instr)
{
   unsigned dst_bit_size, src_bit_size, num_dst_dwords;
   nir_op alu_op;

   /* This is a very loose approximation based on gfx10. */
   switch (instr->type) {
   case nir_instr_type_alu:
      dst_bit_size = nir_instr_as_alu(instr)->def.bit_size;
      src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size;
      alu_op = nir_instr_as_alu(instr)->op;
      num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);

      switch (alu_op) {
      case nir_op_mov:
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_vec5:
      case nir_op_vec8:
      case nir_op_vec16:
      case nir_op_fabs:
      case nir_op_fneg:
      case nir_op_fsat:
         return 0;

      case nir_op_imul:
      case nir_op_umul_low:
         return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords;

      case nir_op_imul_high:
      case nir_op_umul_high:
      case nir_op_imul_2x32_64:
      case nir_op_umul_2x32_64:
         return 4;

      case nir_op_fexp2:
      case nir_op_flog2:
      case nir_op_frcp:
      case nir_op_frsq:
      case nir_op_fsqrt:
      case nir_op_fsin:
      case nir_op_fcos:
      case nir_op_fsin_amd:
      case nir_op_fcos_amd:
         return 4; /* FP16 & FP32. */

      case nir_op_fpow:
         return 4 + 1 + 4; /* log2 + mul + exp2 */

      case nir_op_fsign:
         return dst_bit_size == 64 ? 4 : 3; /* See ac_build_fsign. */

      case nir_op_idiv:
      case nir_op_udiv:
      case nir_op_imod:
      case nir_op_umod:
      case nir_op_irem:
         return dst_bit_size == 64 ? 80 : 40;

      case nir_op_fdiv:
         return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */

      case nir_op_fmod:
      case nir_op_frem:
         return dst_bit_size == 64 ? 80 : 8;

      default:
         /* Double opcodes. Comparisons always run at full rate. */
         if ((dst_bit_size == 64 &&
              nir_op_infos[alu_op].output_type & nir_type_float) ||
             (dst_bit_size >= 8 && src_bit_size == 64 &&
              nir_op_infos[alu_op].input_types[0] & nir_type_float))
            return 16;

         return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32);
      }

   case nir_instr_type_intrinsic:
      dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size;
      num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);

      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_load_deref:
         /* Uniform or UBO load.
          * Set a low cost to balance the number of scalar loads and ALUs.
          */
         return 3 * num_dst_dwords;

      default:
         unreachable("unexpected intrinsic");
      }

   default:
      unreachable("unexpected instr type");
   }
}