1 /* -*- mesa-c++ -*-
2 * Copyright 2022 Collabora LTD
3 * Author: Gert Wollny <[email protected]>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "sfn_shader_fs.h"
8
9 #include "sfn_debug.h"
10 #include "sfn_instr_alugroup.h"
11 #include "sfn_instr_export.h"
12 #include "sfn_instr_fetch.h"
13 #include "sfn_instr_tex.h"
14
15 #include <sstream>
16
17 namespace r600 {
18
19 using std::string;
20
FragmentShader(const r600_shader_key & key)21 FragmentShader::FragmentShader(const r600_shader_key& key):
22 Shader("FS", key.ps.first_atomic_counter),
23 m_dual_source_blend(key.ps.dual_source_blend),
24 m_max_color_exports(MAX2(key.ps.nr_cbufs, 1)),
25 m_pos_input(127, false),
26 m_fs_write_all(false),
27 m_apply_sample_mask(key.ps.apply_sample_id_mask),
28 m_rat_base(key.ps.nr_cbufs),
29 m_image_size_const_offset(key.ps.image_size_const_offset)
30 {
31 }
32
33 void
do_get_shader_info(r600_shader * sh_info)34 FragmentShader::do_get_shader_info(r600_shader *sh_info)
35 {
36 sh_info->processor_type = PIPE_SHADER_FRAGMENT;
37
38 sh_info->ps_color_export_mask = m_color_export_mask;
39 sh_info->ps_export_highest = m_export_highest;
40 sh_info->nr_ps_color_exports = m_num_color_exports;
41
42 sh_info->fs_write_all = m_fs_write_all;
43
44 sh_info->rat_base = m_rat_base;
45 sh_info->uses_kill = m_uses_discard;
46 sh_info->gs_prim_id_input = m_gs_prim_id_input;
47 sh_info->nsys_inputs = m_nsys_inputs;
48 sh_info->uses_helper_invocation = m_helper_invocation != nullptr;
49 }
50
51 bool
load_input(nir_intrinsic_instr * intr)52 FragmentShader::load_input(nir_intrinsic_instr *intr)
53 {
54 auto& vf = value_factory();
55
56 auto location = nir_intrinsic_io_semantics(intr).location;
57 if (location == VARYING_SLOT_POS) {
58 AluInstr *ir = nullptr;
59 for (unsigned i = 0; i < intr->def.num_components; ++i) {
60 ir = new AluInstr(op1_mov,
61 vf.dest(intr->def, i, pin_none),
62 m_pos_input[i],
63 AluInstr::write);
64 emit_instruction(ir);
65 }
66 ir->set_alu_flag(alu_last_instr);
67 return true;
68 }
69
70 if (location == VARYING_SLOT_FACE) {
71 auto ir = new AluInstr(op2_setgt_dx10,
72 vf.dest(intr->def, 0, pin_none),
73 m_face_input,
74 vf.inline_const(ALU_SRC_0, 0),
75 AluInstr::last_write);
76 emit_instruction(ir);
77 return true;
78 }
79
80 return load_input_hw(intr);
81 }
82
83 bool
store_output(nir_intrinsic_instr * intr)84 FragmentShader::store_output(nir_intrinsic_instr *intr)
85 {
86 auto location = nir_intrinsic_io_semantics(intr).location;
87
88 if (location == FRAG_RESULT_COLOR && !m_dual_source_blend) {
89 m_fs_write_all = true;
90 }
91
92 return emit_export_pixel(*intr);
93 }
94
95 unsigned
barycentric_ij_index(nir_intrinsic_instr * intr)96 barycentric_ij_index(nir_intrinsic_instr *intr)
97 {
98 unsigned index = 0;
99 switch (intr->intrinsic) {
100 case nir_intrinsic_load_barycentric_sample:
101 index = 0;
102 break;
103 case nir_intrinsic_load_barycentric_at_sample:
104 case nir_intrinsic_load_barycentric_at_offset:
105 case nir_intrinsic_load_barycentric_pixel:
106 index = 1;
107 break;
108 case nir_intrinsic_load_barycentric_centroid:
109 index = 2;
110 break;
111 default:
112 unreachable("Unknown interpolator intrinsic");
113 }
114
115 switch (nir_intrinsic_interp_mode(intr)) {
116 case INTERP_MODE_NONE:
117 case INTERP_MODE_SMOOTH:
118 return index;
119 case INTERP_MODE_NOPERSPECTIVE:
120 return index + 3;
121 case INTERP_MODE_FLAT:
122 case INTERP_MODE_EXPLICIT:
123 default:
124 unreachable("unknown/unsupported mode for load_interpolated");
125 }
126 return 0;
127 }
128
129 bool
process_stage_intrinsic(nir_intrinsic_instr * intr)130 FragmentShader::process_stage_intrinsic(nir_intrinsic_instr *intr)
131 {
132 if (process_stage_intrinsic_hw(intr))
133 return true;
134
135 switch (intr->intrinsic) {
136 case nir_intrinsic_load_input:
137 return load_input(intr);
138 case nir_intrinsic_load_interpolated_input:
139 return load_interpolated_input(intr);
140 case nir_intrinsic_terminate_if:
141 m_uses_discard = true;
142 emit_instruction(new AluInstr(op2_killne_int,
143 nullptr,
144 value_factory().src(intr->src[0], 0),
145 value_factory().zero(),
146 {AluInstr::last}));
147
148 return true;
149 case nir_intrinsic_terminate:
150 m_uses_discard = true;
151 emit_instruction(new AluInstr(op2_kille_int,
152 nullptr,
153 value_factory().zero(),
154 value_factory().zero(),
155 {AluInstr::last}));
156 return true;
157 case nir_intrinsic_load_sample_mask_in:
158 if (m_apply_sample_mask) {
159 return emit_load_sample_mask_in(intr);
160 } else
161 return emit_simple_mov(intr->def, 0, m_sample_mask_reg);
162 case nir_intrinsic_load_sample_id:
163 return emit_simple_mov(intr->def, 0, m_sample_id_reg);
164 case nir_intrinsic_load_helper_invocation:
165 return emit_load_helper_invocation(intr);
166 case nir_intrinsic_load_sample_pos:
167 return emit_load_sample_pos(intr);
168 default:
169 return false;
170 }
171 }
172
173 bool
load_interpolated_input(nir_intrinsic_instr * intr)174 FragmentShader::load_interpolated_input(nir_intrinsic_instr *intr)
175 {
176 auto& vf = value_factory();
177 unsigned loc = nir_intrinsic_io_semantics(intr).location;
178 switch (loc) {
179 case VARYING_SLOT_POS:
180 for (unsigned i = 0; i < intr->def.num_components; ++i)
181 vf.inject_value(intr->def, i, m_pos_input[i]);
182 return true;
183 case VARYING_SLOT_FACE:
184 return false;
185 default:;
186 }
187
188 return load_interpolated_input_hw(intr);
189 }
190
191 int
do_allocate_reserved_registers()192 FragmentShader::do_allocate_reserved_registers()
193 {
194 int next_register = allocate_interpolators_or_inputs();
195
196 if (m_sv_values.test(es_pos)) {
197 set_input_gpr(m_pos_driver_loc, next_register);
198 m_pos_input = value_factory().allocate_pinned_vec4(next_register++, false);
199 }
200
201 int face_reg_index = -1;
202 if (m_sv_values.test(es_face)) {
203 set_input_gpr(m_face_driver_loc, next_register);
204 face_reg_index = next_register++;
205 m_face_input = value_factory().allocate_pinned_register(face_reg_index, 0);
206 }
207
208 if (m_sv_values.test(es_sample_mask_in)) {
209 if (face_reg_index < 0)
210 face_reg_index = next_register++;
211 m_sample_mask_reg = value_factory().allocate_pinned_register(face_reg_index, 2);
212 sfn_log << SfnLog::io << "Set sample mask in register to " << *m_sample_mask_reg
213 << "\n";
214 m_nsys_inputs = 1;
215 ShaderInput input(ninputs());
216 input.set_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN);
217 input.set_gpr(face_reg_index);
218 add_input(input);
219 }
220
221 if (m_sv_values.test(es_sample_id) || m_sv_values.test(es_sample_mask_in)) {
222 int sample_id_reg = next_register++;
223 m_sample_id_reg = value_factory().allocate_pinned_register(sample_id_reg, 3);
224 sfn_log << SfnLog::io << "Set sample id register to " << *m_sample_id_reg << "\n";
225 m_nsys_inputs++;
226 ShaderInput input(ninputs());
227 input.set_system_value(SYSTEM_VALUE_SAMPLE_ID);
228 input.set_gpr(sample_id_reg);
229 add_input(input);
230 }
231
232 if (m_sv_values.test(es_helper_invocation)) {
233 m_helper_invocation = value_factory().temp_register(0, false);
234 }
235
236 return next_register;
237 }
238
239 bool
do_scan_instruction(nir_instr * instr)240 FragmentShader::do_scan_instruction(nir_instr *instr)
241 {
242 if (instr->type != nir_instr_type_intrinsic)
243 return false;
244
245 auto intr = nir_instr_as_intrinsic(instr);
246 switch (intr->intrinsic) {
247 case nir_intrinsic_load_barycentric_pixel:
248 case nir_intrinsic_load_barycentric_sample:
249 case nir_intrinsic_load_barycentric_at_sample:
250 case nir_intrinsic_load_barycentric_at_offset:
251 case nir_intrinsic_load_barycentric_centroid:
252 m_interpolators_used.set(barycentric_ij_index(intr));
253 break;
254 case nir_intrinsic_load_front_face:
255 m_sv_values.set(es_face);
256 break;
257 case nir_intrinsic_load_sample_mask_in:
258 m_sv_values.set(es_sample_mask_in);
259 break;
260 case nir_intrinsic_load_sample_pos:
261 m_sv_values.set(es_sample_pos);
262 FALLTHROUGH;
263 case nir_intrinsic_load_sample_id:
264 m_sv_values.set(es_sample_id);
265 break;
266 case nir_intrinsic_load_helper_invocation:
267 m_sv_values.set(es_helper_invocation);
268 break;
269 case nir_intrinsic_load_input:
270 return scan_input(intr, 0);
271 case nir_intrinsic_load_interpolated_input:
272 return scan_input(intr, 1);
273 default:
274 return false;
275 }
276 return true;
277 }
278
279 bool
emit_load_sample_mask_in(nir_intrinsic_instr * instr)280 FragmentShader::emit_load_sample_mask_in(nir_intrinsic_instr *instr)
281 {
282 auto& vf = value_factory();
283 auto dest = vf.dest(instr->def, 0, pin_free);
284 auto tmp = vf.temp_register();
285 assert(m_sample_id_reg);
286 assert(m_sample_mask_reg);
287
288 emit_instruction(
289 new AluInstr(op2_lshl_int, tmp, vf.one_i(), m_sample_id_reg, AluInstr::last_write));
290 emit_instruction(
291 new AluInstr(op2_and_int, dest, tmp, m_sample_mask_reg, AluInstr::last_write));
292 return true;
293 }
294
295 bool
emit_load_helper_invocation(nir_intrinsic_instr * instr)296 FragmentShader::emit_load_helper_invocation(nir_intrinsic_instr *instr)
297 {
298 assert(m_helper_invocation);
299 auto& vf = value_factory();
300 emit_instruction(
301 new AluInstr(op1_mov, m_helper_invocation, vf.literal(-1), AluInstr::last_write));
302 RegisterVec4 destvec{m_helper_invocation, nullptr, nullptr, nullptr, pin_group};
303
304 auto vtx = new LoadFromBuffer(destvec,
305 {4, 7, 7, 7},
306 m_helper_invocation,
307 0,
308 R600_BUFFER_INFO_CONST_BUFFER,
309 nullptr,
310 fmt_32_32_32_32_float);
311 vtx->set_fetch_flag(FetchInstr::vpm);
312 vtx->set_fetch_flag(FetchInstr::use_tc);
313 vtx->set_always_keep();
314 auto dst = value_factory().dest(instr->def, 0, pin_free);
315 auto ir = new AluInstr(op1_mov, dst, m_helper_invocation, AluInstr::last_write);
316 ir->add_required_instr(vtx);
317 emit_instruction(vtx);
318 emit_instruction(ir);
319
320 return true;
321 }
322
323 bool
scan_input(nir_intrinsic_instr * intr,int index_src_id)324 FragmentShader::scan_input(nir_intrinsic_instr *intr, int index_src_id)
325 {
326 auto index = nir_src_as_const_value(intr->src[index_src_id]);
327 assert(index);
328
329 const unsigned location_offset = chip_class() < ISA_CC_EVERGREEN ? 32 : 0;
330 bool uses_interpol_at_centroid = false;
331
332 auto location =
333 static_cast<gl_varying_slot>(nir_intrinsic_io_semantics(intr).location + index->u32);
334 unsigned driver_location = nir_intrinsic_base(intr) + index->u32;
335
336 if (location == VARYING_SLOT_POS) {
337 m_sv_values.set(es_pos);
338 m_pos_driver_loc = driver_location + location_offset;
339 ShaderInput pos_input(m_pos_driver_loc, location);
340 pos_input.set_interpolator(TGSI_INTERPOLATE_LINEAR,
341 TGSI_INTERPOLATE_LOC_CENTER,
342 false);
343 add_input(pos_input);
344 return true;
345 }
346
347 if (location == VARYING_SLOT_FACE) {
348 m_sv_values.set(es_face);
349 m_face_driver_loc = driver_location + location_offset;
350 ShaderInput face_input(m_face_driver_loc, location);
351 add_input(face_input);
352 return true;
353 }
354
355 tgsi_interpolate_mode tgsi_interpolate = TGSI_INTERPOLATE_CONSTANT;
356 tgsi_interpolate_loc tgsi_loc = TGSI_INTERPOLATE_LOC_CENTER;
357
358 const bool is_color =
359 (location >= VARYING_SLOT_COL0 && location <= VARYING_SLOT_COL1) ||
360 (location >= VARYING_SLOT_BFC0 && location <= VARYING_SLOT_BFC1);
361
362 if (index_src_id > 0) {
363 glsl_interp_mode mode = INTERP_MODE_NONE;
364 auto parent = nir_instr_as_intrinsic(intr->src[0].ssa->parent_instr);
365 mode = (glsl_interp_mode)nir_intrinsic_interp_mode(parent);
366 switch (parent->intrinsic) {
367 case nir_intrinsic_load_barycentric_sample:
368 tgsi_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
369 break;
370 case nir_intrinsic_load_barycentric_at_sample:
371 case nir_intrinsic_load_barycentric_at_offset:
372 case nir_intrinsic_load_barycentric_pixel:
373 tgsi_loc = TGSI_INTERPOLATE_LOC_CENTER;
374 break;
375 case nir_intrinsic_load_barycentric_centroid:
376 tgsi_loc = TGSI_INTERPOLATE_LOC_CENTROID;
377 uses_interpol_at_centroid = true;
378 break;
379 default:
380 std::cerr << "Instruction " << nir_intrinsic_infos[parent->intrinsic].name
381 << " as parent of " << nir_intrinsic_infos[intr->intrinsic].name
382 << " interpolator?\n";
383 assert(0);
384 }
385
386 switch (mode) {
387 case INTERP_MODE_NONE:
388 if (is_color) {
389 tgsi_interpolate = TGSI_INTERPOLATE_COLOR;
390 break;
391 }
392 FALLTHROUGH;
393 case INTERP_MODE_SMOOTH:
394 tgsi_interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
395 break;
396 case INTERP_MODE_NOPERSPECTIVE:
397 tgsi_interpolate = TGSI_INTERPOLATE_LINEAR;
398 break;
399 case INTERP_MODE_FLAT:
400 break;
401 case INTERP_MODE_EXPLICIT:
402 default:
403 assert(0);
404 }
405 }
406
407 if (location == VARYING_SLOT_PRIMITIVE_ID) {
408 m_gs_prim_id_input = true;
409 } else if (!(is_color || (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_MAX) ||
410 (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7) ||
411 (location >= VARYING_SLOT_CLIP_DIST0 && location <= VARYING_SLOT_CLIP_DIST1) ||
412 location == VARYING_SLOT_FOGC || location == VARYING_SLOT_LAYER ||
413 location == VARYING_SLOT_PNTC || location == VARYING_SLOT_VIEWPORT)) {
414 return false;
415 }
416
417 sfn_log << SfnLog::io << " have IO at " << driver_location << "\n";
418 auto iinput = find_input(driver_location);
419 if (iinput == input_not_found()) {
420 ShaderInput input(driver_location, location);
421 input.set_need_lds_pos();
422 input.set_interpolator(tgsi_interpolate, tgsi_loc, uses_interpol_at_centroid);
423 sfn_log << SfnLog::io << "add IO with LDS ID at " << input.location() << "\n";
424 add_input(input);
425 assert(find_input(input.location()) != input_not_found());
426 } else {
427 if (uses_interpol_at_centroid) {
428 iinput->second.set_uses_interpolate_at_centroid();
429 }
430 }
431 return true;
432 }
433
434 bool
emit_export_pixel(nir_intrinsic_instr & intr)435 FragmentShader::emit_export_pixel(nir_intrinsic_instr& intr)
436 {
437 RegisterVec4::Swizzle swizzle;
438 auto semantics = nir_intrinsic_io_semantics(&intr);
439 unsigned driver_location = nir_intrinsic_base(&intr);
440 unsigned write_mask = nir_intrinsic_write_mask(&intr);
441
442 switch (semantics.location) {
443 case FRAG_RESULT_DEPTH:
444 swizzle = {0, 7, 7, 7};
445 break;
446 case FRAG_RESULT_STENCIL:
447 swizzle = {7, 0, 7, 7};
448 break;
449 case FRAG_RESULT_SAMPLE_MASK:
450 swizzle = {7, 7, 0, 7};
451 break;
452 default:
453 for (int i = 0; i < 4; ++i) {
454 swizzle[i] = (1 << i) & write_mask ? i : 7;
455 }
456 }
457
458 auto value = value_factory().src_vec4(intr.src[0], pin_group, swizzle);
459
460 if (semantics.location == FRAG_RESULT_COLOR ||
461 (semantics.location >= FRAG_RESULT_DATA0 &&
462 semantics.location <= FRAG_RESULT_DATA7)) {
463
464 ShaderOutput output(driver_location, write_mask);
465 output.set_frag_result(static_cast<gl_frag_result>(semantics.location));
466 add_output(output);
467
468 unsigned color_outputs =
469 m_fs_write_all && chip_class() >= ISA_CC_R700 ? m_max_color_exports : 1;
470
471 for (unsigned k = 0; k < color_outputs; ++k) {
472
473 unsigned location = semantics.location - FRAG_RESULT_DATA0;
474
475 if (semantics.location == FRAG_RESULT_COLOR)
476 location = driver_location + k;
477
478 if (semantics.dual_source_blend_index)
479 location = semantics.dual_source_blend_index;
480
481 sfn_log << SfnLog::io << "Pixel output at loc:" << location
482 << "("<< semantics.location << ") of "<< m_max_color_exports<<"\n";
483
484 if (location >= m_max_color_exports) {
485 sfn_log << SfnLog::io << "Pixel output loc:" << location
486 << " dl:" << driver_location << " skipped because we have only "
487 << m_max_color_exports << " CBs\n";
488 return true;
489 }
490
491 m_last_pixel_export = new ExportInstr(ExportInstr::pixel, location, value);
492
493 if (m_export_highest < location)
494 m_export_highest = location;
495
496 m_num_color_exports++;
497
498 /* Hack: force dual source output handling if one color output has a
499 * dual_source_blend_index > 0 */
500 if (semantics.dual_source_blend_index > 0)
501 m_dual_source_blend = true;
502
503 if (m_num_color_exports > 1)
504 m_fs_write_all = false;
505 unsigned mask = (0xfu << (location * 4));
506
507 m_color_export_written_mask |= (1 << location);
508
509 /* If the i-th target format is set, all previous target formats must
510 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
511 /*/
512 for (unsigned i = 0; i < location; ++i)
513 mask |= (0x1u << (i * 4));
514
515 m_color_export_mask |= mask;
516
517 emit_instruction(m_last_pixel_export);
518 }
519 } else if (semantics.location == FRAG_RESULT_DEPTH ||
520 semantics.location == FRAG_RESULT_STENCIL ||
521 semantics.location == FRAG_RESULT_SAMPLE_MASK) {
522 emit_instruction(new ExportInstr(ExportInstr::pixel, 61, value));
523
524 ShaderOutput output(driver_location, write_mask);
525 output.set_frag_result(static_cast<gl_frag_result>(semantics.location));
526 add_output(output);
527
528 } else {
529 return false;
530 }
531 return true;
532 }
533
534 bool
emit_load_sample_pos(nir_intrinsic_instr * instr)535 FragmentShader::emit_load_sample_pos(nir_intrinsic_instr *instr)
536 {
537 auto dest = value_factory().dest_vec4(instr->def, pin_group);
538
539 auto fetch = new LoadFromBuffer(dest,
540 {0, 1, 2, 3},
541 m_sample_id_reg,
542 0,
543 R600_BUFFER_INFO_CONST_BUFFER,
544 nullptr,
545 fmt_32_32_32_32_float);
546 fetch->set_fetch_flag(FetchInstr::srf_mode);
547 emit_instruction(fetch);
548 return true;
549 }
550
551 void
do_finalize()552 FragmentShader::do_finalize()
553 {
554 /* On pre-evergreen not emtting something to all color exports that
555 * are enabled might lead to a hang.
556 * see: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9223
557 */
558 if (chip_class() < ISA_CC_EVERGREEN) {
559 unsigned i = 0;
560 unsigned mask = m_color_export_mask;
561
562 while (i < m_max_color_exports && (mask & (1u << (4 * i)))) {
563 if (!(m_color_export_written_mask & (1u << i))) {
564 RegisterVec4 value(0, false, {7, 7, 7, 7});
565 m_last_pixel_export = new ExportInstr(ExportInstr::pixel, i, value);
566 emit_instruction(m_last_pixel_export);
567 m_num_color_exports++;
568 if (m_export_highest < i)
569 m_export_highest = i;
570 }
571 ++i;
572 }
573 }
574
575 if (!m_last_pixel_export) {
576 RegisterVec4 value(0, false, {7, 7, 7, 7});
577 m_last_pixel_export = new ExportInstr(ExportInstr::pixel, 0, value);
578 emit_instruction(m_last_pixel_export);
579 m_num_color_exports++;
580 m_color_export_mask |= 0xf;
581 }
582 m_last_pixel_export->set_is_last_export(true);
583 }
584
585 bool
read_prop(std::istream & is)586 FragmentShader::read_prop(std::istream& is)
587 {
588 string value;
589 is >> value;
590
591 ASSERTED auto splitpos = value.find(':');
592 assert(splitpos != string::npos);
593
594 std::istringstream ival(value);
595 string name;
596 string val;
597
598 std::getline(ival, name, ':');
599
600 if (name == "MAX_COLOR_EXPORTS")
601 ival >> m_max_color_exports;
602 else if (name == "COLOR_EXPORTS")
603 ival >> m_num_color_exports;
604 else if (name == "COLOR_EXPORT_MASK")
605 ival >> m_color_export_mask;
606 else if (name == "WRITE_ALL_COLORS")
607 ival >> m_fs_write_all;
608 else
609 return false;
610 return true;
611 }
612
613 void
do_print_properties(std::ostream & os) const614 FragmentShader::do_print_properties(std::ostream& os) const
615 {
616 os << "PROP MAX_COLOR_EXPORTS:" << m_max_color_exports << "\n";
617 os << "PROP COLOR_EXPORTS:" << m_num_color_exports << "\n";
618 os << "PROP COLOR_EXPORT_MASK:" << m_color_export_mask << "\n";
619 os << "PROP WRITE_ALL_COLORS:" << m_fs_write_all << "\n";
620 }
621
622 int
allocate_interpolators_or_inputs()623 FragmentShaderR600::allocate_interpolators_or_inputs()
624 {
625 int pos = 0;
626 auto& vf = value_factory();
627 for (auto& [index, inp] : inputs()) {
628 if (inp.need_lds_pos()) {
629
630 RegisterVec4 input(vf.allocate_pinned_register(pos, 0),
631 vf.allocate_pinned_register(pos, 1),
632 vf.allocate_pinned_register(pos, 2),
633 vf.allocate_pinned_register(pos, 3),
634 pin_fully);
635 inp.set_gpr(pos++);
636
637 sfn_log << SfnLog::io << "Reserve input register at pos " << index << " as "
638 << input << " with register " << inp.gpr() << "\n";
639
640 m_interpolated_inputs[index] = input;
641 }
642 }
643 return pos;
644 }
645
646 bool
load_input_hw(nir_intrinsic_instr * intr)647 FragmentShaderR600::load_input_hw(nir_intrinsic_instr *intr)
648 {
649 auto& vf = value_factory();
650 AluInstr *ir = nullptr;
651 for (unsigned i = 0; i < intr->def.num_components; ++i) {
652 sfn_log << SfnLog::io << "Inject register "
653 << *m_interpolated_inputs[nir_intrinsic_base(intr)][i] << "\n";
654 unsigned index = nir_intrinsic_component(intr) + i;
655 assert(index < 4);
656 vf.inject_value(intr->def,
657 i,
658 m_interpolated_inputs[nir_intrinsic_base(intr)][index]);
659 }
660 if (ir)
661 ir->set_alu_flag(alu_last_instr);
662 return true;
663 }
664
665 bool
process_stage_intrinsic_hw(nir_intrinsic_instr * intr)666 FragmentShaderR600::process_stage_intrinsic_hw(nir_intrinsic_instr *intr)
667 {
668 switch (intr->intrinsic) {
669 case nir_intrinsic_load_barycentric_centroid:
670 case nir_intrinsic_load_barycentric_pixel:
671 case nir_intrinsic_load_barycentric_sample:
672 return true;
673 default:
674 return false;
675 }
676 }
677
678 bool
load_interpolated_input_hw(nir_intrinsic_instr * intr)679 FragmentShaderR600::load_interpolated_input_hw(nir_intrinsic_instr *intr)
680 {
681 return load_input_hw(intr);
682 }
683
684 bool
load_input_hw(nir_intrinsic_instr * intr)685 FragmentShaderEG::load_input_hw(nir_intrinsic_instr *intr)
686 {
687 auto& vf = value_factory();
688 auto io = input(nir_intrinsic_base(intr));
689 auto comp = nir_intrinsic_component(intr);
690
691 bool need_temp = comp > 0;
692 AluInstr *ir = nullptr;
693 for (unsigned i = 0; i < intr->def.num_components; ++i) {
694 if (need_temp) {
695 auto tmp = vf.temp_register(comp + i);
696 ir =
697 new AluInstr(op1_interp_load_p0,
698 tmp,
699 new InlineConstant(ALU_SRC_PARAM_BASE + io.lds_pos(), i + comp),
700 AluInstr::last_write);
701 emit_instruction(ir);
702 emit_instruction(new AluInstr(
703 op1_mov, vf.dest(intr->def, i, pin_chan), tmp, AluInstr::last_write));
704 } else {
705
706 ir = new AluInstr(op1_interp_load_p0,
707 vf.dest(intr->def, i, pin_chan),
708 new InlineConstant(ALU_SRC_PARAM_BASE + io.lds_pos(), i),
709 AluInstr::write);
710 emit_instruction(ir);
711 }
712 }
713 ir->set_alu_flag(alu_last_instr);
714 return true;
715 }
716
717 int
allocate_interpolators_or_inputs()718 FragmentShaderEG::allocate_interpolators_or_inputs()
719 {
720 for (unsigned i = 0; i < s_max_interpolators; ++i) {
721 if (interpolators_used(i)) {
722 sfn_log << SfnLog::io << "Interpolator " << i << " test enabled\n";
723 m_interpolator[i].enabled = true;
724 }
725 }
726
727 int num_baryc = 0;
728 for (int i = 0; i < 6; ++i) {
729 if (m_interpolator[i].enabled) {
730 sfn_log << SfnLog::io << "Interpolator " << i
731 << " is enabled with ij=" << num_baryc << " \n";
732 unsigned sel = num_baryc / 2;
733 unsigned chan = 2 * (num_baryc % 2);
734
735 m_interpolator[i].i = value_factory().allocate_pinned_register(sel, chan + 1);
736 m_interpolator[i].j = value_factory().allocate_pinned_register(sel, chan);
737
738 m_interpolator[i].ij_index = num_baryc++;
739 }
740 }
741 return (num_baryc + 1) >> 1;
742 }
743
744 bool
process_stage_intrinsic_hw(nir_intrinsic_instr * intr)745 FragmentShaderEG::process_stage_intrinsic_hw(nir_intrinsic_instr *intr)
746 {
747 auto& vf = value_factory();
748 switch (intr->intrinsic) {
749 case nir_intrinsic_load_barycentric_centroid:
750 case nir_intrinsic_load_barycentric_pixel:
751 case nir_intrinsic_load_barycentric_sample: {
752 unsigned ij = barycentric_ij_index(intr);
753 vf.inject_value(intr->def, 0, m_interpolator[ij].i);
754 vf.inject_value(intr->def, 1, m_interpolator[ij].j);
755 return true;
756 }
757 case nir_intrinsic_load_barycentric_at_offset:
758 return load_barycentric_at_offset(intr);
759 case nir_intrinsic_load_barycentric_at_sample:
760 return load_barycentric_at_sample(intr);
761 default:
762 return false;
763 }
764 }
765
766 bool
load_interpolated_input_hw(nir_intrinsic_instr * intr)767 FragmentShaderEG::load_interpolated_input_hw(nir_intrinsic_instr *intr)
768 {
769 auto& vf = value_factory();
770 ASSERTED auto param = nir_src_as_const_value(intr->src[1]);
771 assert(param && "Indirect PS inputs not (yet) supported");
772
773 int dest_num_comp = intr->def.num_components;
774 int start_comp = nir_intrinsic_component(intr);
775 bool need_temp = start_comp > 0;
776
777 auto dst = need_temp ? vf.temp_vec4(pin_chan) : vf.dest_vec4(intr->def, pin_chan);
778
779 InterpolateParams params;
780
781 params.i = vf.src(intr->src[0], 0);
782 params.j = vf.src(intr->src[0], 1);
783 params.base = input(nir_intrinsic_base(intr)).lds_pos();
784
785 if (!load_interpolated(dst, params, dest_num_comp, start_comp))
786 return false;
787
788 if (need_temp) {
789 AluInstr *ir = nullptr;
790 for (unsigned i = 0; i < intr->def.num_components; ++i) {
791 auto real_dst = vf.dest(intr->def, i, pin_chan);
792 ir = new AluInstr(op1_mov, real_dst, dst[i + start_comp], AluInstr::write);
793 emit_instruction(ir);
794 }
795 assert(ir);
796 ir->set_alu_flag(alu_last_instr);
797 }
798
799 return true;
800 }
801
802 bool
load_interpolated(RegisterVec4 & dest,const InterpolateParams & params,int num_dest_comp,int start_comp)803 FragmentShaderEG::load_interpolated(RegisterVec4& dest,
804 const InterpolateParams& params,
805 int num_dest_comp,
806 int start_comp)
807 {
808 sfn_log << SfnLog::io << "Using Interpolator (" << *params.j << ", " << *params.i
809 << ")"
810 << "\n";
811
812 if (num_dest_comp == 1) {
813 switch (start_comp) {
814 case 0:
815 return load_interpolated_one_comp(dest, params, op2_interp_x);
816 case 1:
817 return load_interpolated_two_comp_for_one(dest, params, op2_interp_xy, 1);
818 case 2:
819 return load_interpolated_one_comp(dest, params, op2_interp_z);
820 case 3:
821 return load_interpolated_two_comp_for_one(dest, params, op2_interp_zw, 3);
822 default:
823 assert(0);
824 }
825 }
826
827 if (num_dest_comp == 2) {
828 switch (start_comp) {
829 case 0:
830 return load_interpolated_two_comp(dest, params, op2_interp_xy, 0x3);
831 case 2:
832 return load_interpolated_two_comp(dest, params, op2_interp_zw, 0xc);
833 case 1:
834 return load_interpolated_one_comp(dest, params, op2_interp_z) &&
835 load_interpolated_two_comp_for_one(dest, params, op2_interp_xy, 1);
836 default:
837 assert(0);
838 }
839 }
840
841 if (num_dest_comp == 3 && start_comp == 0)
842 return load_interpolated_two_comp(dest, params, op2_interp_xy, 0x3) &&
843 load_interpolated_one_comp(dest, params, op2_interp_z);
844
845 int full_write_mask = ((1 << num_dest_comp) - 1) << start_comp;
846
847 bool success =
848 load_interpolated_two_comp(dest, params, op2_interp_zw, full_write_mask & 0xc);
849 success &=
850 load_interpolated_two_comp(dest, params, op2_interp_xy, full_write_mask & 0x3);
851 return success;
852 }
853
854 bool
load_barycentric_at_sample(nir_intrinsic_instr * instr)855 FragmentShaderEG::load_barycentric_at_sample(nir_intrinsic_instr *instr)
856 {
857 auto& vf = value_factory();
858 RegisterVec4 slope = vf.temp_vec4(pin_group);
859 auto src = emit_load_to_register(vf.src(instr->src[0], 0));
860 auto fetch = new LoadFromBuffer(slope,
861 {0, 1, 2, 3},
862 src,
863 0,
864 R600_BUFFER_INFO_CONST_BUFFER,
865 nullptr,
866 fmt_32_32_32_32_float);
867
868 fetch->set_fetch_flag(FetchInstr::srf_mode);
869 emit_instruction(fetch);
870
871 auto grad = vf.temp_vec4(pin_group);
872
873 auto interpolator = m_interpolator[barycentric_ij_index(instr)];
874 assert(interpolator.enabled);
875
876 RegisterVec4 interp(interpolator.j, interpolator.i, nullptr, nullptr, pin_group);
877
878 auto tex = new TexInstr(TexInstr::get_gradient_h, grad, {0, 1, 7, 7}, interp, 0, 0);
879 tex->set_tex_flag(TexInstr::grad_fine);
880 tex->set_tex_flag(TexInstr::x_unnormalized);
881 tex->set_tex_flag(TexInstr::y_unnormalized);
882 tex->set_tex_flag(TexInstr::z_unnormalized);
883 tex->set_tex_flag(TexInstr::w_unnormalized);
884 emit_instruction(tex);
885
886 tex = new TexInstr(TexInstr::get_gradient_v, grad, {7, 7, 0, 1}, interp, 0, 0);
887 tex->set_tex_flag(TexInstr::x_unnormalized);
888 tex->set_tex_flag(TexInstr::y_unnormalized);
889 tex->set_tex_flag(TexInstr::z_unnormalized);
890 tex->set_tex_flag(TexInstr::w_unnormalized);
891 tex->set_tex_flag(TexInstr::grad_fine);
892 emit_instruction(tex);
893
894 auto tmp0 = vf.temp_register();
895 auto tmp1 = vf.temp_register();
896
897 emit_instruction(
898 new AluInstr(op3_muladd, tmp0, grad[0], slope[2], interpolator.j, {alu_write}));
899 emit_instruction(new AluInstr(
900 op3_muladd, tmp1, grad[1], slope[2], interpolator.i, {alu_write, alu_last_instr}));
901
902 emit_instruction(new AluInstr(op3_muladd,
903 vf.dest(instr->def, 0, pin_none),
904 grad[3],
905 slope[3],
906 tmp1,
907 {alu_write}));
908 emit_instruction(new AluInstr(op3_muladd,
909 vf.dest(instr->def, 1, pin_none),
910 grad[2],
911 slope[3],
912 tmp0,
913 {alu_write, alu_last_instr}));
914
915 return true;
916 }
917
918 bool
load_barycentric_at_offset(nir_intrinsic_instr * instr)919 FragmentShaderEG::load_barycentric_at_offset(nir_intrinsic_instr *instr)
920 {
921 auto& vf = value_factory();
922 auto interpolator = m_interpolator[barycentric_ij_index(instr)];
923
924 auto help = vf.temp_vec4(pin_group);
925 RegisterVec4 interp(interpolator.j, interpolator.i, nullptr, nullptr, pin_group);
926
927 auto getgradh =
928 new TexInstr(TexInstr::get_gradient_h, help, {0, 1, 7, 7}, interp, 0, 0);
929 getgradh->set_tex_flag(TexInstr::x_unnormalized);
930 getgradh->set_tex_flag(TexInstr::y_unnormalized);
931 getgradh->set_tex_flag(TexInstr::z_unnormalized);
932 getgradh->set_tex_flag(TexInstr::w_unnormalized);
933 getgradh->set_tex_flag(TexInstr::grad_fine);
934 emit_instruction(getgradh);
935
936 auto getgradv =
937 new TexInstr(TexInstr::get_gradient_v, help, {7, 7, 0, 1}, interp, 0, 0);
938 getgradv->set_tex_flag(TexInstr::x_unnormalized);
939 getgradv->set_tex_flag(TexInstr::y_unnormalized);
940 getgradv->set_tex_flag(TexInstr::z_unnormalized);
941 getgradv->set_tex_flag(TexInstr::w_unnormalized);
942 getgradv->set_tex_flag(TexInstr::grad_fine);
943 emit_instruction(getgradv);
944
945 auto ofs_x = vf.src(instr->src[0], 0);
946 auto ofs_y = vf.src(instr->src[0], 1);
947 auto tmp0 = vf.temp_register();
948 auto tmp1 = vf.temp_register();
949 emit_instruction(
950 new AluInstr(op3_muladd, tmp0, help[0], ofs_x, interpolator.j, {alu_write}));
951 emit_instruction(new AluInstr(
952 op3_muladd, tmp1, help[1], ofs_x, interpolator.i, {alu_write, alu_last_instr}));
953 emit_instruction(new AluInstr(
954 op3_muladd, vf.dest(instr->def, 0, pin_none), help[3], ofs_y, tmp1, {alu_write}));
955 emit_instruction(new AluInstr(op3_muladd,
956 vf.dest(instr->def, 1, pin_none),
957 help[2],
958 ofs_y,
959 tmp0,
960 {alu_write, alu_last_instr}));
961
962 return true;
963 }
964
965 bool
load_interpolated_one_comp(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op)966 FragmentShaderEG::load_interpolated_one_comp(RegisterVec4& dest,
967 const InterpolateParams& params,
968 EAluOp op)
969 {
970 auto group = new AluGroup();
971 bool success = true;
972
973 AluInstr *ir = nullptr;
974 for (unsigned i = 0; i < 2 && success; ++i) {
975 int chan = i;
976 if (op == op2_interp_z)
977 chan += 2;
978
979 ir = new AluInstr(op,
980 dest[chan],
981 i & 1 ? params.j : params.i,
982 new InlineConstant(ALU_SRC_PARAM_BASE + params.base, chan),
983 i == 0 ? AluInstr::write : AluInstr::last);
984
985 ir->set_bank_swizzle(alu_vec_210);
986 success = group->add_instruction(ir);
987 }
988 ir->set_alu_flag(alu_last_instr);
989 if (success)
990 emit_instruction(group);
991 return success;
992 }
993
994 bool
load_interpolated_two_comp(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op,int writemask)995 FragmentShaderEG::load_interpolated_two_comp(RegisterVec4& dest,
996 const InterpolateParams& params,
997 EAluOp op,
998 int writemask)
999 {
1000 auto group = new AluGroup();
1001 bool success = true;
1002
1003 AluInstr *ir = nullptr;
1004 assert(params.j);
1005 assert(params.i);
1006 for (unsigned i = 0; i < 4; ++i) {
1007 ir = new AluInstr(op,
1008 dest[i],
1009 i & 1 ? params.j : params.i,
1010 new InlineConstant(ALU_SRC_PARAM_BASE + params.base, i),
1011 (writemask & (1 << i)) ? AluInstr::write : AluInstr::empty);
1012 ir->set_bank_swizzle(alu_vec_210);
1013 success = group->add_instruction(ir);
1014 }
1015 ir->set_alu_flag(alu_last_instr);
1016 if (success)
1017 emit_instruction(group);
1018 return success;
1019 }
1020
1021 bool
load_interpolated_two_comp_for_one(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op,int comp)1022 FragmentShaderEG::load_interpolated_two_comp_for_one(RegisterVec4& dest,
1023 const InterpolateParams& params,
1024 EAluOp op,
1025 int comp)
1026 {
1027 auto group = new AluGroup();
1028 bool success = true;
1029 AluInstr *ir = nullptr;
1030
1031 for (int i = 0; i < 4; ++i) {
1032 ir = new AluInstr(op,
1033 dest[i],
1034 i & 1 ? params.j : params.i,
1035 new InlineConstant(ALU_SRC_PARAM_BASE + params.base, i),
1036 i == comp ? AluInstr::write : AluInstr::empty);
1037 ir->set_bank_swizzle(alu_vec_210);
1038 success = group->add_instruction(ir);
1039 }
1040 ir->set_alu_flag(alu_last_instr);
1041 if (success)
1042 emit_instruction(group);
1043
1044 return success;
1045 }
1046
Interpolator()1047 FragmentShaderEG::Interpolator::Interpolator():
1048 enabled(false)
1049 {
1050 }
1051
1052 } // namespace r600
1053