xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/r600/sfn/sfn_shader_fs.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /* -*- mesa-c++  -*-
2  * Copyright 2022 Collabora LTD
3  * Author: Gert Wollny <[email protected]>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "sfn_shader_fs.h"
8 
9 #include "sfn_debug.h"
10 #include "sfn_instr_alugroup.h"
11 #include "sfn_instr_export.h"
12 #include "sfn_instr_fetch.h"
13 #include "sfn_instr_tex.h"
14 
15 #include <sstream>
16 
17 namespace r600 {
18 
19 using std::string;
20 
FragmentShader(const r600_shader_key & key)21 FragmentShader::FragmentShader(const r600_shader_key& key):
22     Shader("FS", key.ps.first_atomic_counter),
23     m_dual_source_blend(key.ps.dual_source_blend),
24     m_max_color_exports(MAX2(key.ps.nr_cbufs, 1)),
25     m_pos_input(127, false),
26     m_fs_write_all(false),
27     m_apply_sample_mask(key.ps.apply_sample_id_mask),
28     m_rat_base(key.ps.nr_cbufs),
29     m_image_size_const_offset(key.ps.image_size_const_offset)
30 {
31 }
32 
33 void
do_get_shader_info(r600_shader * sh_info)34 FragmentShader::do_get_shader_info(r600_shader *sh_info)
35 {
36    sh_info->processor_type = PIPE_SHADER_FRAGMENT;
37 
38    sh_info->ps_color_export_mask = m_color_export_mask;
39    sh_info->ps_export_highest = m_export_highest;
40    sh_info->nr_ps_color_exports = m_num_color_exports;
41 
42    sh_info->fs_write_all = m_fs_write_all;
43 
44    sh_info->rat_base = m_rat_base;
45    sh_info->uses_kill = m_uses_discard;
46    sh_info->gs_prim_id_input = m_gs_prim_id_input;
47    sh_info->nsys_inputs = m_nsys_inputs;
48    sh_info->uses_helper_invocation = m_helper_invocation != nullptr;
49 }
50 
51 bool
load_input(nir_intrinsic_instr * intr)52 FragmentShader::load_input(nir_intrinsic_instr *intr)
53 {
54    auto& vf = value_factory();
55 
56    auto location = nir_intrinsic_io_semantics(intr).location;
57    if (location == VARYING_SLOT_POS) {
58       AluInstr *ir = nullptr;
59       for (unsigned i = 0; i < intr->def.num_components; ++i) {
60          ir = new AluInstr(op1_mov,
61                            vf.dest(intr->def, i, pin_none),
62                            m_pos_input[i],
63                            AluInstr::write);
64          emit_instruction(ir);
65       }
66       ir->set_alu_flag(alu_last_instr);
67       return true;
68    }
69 
70    if (location == VARYING_SLOT_FACE) {
71       auto ir = new AluInstr(op2_setgt_dx10,
72                              vf.dest(intr->def, 0, pin_none),
73                              m_face_input,
74                              vf.inline_const(ALU_SRC_0, 0),
75                              AluInstr::last_write);
76       emit_instruction(ir);
77       return true;
78    }
79 
80    return load_input_hw(intr);
81 }
82 
83 bool
store_output(nir_intrinsic_instr * intr)84 FragmentShader::store_output(nir_intrinsic_instr *intr)
85 {
86    auto location = nir_intrinsic_io_semantics(intr).location;
87 
88    if (location == FRAG_RESULT_COLOR && !m_dual_source_blend) {
89       m_fs_write_all = true;
90    }
91 
92    return emit_export_pixel(*intr);
93 }
94 
95 unsigned
barycentric_ij_index(nir_intrinsic_instr * intr)96 barycentric_ij_index(nir_intrinsic_instr *intr)
97 {
98    unsigned index = 0;
99    switch (intr->intrinsic) {
100    case nir_intrinsic_load_barycentric_sample:
101       index = 0;
102       break;
103    case nir_intrinsic_load_barycentric_at_sample:
104    case nir_intrinsic_load_barycentric_at_offset:
105    case nir_intrinsic_load_barycentric_pixel:
106       index = 1;
107       break;
108    case nir_intrinsic_load_barycentric_centroid:
109       index = 2;
110       break;
111    default:
112       unreachable("Unknown interpolator intrinsic");
113    }
114 
115    switch (nir_intrinsic_interp_mode(intr)) {
116    case INTERP_MODE_NONE:
117    case INTERP_MODE_SMOOTH:
118       return index;
119    case INTERP_MODE_NOPERSPECTIVE:
120       return index + 3;
121    case INTERP_MODE_FLAT:
122    case INTERP_MODE_EXPLICIT:
123    default:
124       unreachable("unknown/unsupported mode for load_interpolated");
125    }
126    return 0;
127 }
128 
129 bool
process_stage_intrinsic(nir_intrinsic_instr * intr)130 FragmentShader::process_stage_intrinsic(nir_intrinsic_instr *intr)
131 {
132    if (process_stage_intrinsic_hw(intr))
133       return true;
134 
135    switch (intr->intrinsic) {
136    case nir_intrinsic_load_input:
137       return load_input(intr);
138    case nir_intrinsic_load_interpolated_input:
139       return load_interpolated_input(intr);
140    case nir_intrinsic_terminate_if:
141       m_uses_discard = true;
142       emit_instruction(new AluInstr(op2_killne_int,
143                                     nullptr,
144                                     value_factory().src(intr->src[0], 0),
145                                     value_factory().zero(),
146                                     {AluInstr::last}));
147 
148       return true;
149    case nir_intrinsic_terminate:
150       m_uses_discard = true;
151       emit_instruction(new AluInstr(op2_kille_int,
152                                     nullptr,
153                                     value_factory().zero(),
154                                     value_factory().zero(),
155                                     {AluInstr::last}));
156       return true;
157    case nir_intrinsic_load_sample_mask_in:
158       if (m_apply_sample_mask) {
159          return emit_load_sample_mask_in(intr);
160       } else
161          return emit_simple_mov(intr->def, 0, m_sample_mask_reg);
162    case nir_intrinsic_load_sample_id:
163       return emit_simple_mov(intr->def, 0, m_sample_id_reg);
164    case nir_intrinsic_load_helper_invocation:
165       return emit_load_helper_invocation(intr);
166    case nir_intrinsic_load_sample_pos:
167       return emit_load_sample_pos(intr);
168    default:
169       return false;
170    }
171 }
172 
173 bool
load_interpolated_input(nir_intrinsic_instr * intr)174 FragmentShader::load_interpolated_input(nir_intrinsic_instr *intr)
175 {
176    auto& vf = value_factory();
177    unsigned loc = nir_intrinsic_io_semantics(intr).location;
178    switch (loc) {
179    case VARYING_SLOT_POS:
180       for (unsigned i = 0; i < intr->def.num_components; ++i)
181          vf.inject_value(intr->def, i, m_pos_input[i]);
182       return true;
183    case VARYING_SLOT_FACE:
184       return false;
185    default:;
186    }
187 
188    return load_interpolated_input_hw(intr);
189 }
190 
191 int
do_allocate_reserved_registers()192 FragmentShader::do_allocate_reserved_registers()
193 {
194    int next_register = allocate_interpolators_or_inputs();
195 
196    if (m_sv_values.test(es_pos)) {
197       set_input_gpr(m_pos_driver_loc, next_register);
198       m_pos_input = value_factory().allocate_pinned_vec4(next_register++, false);
199    }
200 
201    int face_reg_index = -1;
202    if (m_sv_values.test(es_face)) {
203       set_input_gpr(m_face_driver_loc, next_register);
204       face_reg_index = next_register++;
205       m_face_input = value_factory().allocate_pinned_register(face_reg_index, 0);
206    }
207 
208    if (m_sv_values.test(es_sample_mask_in)) {
209       if (face_reg_index < 0)
210          face_reg_index = next_register++;
211       m_sample_mask_reg = value_factory().allocate_pinned_register(face_reg_index, 2);
212       sfn_log << SfnLog::io << "Set sample mask in register to " << *m_sample_mask_reg
213               << "\n";
214       m_nsys_inputs = 1;
215       ShaderInput input(ninputs());
216       input.set_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN);
217       input.set_gpr(face_reg_index);
218       add_input(input);
219    }
220 
221    if (m_sv_values.test(es_sample_id) || m_sv_values.test(es_sample_mask_in)) {
222       int sample_id_reg = next_register++;
223       m_sample_id_reg = value_factory().allocate_pinned_register(sample_id_reg, 3);
224       sfn_log << SfnLog::io << "Set sample id register to " << *m_sample_id_reg << "\n";
225       m_nsys_inputs++;
226       ShaderInput input(ninputs());
227       input.set_system_value(SYSTEM_VALUE_SAMPLE_ID);
228       input.set_gpr(sample_id_reg);
229       add_input(input);
230    }
231 
232    if (m_sv_values.test(es_helper_invocation)) {
233       m_helper_invocation = value_factory().temp_register(0, false);
234    }
235 
236    return next_register;
237 }
238 
239 bool
do_scan_instruction(nir_instr * instr)240 FragmentShader::do_scan_instruction(nir_instr *instr)
241 {
242    if (instr->type != nir_instr_type_intrinsic)
243       return false;
244 
245    auto intr = nir_instr_as_intrinsic(instr);
246    switch (intr->intrinsic) {
247    case nir_intrinsic_load_barycentric_pixel:
248    case nir_intrinsic_load_barycentric_sample:
249    case nir_intrinsic_load_barycentric_at_sample:
250    case nir_intrinsic_load_barycentric_at_offset:
251    case nir_intrinsic_load_barycentric_centroid:
252       m_interpolators_used.set(barycentric_ij_index(intr));
253       break;
254    case nir_intrinsic_load_front_face:
255       m_sv_values.set(es_face);
256       break;
257    case nir_intrinsic_load_sample_mask_in:
258       m_sv_values.set(es_sample_mask_in);
259       break;
260    case nir_intrinsic_load_sample_pos:
261       m_sv_values.set(es_sample_pos);
262       FALLTHROUGH;
263    case nir_intrinsic_load_sample_id:
264       m_sv_values.set(es_sample_id);
265       break;
266    case nir_intrinsic_load_helper_invocation:
267       m_sv_values.set(es_helper_invocation);
268       break;
269    case nir_intrinsic_load_input:
270       return scan_input(intr, 0);
271    case nir_intrinsic_load_interpolated_input:
272       return scan_input(intr, 1);
273    default:
274       return false;
275    }
276    return true;
277 }
278 
279 bool
emit_load_sample_mask_in(nir_intrinsic_instr * instr)280 FragmentShader::emit_load_sample_mask_in(nir_intrinsic_instr *instr)
281 {
282    auto& vf = value_factory();
283    auto dest = vf.dest(instr->def, 0, pin_free);
284    auto tmp = vf.temp_register();
285    assert(m_sample_id_reg);
286    assert(m_sample_mask_reg);
287 
288    emit_instruction(
289       new AluInstr(op2_lshl_int, tmp, vf.one_i(), m_sample_id_reg, AluInstr::last_write));
290    emit_instruction(
291       new AluInstr(op2_and_int, dest, tmp, m_sample_mask_reg, AluInstr::last_write));
292    return true;
293 }
294 
295 bool
emit_load_helper_invocation(nir_intrinsic_instr * instr)296 FragmentShader::emit_load_helper_invocation(nir_intrinsic_instr *instr)
297 {
298    assert(m_helper_invocation);
299    auto& vf = value_factory();
300    emit_instruction(
301       new AluInstr(op1_mov, m_helper_invocation, vf.literal(-1), AluInstr::last_write));
302    RegisterVec4 destvec{m_helper_invocation, nullptr, nullptr, nullptr, pin_group};
303 
304    auto vtx = new LoadFromBuffer(destvec,
305                                  {4, 7, 7, 7},
306                                  m_helper_invocation,
307                                  0,
308                                  R600_BUFFER_INFO_CONST_BUFFER,
309                                  nullptr,
310                                  fmt_32_32_32_32_float);
311    vtx->set_fetch_flag(FetchInstr::vpm);
312    vtx->set_fetch_flag(FetchInstr::use_tc);
313    vtx->set_always_keep();
314    auto dst = value_factory().dest(instr->def, 0, pin_free);
315    auto ir = new AluInstr(op1_mov, dst, m_helper_invocation, AluInstr::last_write);
316    ir->add_required_instr(vtx);
317    emit_instruction(vtx);
318    emit_instruction(ir);
319 
320    return true;
321 }
322 
323 bool
scan_input(nir_intrinsic_instr * intr,int index_src_id)324 FragmentShader::scan_input(nir_intrinsic_instr *intr, int index_src_id)
325 {
326    auto index = nir_src_as_const_value(intr->src[index_src_id]);
327    assert(index);
328 
329    const unsigned location_offset = chip_class() < ISA_CC_EVERGREEN ? 32 : 0;
330    bool uses_interpol_at_centroid = false;
331 
332    auto location =
333       static_cast<gl_varying_slot>(nir_intrinsic_io_semantics(intr).location + index->u32);
334    unsigned driver_location = nir_intrinsic_base(intr) + index->u32;
335 
336    if (location == VARYING_SLOT_POS) {
337       m_sv_values.set(es_pos);
338       m_pos_driver_loc = driver_location + location_offset;
339       ShaderInput pos_input(m_pos_driver_loc, location);
340       pos_input.set_interpolator(TGSI_INTERPOLATE_LINEAR,
341                                  TGSI_INTERPOLATE_LOC_CENTER,
342                                  false);
343       add_input(pos_input);
344       return true;
345    }
346 
347    if (location == VARYING_SLOT_FACE) {
348       m_sv_values.set(es_face);
349       m_face_driver_loc = driver_location + location_offset;
350       ShaderInput face_input(m_face_driver_loc, location);
351       add_input(face_input);
352       return true;
353    }
354 
355    tgsi_interpolate_mode tgsi_interpolate = TGSI_INTERPOLATE_CONSTANT;
356    tgsi_interpolate_loc tgsi_loc = TGSI_INTERPOLATE_LOC_CENTER;
357 
358    const bool is_color =
359       (location >= VARYING_SLOT_COL0 && location <= VARYING_SLOT_COL1) ||
360       (location >= VARYING_SLOT_BFC0 && location <= VARYING_SLOT_BFC1);
361 
362    if (index_src_id > 0) {
363       glsl_interp_mode mode = INTERP_MODE_NONE;
364       auto parent = nir_instr_as_intrinsic(intr->src[0].ssa->parent_instr);
365       mode = (glsl_interp_mode)nir_intrinsic_interp_mode(parent);
366       switch (parent->intrinsic) {
367       case nir_intrinsic_load_barycentric_sample:
368          tgsi_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
369          break;
370       case nir_intrinsic_load_barycentric_at_sample:
371       case nir_intrinsic_load_barycentric_at_offset:
372       case nir_intrinsic_load_barycentric_pixel:
373          tgsi_loc = TGSI_INTERPOLATE_LOC_CENTER;
374          break;
375       case nir_intrinsic_load_barycentric_centroid:
376          tgsi_loc = TGSI_INTERPOLATE_LOC_CENTROID;
377          uses_interpol_at_centroid = true;
378          break;
379       default:
380          std::cerr << "Instruction " << nir_intrinsic_infos[parent->intrinsic].name
381                    << " as parent of " << nir_intrinsic_infos[intr->intrinsic].name
382                    << " interpolator?\n";
383          assert(0);
384       }
385 
386       switch (mode) {
387       case INTERP_MODE_NONE:
388          if (is_color) {
389             tgsi_interpolate = TGSI_INTERPOLATE_COLOR;
390             break;
391          }
392          FALLTHROUGH;
393       case INTERP_MODE_SMOOTH:
394          tgsi_interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
395          break;
396       case INTERP_MODE_NOPERSPECTIVE:
397          tgsi_interpolate = TGSI_INTERPOLATE_LINEAR;
398          break;
399       case INTERP_MODE_FLAT:
400          break;
401       case INTERP_MODE_EXPLICIT:
402       default:
403          assert(0);
404       }
405    }
406 
407    if (location == VARYING_SLOT_PRIMITIVE_ID) {
408       m_gs_prim_id_input = true;
409    } else if (!(is_color || (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_MAX) ||
410                 (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7) ||
411                 (location >= VARYING_SLOT_CLIP_DIST0 && location <= VARYING_SLOT_CLIP_DIST1) ||
412                 location == VARYING_SLOT_FOGC || location == VARYING_SLOT_LAYER ||
413                 location == VARYING_SLOT_PNTC || location == VARYING_SLOT_VIEWPORT)) {
414       return false;
415    }
416 
417    sfn_log << SfnLog::io << " have IO at " << driver_location << "\n";
418    auto iinput = find_input(driver_location);
419    if (iinput == input_not_found()) {
420       ShaderInput input(driver_location, location);
421       input.set_need_lds_pos();
422       input.set_interpolator(tgsi_interpolate, tgsi_loc, uses_interpol_at_centroid);
423       sfn_log << SfnLog::io << "add IO with LDS ID at " << input.location() << "\n";
424       add_input(input);
425       assert(find_input(input.location()) != input_not_found());
426    } else {
427       if (uses_interpol_at_centroid) {
428          iinput->second.set_uses_interpolate_at_centroid();
429       }
430    }
431    return true;
432 }
433 
434 bool
emit_export_pixel(nir_intrinsic_instr & intr)435 FragmentShader::emit_export_pixel(nir_intrinsic_instr& intr)
436 {
437    RegisterVec4::Swizzle swizzle;
438    auto semantics = nir_intrinsic_io_semantics(&intr);
439    unsigned driver_location = nir_intrinsic_base(&intr);
440    unsigned write_mask = nir_intrinsic_write_mask(&intr);
441 
442    switch (semantics.location) {
443    case FRAG_RESULT_DEPTH:
444       swizzle = {0, 7, 7, 7};
445       break;
446    case FRAG_RESULT_STENCIL:
447       swizzle = {7, 0, 7, 7};
448       break;
449    case FRAG_RESULT_SAMPLE_MASK:
450       swizzle = {7, 7, 0, 7};
451       break;
452    default:
453       for (int i = 0; i < 4; ++i) {
454          swizzle[i] = (1 << i) & write_mask ? i : 7;
455       }
456    }
457 
458    auto value = value_factory().src_vec4(intr.src[0], pin_group, swizzle);
459 
460    if (semantics.location == FRAG_RESULT_COLOR ||
461        (semantics.location >= FRAG_RESULT_DATA0 &&
462         semantics.location <= FRAG_RESULT_DATA7)) {
463 
464       ShaderOutput output(driver_location, write_mask);
465       output.set_frag_result(static_cast<gl_frag_result>(semantics.location));
466       add_output(output);
467 
468       unsigned color_outputs =
469          m_fs_write_all && chip_class() >= ISA_CC_R700 ? m_max_color_exports : 1;
470 
471       for (unsigned k = 0; k < color_outputs; ++k) {
472 
473          unsigned location = semantics.location - FRAG_RESULT_DATA0;
474 
475          if (semantics.location == FRAG_RESULT_COLOR)
476             location = driver_location + k;
477 
478          if (semantics.dual_source_blend_index)
479             location = semantics.dual_source_blend_index;
480 
481          sfn_log << SfnLog::io << "Pixel output at loc:" << location
482                  << "("<< semantics.location << ") of "<< m_max_color_exports<<"\n";
483 
484          if (location >= m_max_color_exports) {
485             sfn_log << SfnLog::io << "Pixel output loc:" << location
486                     << " dl:" << driver_location << " skipped  because  we have only "
487                     << m_max_color_exports << " CBs\n";
488             return true;
489          }
490 
491          m_last_pixel_export = new ExportInstr(ExportInstr::pixel, location, value);
492 
493          if (m_export_highest < location)
494             m_export_highest = location;
495 
496          m_num_color_exports++;
497 
498          /* Hack: force dual source output handling if one color output has a
499           * dual_source_blend_index > 0 */
500          if (semantics.dual_source_blend_index > 0)
501             m_dual_source_blend = true;
502 
503          if (m_num_color_exports > 1)
504             m_fs_write_all = false;
505          unsigned mask = (0xfu << (location * 4));
506 
507          m_color_export_written_mask |= (1 << location);
508 
509          /* If the i-th target format is set, all previous target formats must
510           * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
511           /*/
512          for (unsigned i = 0; i < location; ++i)
513             mask |= (0x1u << (i * 4));
514 
515          m_color_export_mask |= mask;
516 
517          emit_instruction(m_last_pixel_export);
518       }
519    } else if (semantics.location == FRAG_RESULT_DEPTH ||
520               semantics.location == FRAG_RESULT_STENCIL ||
521               semantics.location == FRAG_RESULT_SAMPLE_MASK) {
522       emit_instruction(new ExportInstr(ExportInstr::pixel, 61, value));
523 
524       ShaderOutput output(driver_location, write_mask);
525       output.set_frag_result(static_cast<gl_frag_result>(semantics.location));
526       add_output(output);
527 
528    } else {
529       return false;
530    }
531    return true;
532 }
533 
534 bool
emit_load_sample_pos(nir_intrinsic_instr * instr)535 FragmentShader::emit_load_sample_pos(nir_intrinsic_instr *instr)
536 {
537    auto dest = value_factory().dest_vec4(instr->def, pin_group);
538 
539    auto fetch = new LoadFromBuffer(dest,
540                                    {0, 1, 2, 3},
541                                    m_sample_id_reg,
542                                    0,
543                                    R600_BUFFER_INFO_CONST_BUFFER,
544                                    nullptr,
545                                    fmt_32_32_32_32_float);
546    fetch->set_fetch_flag(FetchInstr::srf_mode);
547    emit_instruction(fetch);
548    return true;
549 }
550 
551 void
do_finalize()552 FragmentShader::do_finalize()
553 {
554    /* On pre-evergreen not emtting something to all color exports that
555     * are enabled might lead to a hang.
556     * see: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9223
557     */
558    if (chip_class() < ISA_CC_EVERGREEN) {
559       unsigned i = 0;
560       unsigned mask = m_color_export_mask;
561 
562       while (i < m_max_color_exports && (mask & (1u << (4 * i)))) {
563          if (!(m_color_export_written_mask & (1u << i))) {
564             RegisterVec4 value(0, false, {7, 7, 7, 7});
565             m_last_pixel_export = new ExportInstr(ExportInstr::pixel, i, value);
566             emit_instruction(m_last_pixel_export);
567             m_num_color_exports++;
568             if (m_export_highest < i)
569                m_export_highest = i;
570          }
571          ++i;
572       }
573    }
574 
575    if (!m_last_pixel_export) {
576       RegisterVec4 value(0, false, {7, 7, 7, 7});
577       m_last_pixel_export = new ExportInstr(ExportInstr::pixel, 0, value);
578       emit_instruction(m_last_pixel_export);
579       m_num_color_exports++;
580       m_color_export_mask |= 0xf;
581    }
582    m_last_pixel_export->set_is_last_export(true);
583 }
584 
585 bool
read_prop(std::istream & is)586 FragmentShader::read_prop(std::istream& is)
587 {
588    string value;
589    is >> value;
590 
591    ASSERTED auto splitpos = value.find(':');
592    assert(splitpos != string::npos);
593 
594    std::istringstream ival(value);
595    string name;
596    string val;
597 
598    std::getline(ival, name, ':');
599 
600    if (name == "MAX_COLOR_EXPORTS")
601       ival >> m_max_color_exports;
602    else if (name == "COLOR_EXPORTS")
603       ival >> m_num_color_exports;
604    else if (name == "COLOR_EXPORT_MASK")
605       ival >> m_color_export_mask;
606    else if (name == "WRITE_ALL_COLORS")
607       ival >> m_fs_write_all;
608    else
609       return false;
610    return true;
611 }
612 
613 void
do_print_properties(std::ostream & os) const614 FragmentShader::do_print_properties(std::ostream& os) const
615 {
616    os << "PROP MAX_COLOR_EXPORTS:" << m_max_color_exports << "\n";
617    os << "PROP COLOR_EXPORTS:" << m_num_color_exports << "\n";
618    os << "PROP COLOR_EXPORT_MASK:" << m_color_export_mask << "\n";
619    os << "PROP WRITE_ALL_COLORS:" << m_fs_write_all << "\n";
620 }
621 
622 int
allocate_interpolators_or_inputs()623 FragmentShaderR600::allocate_interpolators_or_inputs()
624 {
625    int pos = 0;
626    auto& vf = value_factory();
627    for (auto& [index, inp] : inputs()) {
628       if (inp.need_lds_pos()) {
629 
630          RegisterVec4 input(vf.allocate_pinned_register(pos, 0),
631                             vf.allocate_pinned_register(pos, 1),
632                             vf.allocate_pinned_register(pos, 2),
633                             vf.allocate_pinned_register(pos, 3),
634                             pin_fully);
635          inp.set_gpr(pos++);
636 
637          sfn_log << SfnLog::io << "Reserve input register at pos " << index << " as "
638                  << input << " with register " << inp.gpr() << "\n";
639 
640          m_interpolated_inputs[index] = input;
641       }
642    }
643    return pos;
644 }
645 
646 bool
load_input_hw(nir_intrinsic_instr * intr)647 FragmentShaderR600::load_input_hw(nir_intrinsic_instr *intr)
648 {
649    auto& vf = value_factory();
650    AluInstr *ir = nullptr;
651    for (unsigned i = 0; i < intr->def.num_components; ++i) {
652       sfn_log << SfnLog::io << "Inject register "
653               << *m_interpolated_inputs[nir_intrinsic_base(intr)][i] << "\n";
654       unsigned index = nir_intrinsic_component(intr) + i;
655       assert(index < 4);
656       vf.inject_value(intr->def,
657                       i,
658                       m_interpolated_inputs[nir_intrinsic_base(intr)][index]);
659    }
660    if (ir)
661       ir->set_alu_flag(alu_last_instr);
662    return true;
663 }
664 
665 bool
process_stage_intrinsic_hw(nir_intrinsic_instr * intr)666 FragmentShaderR600::process_stage_intrinsic_hw(nir_intrinsic_instr *intr)
667 {
668    switch (intr->intrinsic) {
669    case nir_intrinsic_load_barycentric_centroid:
670    case nir_intrinsic_load_barycentric_pixel:
671    case nir_intrinsic_load_barycentric_sample:
672       return true;
673    default:
674       return false;
675    }
676 }
677 
678 bool
load_interpolated_input_hw(nir_intrinsic_instr * intr)679 FragmentShaderR600::load_interpolated_input_hw(nir_intrinsic_instr *intr)
680 {
681    return load_input_hw(intr);
682 }
683 
684 bool
load_input_hw(nir_intrinsic_instr * intr)685 FragmentShaderEG::load_input_hw(nir_intrinsic_instr *intr)
686 {
687    auto& vf = value_factory();
688    auto io = input(nir_intrinsic_base(intr));
689    auto comp = nir_intrinsic_component(intr);
690 
691    bool need_temp = comp > 0;
692    AluInstr *ir = nullptr;
693    for (unsigned i = 0; i < intr->def.num_components; ++i) {
694       if (need_temp) {
695          auto tmp = vf.temp_register(comp + i);
696          ir =
697             new AluInstr(op1_interp_load_p0,
698                          tmp,
699                          new InlineConstant(ALU_SRC_PARAM_BASE + io.lds_pos(), i + comp),
700                          AluInstr::last_write);
701          emit_instruction(ir);
702          emit_instruction(new AluInstr(
703             op1_mov, vf.dest(intr->def, i, pin_chan), tmp, AluInstr::last_write));
704       } else {
705 
706          ir = new AluInstr(op1_interp_load_p0,
707                            vf.dest(intr->def, i, pin_chan),
708                            new InlineConstant(ALU_SRC_PARAM_BASE + io.lds_pos(), i),
709                            AluInstr::write);
710          emit_instruction(ir);
711       }
712    }
713    ir->set_alu_flag(alu_last_instr);
714    return true;
715 }
716 
717 int
allocate_interpolators_or_inputs()718 FragmentShaderEG::allocate_interpolators_or_inputs()
719 {
720    for (unsigned i = 0; i < s_max_interpolators; ++i) {
721       if (interpolators_used(i)) {
722          sfn_log << SfnLog::io << "Interpolator " << i << " test enabled\n";
723          m_interpolator[i].enabled = true;
724       }
725    }
726 
727    int num_baryc = 0;
728    for (int i = 0; i < 6; ++i) {
729       if (m_interpolator[i].enabled) {
730          sfn_log << SfnLog::io << "Interpolator " << i
731                  << " is enabled with ij=" << num_baryc << " \n";
732          unsigned sel = num_baryc / 2;
733          unsigned chan = 2 * (num_baryc % 2);
734 
735          m_interpolator[i].i = value_factory().allocate_pinned_register(sel, chan + 1);
736          m_interpolator[i].j = value_factory().allocate_pinned_register(sel, chan);
737 
738          m_interpolator[i].ij_index = num_baryc++;
739       }
740    }
741    return (num_baryc + 1) >> 1;
742 }
743 
744 bool
process_stage_intrinsic_hw(nir_intrinsic_instr * intr)745 FragmentShaderEG::process_stage_intrinsic_hw(nir_intrinsic_instr *intr)
746 {
747    auto& vf = value_factory();
748    switch (intr->intrinsic) {
749    case nir_intrinsic_load_barycentric_centroid:
750    case nir_intrinsic_load_barycentric_pixel:
751    case nir_intrinsic_load_barycentric_sample: {
752       unsigned ij = barycentric_ij_index(intr);
753       vf.inject_value(intr->def, 0, m_interpolator[ij].i);
754       vf.inject_value(intr->def, 1, m_interpolator[ij].j);
755       return true;
756    }
757    case nir_intrinsic_load_barycentric_at_offset:
758       return load_barycentric_at_offset(intr);
759    case nir_intrinsic_load_barycentric_at_sample:
760       return load_barycentric_at_sample(intr);
761    default:
762       return false;
763    }
764 }
765 
766 bool
load_interpolated_input_hw(nir_intrinsic_instr * intr)767 FragmentShaderEG::load_interpolated_input_hw(nir_intrinsic_instr *intr)
768 {
769    auto& vf = value_factory();
770    ASSERTED auto param = nir_src_as_const_value(intr->src[1]);
771    assert(param && "Indirect PS inputs not (yet) supported");
772 
773    int dest_num_comp = intr->def.num_components;
774    int start_comp = nir_intrinsic_component(intr);
775    bool need_temp = start_comp > 0;
776 
777    auto dst = need_temp ? vf.temp_vec4(pin_chan) : vf.dest_vec4(intr->def, pin_chan);
778 
779    InterpolateParams params;
780 
781    params.i = vf.src(intr->src[0], 0);
782    params.j = vf.src(intr->src[0], 1);
783    params.base = input(nir_intrinsic_base(intr)).lds_pos();
784 
785    if (!load_interpolated(dst, params, dest_num_comp, start_comp))
786       return false;
787 
788    if (need_temp) {
789       AluInstr *ir = nullptr;
790       for (unsigned i = 0; i < intr->def.num_components; ++i) {
791          auto real_dst = vf.dest(intr->def, i, pin_chan);
792          ir = new AluInstr(op1_mov, real_dst, dst[i + start_comp], AluInstr::write);
793          emit_instruction(ir);
794       }
795       assert(ir);
796       ir->set_alu_flag(alu_last_instr);
797    }
798 
799    return true;
800 }
801 
802 bool
load_interpolated(RegisterVec4 & dest,const InterpolateParams & params,int num_dest_comp,int start_comp)803 FragmentShaderEG::load_interpolated(RegisterVec4& dest,
804                                     const InterpolateParams& params,
805                                     int num_dest_comp,
806                                     int start_comp)
807 {
808    sfn_log << SfnLog::io << "Using Interpolator (" << *params.j << ", " << *params.i
809            << ")"
810            << "\n";
811 
812    if (num_dest_comp == 1) {
813       switch (start_comp) {
814       case 0:
815          return load_interpolated_one_comp(dest, params, op2_interp_x);
816       case 1:
817          return load_interpolated_two_comp_for_one(dest, params, op2_interp_xy, 1);
818       case 2:
819          return load_interpolated_one_comp(dest, params, op2_interp_z);
820       case 3:
821          return load_interpolated_two_comp_for_one(dest, params, op2_interp_zw, 3);
822       default:
823          assert(0);
824       }
825    }
826 
827    if (num_dest_comp == 2) {
828       switch (start_comp) {
829       case 0:
830          return load_interpolated_two_comp(dest, params, op2_interp_xy, 0x3);
831       case 2:
832          return load_interpolated_two_comp(dest, params, op2_interp_zw, 0xc);
833       case 1:
834          return load_interpolated_one_comp(dest, params, op2_interp_z) &&
835                 load_interpolated_two_comp_for_one(dest, params, op2_interp_xy, 1);
836       default:
837          assert(0);
838       }
839    }
840 
841    if (num_dest_comp == 3 && start_comp == 0)
842       return load_interpolated_two_comp(dest, params, op2_interp_xy, 0x3) &&
843              load_interpolated_one_comp(dest, params, op2_interp_z);
844 
845    int full_write_mask = ((1 << num_dest_comp) - 1) << start_comp;
846 
847    bool success =
848       load_interpolated_two_comp(dest, params, op2_interp_zw, full_write_mask & 0xc);
849    success &=
850       load_interpolated_two_comp(dest, params, op2_interp_xy, full_write_mask & 0x3);
851    return success;
852 }
853 
854 bool
load_barycentric_at_sample(nir_intrinsic_instr * instr)855 FragmentShaderEG::load_barycentric_at_sample(nir_intrinsic_instr *instr)
856 {
857    auto& vf = value_factory();
858    RegisterVec4 slope = vf.temp_vec4(pin_group);
859    auto src = emit_load_to_register(vf.src(instr->src[0], 0));
860    auto fetch = new LoadFromBuffer(slope,
861                                    {0, 1, 2, 3},
862                                    src,
863                                    0,
864                                    R600_BUFFER_INFO_CONST_BUFFER,
865                                    nullptr,
866                                    fmt_32_32_32_32_float);
867 
868    fetch->set_fetch_flag(FetchInstr::srf_mode);
869    emit_instruction(fetch);
870 
871    auto grad = vf.temp_vec4(pin_group);
872 
873    auto interpolator = m_interpolator[barycentric_ij_index(instr)];
874    assert(interpolator.enabled);
875 
876    RegisterVec4 interp(interpolator.j, interpolator.i, nullptr, nullptr, pin_group);
877 
878    auto tex = new TexInstr(TexInstr::get_gradient_h, grad, {0, 1, 7, 7}, interp, 0, 0);
879    tex->set_tex_flag(TexInstr::grad_fine);
880    tex->set_tex_flag(TexInstr::x_unnormalized);
881    tex->set_tex_flag(TexInstr::y_unnormalized);
882    tex->set_tex_flag(TexInstr::z_unnormalized);
883    tex->set_tex_flag(TexInstr::w_unnormalized);
884    emit_instruction(tex);
885 
886    tex = new TexInstr(TexInstr::get_gradient_v, grad, {7, 7, 0, 1}, interp, 0, 0);
887    tex->set_tex_flag(TexInstr::x_unnormalized);
888    tex->set_tex_flag(TexInstr::y_unnormalized);
889    tex->set_tex_flag(TexInstr::z_unnormalized);
890    tex->set_tex_flag(TexInstr::w_unnormalized);
891    tex->set_tex_flag(TexInstr::grad_fine);
892    emit_instruction(tex);
893 
894    auto tmp0 = vf.temp_register();
895    auto tmp1 = vf.temp_register();
896 
897    emit_instruction(
898       new AluInstr(op3_muladd, tmp0, grad[0], slope[2], interpolator.j, {alu_write}));
899    emit_instruction(new AluInstr(
900       op3_muladd, tmp1, grad[1], slope[2], interpolator.i, {alu_write, alu_last_instr}));
901 
902    emit_instruction(new AluInstr(op3_muladd,
903                                  vf.dest(instr->def, 0, pin_none),
904                                  grad[3],
905                                  slope[3],
906                                  tmp1,
907                                  {alu_write}));
908    emit_instruction(new AluInstr(op3_muladd,
909                                  vf.dest(instr->def, 1, pin_none),
910                                  grad[2],
911                                  slope[3],
912                                  tmp0,
913                                  {alu_write, alu_last_instr}));
914 
915    return true;
916 }
917 
918 bool
load_barycentric_at_offset(nir_intrinsic_instr * instr)919 FragmentShaderEG::load_barycentric_at_offset(nir_intrinsic_instr *instr)
920 {
921    auto& vf = value_factory();
922    auto interpolator = m_interpolator[barycentric_ij_index(instr)];
923 
924    auto help = vf.temp_vec4(pin_group);
925    RegisterVec4 interp(interpolator.j, interpolator.i, nullptr, nullptr, pin_group);
926 
927    auto getgradh =
928       new TexInstr(TexInstr::get_gradient_h, help, {0, 1, 7, 7}, interp, 0, 0);
929    getgradh->set_tex_flag(TexInstr::x_unnormalized);
930    getgradh->set_tex_flag(TexInstr::y_unnormalized);
931    getgradh->set_tex_flag(TexInstr::z_unnormalized);
932    getgradh->set_tex_flag(TexInstr::w_unnormalized);
933    getgradh->set_tex_flag(TexInstr::grad_fine);
934    emit_instruction(getgradh);
935 
936    auto getgradv =
937       new TexInstr(TexInstr::get_gradient_v, help, {7, 7, 0, 1}, interp, 0, 0);
938    getgradv->set_tex_flag(TexInstr::x_unnormalized);
939    getgradv->set_tex_flag(TexInstr::y_unnormalized);
940    getgradv->set_tex_flag(TexInstr::z_unnormalized);
941    getgradv->set_tex_flag(TexInstr::w_unnormalized);
942    getgradv->set_tex_flag(TexInstr::grad_fine);
943    emit_instruction(getgradv);
944 
945    auto ofs_x = vf.src(instr->src[0], 0);
946    auto ofs_y = vf.src(instr->src[0], 1);
947    auto tmp0 = vf.temp_register();
948    auto tmp1 = vf.temp_register();
949    emit_instruction(
950       new AluInstr(op3_muladd, tmp0, help[0], ofs_x, interpolator.j, {alu_write}));
951    emit_instruction(new AluInstr(
952       op3_muladd, tmp1, help[1], ofs_x, interpolator.i, {alu_write, alu_last_instr}));
953    emit_instruction(new AluInstr(
954       op3_muladd, vf.dest(instr->def, 0, pin_none), help[3], ofs_y, tmp1, {alu_write}));
955    emit_instruction(new AluInstr(op3_muladd,
956                                  vf.dest(instr->def, 1, pin_none),
957                                  help[2],
958                                  ofs_y,
959                                  tmp0,
960                                  {alu_write, alu_last_instr}));
961 
962    return true;
963 }
964 
965 bool
load_interpolated_one_comp(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op)966 FragmentShaderEG::load_interpolated_one_comp(RegisterVec4& dest,
967                                              const InterpolateParams& params,
968                                              EAluOp op)
969 {
970    auto group = new AluGroup();
971    bool success = true;
972 
973    AluInstr *ir = nullptr;
974    for (unsigned i = 0; i < 2 && success; ++i) {
975       int chan = i;
976       if (op == op2_interp_z)
977          chan += 2;
978 
979       ir = new AluInstr(op,
980                         dest[chan],
981                         i & 1 ? params.j : params.i,
982                         new InlineConstant(ALU_SRC_PARAM_BASE + params.base, chan),
983                         i == 0 ? AluInstr::write : AluInstr::last);
984 
985       ir->set_bank_swizzle(alu_vec_210);
986       success = group->add_instruction(ir);
987    }
988    ir->set_alu_flag(alu_last_instr);
989    if (success)
990       emit_instruction(group);
991    return success;
992 }
993 
994 bool
load_interpolated_two_comp(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op,int writemask)995 FragmentShaderEG::load_interpolated_two_comp(RegisterVec4& dest,
996                                              const InterpolateParams& params,
997                                              EAluOp op,
998                                              int writemask)
999 {
1000    auto group = new AluGroup();
1001    bool success = true;
1002 
1003    AluInstr *ir = nullptr;
1004    assert(params.j);
1005    assert(params.i);
1006    for (unsigned i = 0; i < 4; ++i) {
1007       ir = new AluInstr(op,
1008                         dest[i],
1009                         i & 1 ? params.j : params.i,
1010                         new InlineConstant(ALU_SRC_PARAM_BASE + params.base, i),
1011                         (writemask & (1 << i)) ? AluInstr::write : AluInstr::empty);
1012       ir->set_bank_swizzle(alu_vec_210);
1013       success = group->add_instruction(ir);
1014    }
1015    ir->set_alu_flag(alu_last_instr);
1016    if (success)
1017       emit_instruction(group);
1018    return success;
1019 }
1020 
1021 bool
load_interpolated_two_comp_for_one(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op,int comp)1022 FragmentShaderEG::load_interpolated_two_comp_for_one(RegisterVec4& dest,
1023                                                      const InterpolateParams& params,
1024                                                      EAluOp op,
1025                                                      int comp)
1026 {
1027    auto group = new AluGroup();
1028    bool success = true;
1029    AluInstr *ir = nullptr;
1030 
1031    for (int i = 0; i < 4; ++i) {
1032       ir = new AluInstr(op,
1033                         dest[i],
1034                         i & 1 ? params.j : params.i,
1035                         new InlineConstant(ALU_SRC_PARAM_BASE + params.base, i),
1036                         i == comp ? AluInstr::write : AluInstr::empty);
1037       ir->set_bank_swizzle(alu_vec_210);
1038       success = group->add_instruction(ir);
1039    }
1040    ir->set_alu_flag(alu_last_instr);
1041    if (success)
1042       emit_instruction(group);
1043 
1044    return success;
1045 }
1046 
Interpolator()1047 FragmentShaderEG::Interpolator::Interpolator():
1048     enabled(false)
1049 {
1050 }
1051 
1052 } // namespace r600
1053