1 /* -*- mesa-c++ -*-
2 * Copyright 2022 Collabora LTD
3 * Author: Gert Wollny <[email protected]>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "sfn_assembler.h"
8
9 #include "../eg_sq.h"
10 #include "../r600_asm.h"
11
12 #include "sfn_callstack.h"
13 #include "sfn_conditionaljumptracker.h"
14 #include "sfn_debug.h"
15 #include "sfn_instr_alugroup.h"
16 #include "sfn_instr_controlflow.h"
17 #include "sfn_instr_export.h"
18 #include "sfn_instr_fetch.h"
19 #include "sfn_instr_mem.h"
20 #include "sfn_instr_tex.h"
21
22 namespace r600 {
Assembler(r600_shader * sh,const r600_shader_key & key)23 Assembler::Assembler(r600_shader *sh, const r600_shader_key& key):
24 m_sh(sh),
25 m_key(key)
26 {
27 }
28
/* Lookup table from ESDOp to an integer opcode; only declared here.
 * visit(const GDSInstr&) uses it to fill r600_bytecode_gds::op. */
extern const std::map<ESDOp, int> ds_opcode_map;
30
31 class AssamblerVisitor : public ConstInstrVisitor {
32 public:
33 AssamblerVisitor(r600_shader *sh, const r600_shader_key& key, bool legacy_math_rules);
34
35 void visit(const AluInstr& instr) override;
36 void visit(const AluGroup& instr) override;
37 void visit(const TexInstr& instr) override;
38 void visit(const ExportInstr& instr) override;
39 void visit(const FetchInstr& instr) override;
40 void visit(const Block& instr) override;
41 void visit(const IfInstr& instr) override;
42 void visit(const ControlFlowInstr& instr) override;
43 void visit(const ScratchIOInstr& instr) override;
44 void visit(const StreamOutInstr& instr) override;
45 void visit(const MemRingOutInstr& instr) override;
46 void visit(const EmitVertexInstr& instr) override;
47 void visit(const GDSInstr& instr) override;
48 void visit(const WriteTFInstr& instr) override;
49 void visit(const LDSAtomicInstr& instr) override;
50 void visit(const LDSReadInstr& instr) override;
51 void visit(const RatInstr& instr) override;
52
53 void finalize();
54
55 const uint32_t sf_vtx = 1;
56 const uint32_t sf_tex = 2;
57 const uint32_t sf_alu = 4;
58 const uint32_t sf_addr_register = 8;
59 const uint32_t sf_all = 0xf;
60
61 void clear_states(const uint32_t& states);
62 bool copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write);
63 PVirtualValue copy_src(r600_bytecode_alu_src& src, const VirtualValue& s);
64
65 EBufferIndexMode emit_index_reg(const VirtualValue& addr, unsigned idx);
66
67 void emit_endif();
68 void emit_else();
69 void emit_loop_begin(bool vpm);
70 void emit_loop_end();
71 void emit_loop_break();
72 void emit_loop_cont();
73
74 void emit_alu_op(const AluInstr& ai);
75 void emit_lds_op(const AluInstr& lds);
76
77 auto translate_for_mathrules(EAluOp op) -> EAluOp;
78
79 void emit_wait_ack();
80
81 /* Start initialized in constructor */
82 const r600_shader_key& m_key;
83 r600_shader *m_shader;
84 r600_bytecode *m_bc;
85
86 ConditionalJumpTracker m_jump_tracker;
87 CallStack m_callstack;
88 bool ps_alpha_to_one;
89 /* End initialized in constructor */
90
91 std::set<uint32_t> m_nliterals_in_group;
92 std::set<int> vtx_fetch_results;
93 std::set<int> tex_fetch_results;
94
95 const VirtualValue *m_last_addr{nullptr};
96
97 unsigned m_max_color_exports{0};
98 int m_loop_nesting{0};
99
100 bool m_ack_suggested{false};
101 bool m_has_param_output{false};
102 bool m_has_pos_output{false};
103 bool m_last_op_was_barrier{false};
104 bool m_result{true};
105 bool m_legacy_math_rules{false};
106 };
107
108 bool
lower(Shader * shader)109 Assembler::lower(Shader *shader)
110 {
111 AssamblerVisitor ass(m_sh, m_key, shader->has_flag(Shader::sh_legacy_math_rules));
112
113 auto& blocks = shader->func();
114 for (auto b : blocks) {
115 b->accept(ass);
116 if (!ass.m_result)
117 return false;
118 }
119
120 ass.finalize();
121
122 return ass.m_result;
123 }
124
AssamblerVisitor(r600_shader * sh,const r600_shader_key & key,bool legacy_math_rules)125 AssamblerVisitor::AssamblerVisitor(r600_shader *sh, const r600_shader_key& key,
126 bool legacy_math_rules):
127 m_key(key),
128 m_shader(sh),
129
130 m_bc(&sh->bc),
131 m_callstack(sh->bc),
132 ps_alpha_to_one(key.ps.alpha_to_one),
133 m_legacy_math_rules(legacy_math_rules)
134 {
135 if (m_shader->processor_type == PIPE_SHADER_FRAGMENT)
136 m_max_color_exports = MAX2(m_key.ps.nr_cbufs, 1);
137
138 if (m_shader->processor_type == PIPE_SHADER_VERTEX && m_shader->ninput > 0)
139 r600_bytecode_add_cfinst(m_bc, CF_OP_CALL_FS);
140 }
141
142 void
finalize()143 AssamblerVisitor::finalize()
144 {
145 const struct cf_op_info *last = nullptr;
146
147 if (m_bc->cf_last)
148 last = r600_isa_cf(m_bc->cf_last->op);
149
150 /* alu clause instructions don't have EOP bit, so add NOP */
151 if (m_shader->bc.gfx_level < CAYMAN &&
152 (!last || last->flags & CF_ALU || m_bc->cf_last->op == CF_OP_LOOP_END ||
153 m_bc->cf_last->op == CF_OP_POP))
154 r600_bytecode_add_cfinst(m_bc, CF_OP_NOP);
155
156 /* A fetch shader only can't be EOP (results in hang), but we can replace
157 * it by a NOP */
158 else if (last && m_bc->cf_last->op == CF_OP_CALL_FS)
159 m_bc->cf_last->op = CF_OP_NOP;
160
161 if (m_shader->bc.gfx_level != CAYMAN)
162 m_bc->cf_last->end_of_program = 1;
163 else
164 cm_bytecode_add_cf_end(m_bc);
165 }
166
/* Lookup table from EAluOp to an integer opcode; only declared here.
 * emit_alu_op() and emit_index_reg() use it to fill r600_bytecode_alu::op. */
extern const std::map<EAluOp, int> opcode_map;
168
169 void
visit(const AluInstr & ai)170 AssamblerVisitor::visit(const AluInstr& ai)
171 {
172 assert(vtx_fetch_results.empty());
173 assert(tex_fetch_results.empty());
174
175 if (unlikely(ai.has_alu_flag(alu_is_lds)))
176 emit_lds_op(ai);
177 else
178 emit_alu_op(ai);
179 }
180
181 void
emit_lds_op(const AluInstr & lds)182 AssamblerVisitor::emit_lds_op(const AluInstr& lds)
183 {
184 struct r600_bytecode_alu alu;
185 memset(&alu, 0, sizeof(alu));
186
187 alu.is_lds_idx_op = true;
188 alu.op = lds.lds_opcode();
189
190 bool has_lds_fetch = false;
191 switch (alu.op) {
192 case LDS_WRITE:
193 alu.op = LDS_OP2_LDS_WRITE;
194 break;
195 case LDS_WRITE_REL:
196 alu.op = LDS_OP3_LDS_WRITE_REL;
197 alu.lds_idx = 1;
198 break;
199 case DS_OP_READ_RET:
200 alu.op = LDS_OP1_LDS_READ_RET;
201 FALLTHROUGH;
202 case LDS_ADD_RET:
203 case LDS_AND_RET:
204 case LDS_OR_RET:
205 case LDS_MAX_INT_RET:
206 case LDS_MAX_UINT_RET:
207 case LDS_MIN_INT_RET:
208 case LDS_MIN_UINT_RET:
209 case LDS_XOR_RET:
210 case LDS_XCHG_RET:
211 case LDS_CMP_XCHG_RET:
212 has_lds_fetch = true;
213 break;
214 case LDS_ADD:
215 case LDS_AND:
216 case LDS_OR:
217 case LDS_MAX_INT:
218 case LDS_MAX_UINT:
219 case LDS_MIN_INT:
220 case LDS_MIN_UINT:
221 case LDS_XOR:
222 break;
223 default:
224 std::cerr << "\n R600: error op: " << lds << "\n";
225 unreachable("Unhandled LDS op");
226 }
227
228 copy_src(alu.src[0], lds.src(0));
229
230 if (lds.n_sources() > 1)
231 copy_src(alu.src[1], lds.src(1));
232 else
233 alu.src[1].sel = V_SQ_ALU_SRC_0;
234
235 if (lds.n_sources() > 2)
236 copy_src(alu.src[2], lds.src(2));
237 else
238 alu.src[2].sel = V_SQ_ALU_SRC_0;
239
240 alu.last = lds.has_alu_flag(alu_last_instr);
241
242 int r = r600_bytecode_add_alu(m_bc, &alu);
243 if (has_lds_fetch)
244 m_bc->cf_last->nlds_read++;
245
246 if (r)
247 m_result = false;
248 }
249
translate_for_mathrules(EAluOp op)250 auto AssamblerVisitor::translate_for_mathrules(EAluOp op) -> EAluOp
251 {
252 switch (op) {
253 case op2_dot_ieee: return op2_dot;
254 case op2_dot4_ieee: return op2_dot4;
255 case op2_mul_ieee: return op2_mul;
256 case op3_muladd_ieee : return op2_mul_ieee;
257 default:
258 return op;
259 }
260 }
261
262 void
emit_alu_op(const AluInstr & ai)263 AssamblerVisitor::emit_alu_op(const AluInstr& ai)
264 {
265 sfn_log << SfnLog::assembly << "Emit ALU op " << ai << "\n";
266
267 struct r600_bytecode_alu alu;
268 memset(&alu, 0, sizeof(alu));
269
270 auto opcode = ai.opcode();
271
272 if (unlikely(ai.opcode() == op1_mova_int &&
273 (m_bc->gfx_level < CAYMAN || alu.dst.sel == 0))) {
274 m_last_addr = ai.psrc(0);
275 m_bc->ar_chan = m_last_addr->chan();
276 m_bc->ar_reg = m_last_addr->sel();
277 }
278
279 if (m_legacy_math_rules)
280 opcode = translate_for_mathrules(opcode);
281
282 auto hw_opcode = opcode_map.find(opcode);
283
284 if (hw_opcode == opcode_map.end()) {
285 std::cerr << "Opcode not handled for " << ai << "\n";
286 m_result = false;
287 return;
288 }
289
290 // skip multiple barriers
291 if (m_last_op_was_barrier && opcode == op0_group_barrier)
292 return;
293
294 m_last_op_was_barrier = opcode == op0_group_barrier;
295
296 alu.op = hw_opcode->second;
297
298 auto dst = ai.dest();
299 if (dst) {
300 if (ai.opcode() != op1_mova_int) {
301 if (!copy_dst(alu.dst, *dst, ai.has_alu_flag(alu_write))) {
302 m_result = false;
303 return;
304 }
305
306 alu.dst.write = ai.has_alu_flag(alu_write);
307 alu.dst.clamp = ai.has_alu_flag(alu_dst_clamp);
308 alu.dst.rel = dst->addr() ? 1 : 0;
309 } else if (m_bc->gfx_level == CAYMAN && ai.dest()->sel() > 0) {
310 alu.dst.sel = ai.dest()->sel() + 1;
311 }
312 }
313
314 alu.is_op3 = ai.n_sources() == 3;
315
316 EBufferIndexMode kcache_index_mode = bim_none;
317 PVirtualValue buffer_offset = nullptr;
318
319 for (unsigned i = 0; i < ai.n_sources(); ++i) {
320 buffer_offset = copy_src(alu.src[i], ai.src(i));
321 alu.src[i].neg = ai.has_source_mod(i, AluInstr::mod_neg);
322 if (!alu.is_op3)
323 alu.src[i].abs = ai.has_source_mod(i, AluInstr::mod_abs);
324
325 if (buffer_offset && kcache_index_mode == bim_none) {
326 auto idx_reg = buffer_offset->as_register();
327 if (idx_reg && idx_reg->has_flag(Register::addr_or_idx)) {
328 switch (idx_reg->sel()) {
329 case 1: kcache_index_mode = bim_zero; break;
330 case 2: kcache_index_mode = bim_one; break;
331 default:
332 unreachable("Unsupported index mode");
333 }
334 } else {
335 kcache_index_mode = bim_zero;
336 }
337 alu.src[i].kc_rel = kcache_index_mode;
338 }
339
340 if (ai.has_lds_queue_read()) {
341 assert(m_bc->cf_last->nlds_read > 0);
342 m_bc->cf_last->nlds_read--;
343 }
344 }
345
346 if (ai.bank_swizzle() != alu_vec_unknown)
347 alu.bank_swizzle_force = ai.bank_swizzle();
348
349 alu.last = ai.has_alu_flag(alu_last_instr);
350 alu.execute_mask = ai.has_alu_flag(alu_update_exec);
351
352 /* If the destination register is equal to the last loaded address register
353 * then clear the latter one, because the values will no longer be
354 * identical */
355 if (m_last_addr)
356 sfn_log << SfnLog::assembly << " Current address register is " << *m_last_addr
357 << "\n";
358
359 if (dst)
360 sfn_log << SfnLog::assembly << " Current dst register is " << *dst << "\n";
361
362 auto cf_op = ai.cf_type();
363
364 unsigned type = 0;
365 switch (cf_op) {
366 case cf_alu:
367 type = CF_OP_ALU;
368 break;
369 case cf_alu_push_before:
370 type = CF_OP_ALU_PUSH_BEFORE;
371 break;
372 case cf_alu_pop_after:
373 type = CF_OP_ALU_POP_AFTER;
374 break;
375 case cf_alu_pop2_after:
376 type = CF_OP_ALU_POP2_AFTER;
377 break;
378 case cf_alu_break:
379 type = CF_OP_ALU_BREAK;
380 break;
381 case cf_alu_else_after:
382 type = CF_OP_ALU_ELSE_AFTER;
383 break;
384 case cf_alu_continue:
385 type = CF_OP_ALU_CONTINUE;
386 break;
387 case cf_alu_extended:
388 type = CF_OP_ALU_EXT;
389 break;
390 default:
391 assert(0 && "cf_alu_undefined should have been replaced");
392 }
393
394 if (alu.last)
395 m_nliterals_in_group.clear();
396
397 m_result = !r600_bytecode_add_alu_type(m_bc, &alu, type);
398
399 if (unlikely(ai.opcode() == op1_mova_int)) {
400 if (m_bc->gfx_level < CAYMAN || alu.dst.sel == 0) {
401 m_bc->ar_loaded = 1;
402 } else if (m_bc->gfx_level == CAYMAN) {
403 int idx = alu.dst.sel - 2;
404 m_bc->index_loaded[idx] = 1;
405 m_bc->index_reg[idx] = -1;
406 }
407 }
408
409 if (alu.dst.sel >= g_clause_local_start && alu.dst.sel < g_clause_local_end) {
410 int clidx = 4 * (alu.dst.sel - g_clause_local_start) + alu.dst.chan;
411 m_bc->cf_last->clause_local_written |= 1 << clidx;
412 }
413
414 if (ai.opcode() == op1_set_cf_idx0) {
415 m_bc->index_loaded[0] = 1;
416 m_bc->index_reg[0] = -1;
417 }
418
419 if (ai.opcode() == op1_set_cf_idx1) {
420 m_bc->index_loaded[1] = 1;
421 m_bc->index_reg[1] = -1;
422 }
423 }
424
425 void
visit(const AluGroup & group)426 AssamblerVisitor::visit(const AluGroup& group)
427 {
428 clear_states(sf_vtx | sf_tex);
429
430 if (group.slots() == 0)
431 return;
432
433 static const unsigned slot_limit = 256;
434
435 if (m_bc->cf_last && !m_bc->force_add_cf) {
436 if (group.has_lds_group_start()) {
437 if (m_bc->cf_last->ndw + 2 * (*group.begin())->required_slots() > slot_limit) {
438 assert(m_bc->cf_last->nlds_read == 0);
439 assert(0 && "Not allowed to start new alu group here");
440 m_bc->force_add_cf = 1;
441 m_last_addr = nullptr;
442 }
443 } else {
444 if (m_bc->cf_last->ndw + 2 * group.slots() > slot_limit) {
445 std::cerr << "m_bc->cf_last->ndw = " << m_bc->cf_last->ndw
446 << " group.slots() = " << group.slots()
447 << " -> " << m_bc->cf_last->ndw + 2 * group.slots()
448 << "> slot_limit = " << slot_limit << "\n";
449 assert(m_bc->cf_last->nlds_read == 0);
450 assert(0 && "Not allowed to start new alu group here");
451 m_bc->force_add_cf = 1;
452 m_last_addr = nullptr;
453 } else {
454 auto instr = *group.begin();
455 if (instr && !instr->has_alu_flag(alu_is_lds) &&
456 instr->opcode() == op0_group_barrier && m_bc->cf_last->ndw + 14 > slot_limit) {
457 assert(0 && "Not allowed to start new alu group here");
458 assert(m_bc->cf_last->nlds_read == 0);
459 m_bc->force_add_cf = 1;
460 m_last_addr = nullptr;
461 }
462 }
463 }
464 }
465
466 auto [addr, is_index] = group.addr();
467
468 if (addr) {
469 if (!addr->has_flag(Register::addr_or_idx)) {
470 if (is_index) {
471 emit_index_reg(*addr, 0);
472 } else {
473 auto reg = addr->as_register();
474 assert(reg);
475 if (!m_last_addr || !m_bc->ar_loaded || !m_last_addr->equal_to(*reg)) {
476 m_last_addr = reg;
477 m_bc->ar_reg = reg->sel();
478 m_bc->ar_chan = reg->chan();
479 m_bc->ar_loaded = 0;
480 r600_load_ar(m_bc, group.addr_for_src());
481 }
482 }
483 }
484 }
485
486 for (auto& i : group) {
487 if (i)
488 i->accept(*this);
489 }
490 }
491
492 void
visit(const TexInstr & tex_instr)493 AssamblerVisitor::visit(const TexInstr& tex_instr)
494 {
495 clear_states(sf_vtx | sf_alu);
496
497 if (tex_fetch_results.find(tex_instr.src().sel()) != tex_fetch_results.end()) {
498 m_bc->force_add_cf = 1;
499 tex_fetch_results.clear();
500 }
501
502 r600_bytecode_tex tex;
503 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
504 tex.op = tex_instr.opcode();
505 tex.sampler_id = tex_instr.sampler_id();
506 tex.resource_id = tex_instr.resource_id();
507 tex.src_gpr = tex_instr.src().sel();
508 tex.dst_gpr = tex_instr.dst().sel();
509 tex.dst_sel_x = tex_instr.dest_swizzle(0);
510 tex.dst_sel_y = tex_instr.dest_swizzle(1);
511 tex.dst_sel_z = tex_instr.dest_swizzle(2);
512 tex.dst_sel_w = tex_instr.dest_swizzle(3);
513 tex.src_sel_x = tex_instr.src()[0]->chan();
514 tex.src_sel_y = tex_instr.src()[1]->chan();
515 tex.src_sel_z = tex_instr.src()[2]->chan();
516 tex.src_sel_w = tex_instr.src()[3]->chan();
517 tex.coord_type_x = !tex_instr.has_tex_flag(TexInstr::x_unnormalized);
518 tex.coord_type_y = !tex_instr.has_tex_flag(TexInstr::y_unnormalized);
519 tex.coord_type_z = !tex_instr.has_tex_flag(TexInstr::z_unnormalized);
520 tex.coord_type_w = !tex_instr.has_tex_flag(TexInstr::w_unnormalized);
521 tex.offset_x = tex_instr.get_offset(0);
522 tex.offset_y = tex_instr.get_offset(1);
523 tex.offset_z = tex_instr.get_offset(2);
524 tex.resource_index_mode = tex_instr.resource_index_mode();
525 tex.sampler_index_mode = tex_instr.sampler_index_mode();
526
527 if (tex.dst_sel_x < 4 && tex.dst_sel_y < 4 && tex.dst_sel_z < 4 && tex.dst_sel_w < 4)
528 tex_fetch_results.insert(tex.dst_gpr);
529
530 if (tex_instr.opcode() == TexInstr::get_gradient_h ||
531 tex_instr.opcode() == TexInstr::get_gradient_v)
532 tex.inst_mod = tex_instr.has_tex_flag(TexInstr::grad_fine) ? 1 : 0;
533 else
534 tex.inst_mod = tex_instr.inst_mode();
535 if (r600_bytecode_add_tex(m_bc, &tex)) {
536 R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
537 m_result = false;
538 }
539 }
540
541 void
visit(const ExportInstr & exi)542 AssamblerVisitor::visit(const ExportInstr& exi)
543 {
544 const auto& value = exi.value();
545
546 r600_bytecode_output output;
547 memset(&output, 0, sizeof(output));
548
549 output.gpr = value.sel();
550 output.elem_size = 3;
551 output.swizzle_x = value[0]->chan();
552 output.swizzle_y = value[1]->chan();
553 output.swizzle_z = value[2]->chan();
554 output.burst_count = 1;
555 output.op = exi.is_last_export() ? CF_OP_EXPORT_DONE : CF_OP_EXPORT;
556 output.type = exi.export_type();
557
558 clear_states(sf_all);
559 switch (exi.export_type()) {
560 case ExportInstr::pixel:
561 output.swizzle_w = ps_alpha_to_one ? 5 : exi.value()[3]->chan();
562 output.array_base = exi.location();
563 break;
564 case ExportInstr::pos:
565 output.swizzle_w = exi.value()[3]->chan();
566 output.array_base = 60 + exi.location();
567 break;
568 case ExportInstr::param:
569 output.swizzle_w = exi.value()[3]->chan();
570 output.array_base = exi.location();
571 break;
572 default:
573 R600_ASM_ERR("shader_from_nir: export %d type not yet supported\n",
574 exi.export_type());
575 m_result = false;
576 }
577
578 /* If all register elements pinned to fixed values
579 * we can override the gpr (the register allocator doesn't see
580 * this because it doesn't take these channels into account. */
581 if (output.swizzle_x > 3 && output.swizzle_y > 3 && output.swizzle_z > 3 &&
582 output.swizzle_w > 3)
583 output.gpr = 0;
584
585 int r = 0;
586 if ((r = r600_bytecode_add_output(m_bc, &output))) {
587 R600_ASM_ERR("Error adding export at location %d : err: %d\n", exi.location(), r);
588 m_result = false;
589 }
590 }
591
592 void
visit(const ScratchIOInstr & instr)593 AssamblerVisitor::visit(const ScratchIOInstr& instr)
594 {
595 clear_states(sf_all);
596
597 struct r600_bytecode_output cf;
598
599 memset(&cf, 0, sizeof(struct r600_bytecode_output));
600
601 cf.op = CF_OP_MEM_SCRATCH;
602 cf.elem_size = 3;
603 cf.gpr = instr.value().sel();
604 cf.mark = !instr.is_read();
605 cf.comp_mask = instr.is_read() ? 0xf : instr.write_mask();
606 cf.swizzle_x = 0;
607 cf.swizzle_y = 1;
608 cf.swizzle_z = 2;
609 cf.swizzle_w = 3;
610 cf.burst_count = 1;
611
612 assert(!instr.is_read() || m_bc->gfx_level < R700);
613
614 if (instr.address()) {
615 cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 3 : 1;
616 cf.index_gpr = instr.address()->sel();
617
618 /* The docu seems to be wrong here: In indirect addressing the
619 * address_base seems to be the array_size */
620 cf.array_size = instr.array_size();
621 } else {
622 cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 2 : 0;
623 cf.array_base = instr.location();
624 }
625
626 if (r600_bytecode_add_output(m_bc, &cf)) {
627 R600_ASM_ERR("shader_from_nir: Error creating SCRATCH_WR assembly instruction\n");
628 m_result = false;
629 }
630 }
631
632 void
visit(const StreamOutInstr & instr)633 AssamblerVisitor::visit(const StreamOutInstr& instr)
634 {
635 struct r600_bytecode_output output;
636 memset(&output, 0, sizeof(struct r600_bytecode_output));
637
638 output.gpr = instr.value().sel();
639 output.elem_size = instr.element_size();
640 output.array_base = instr.array_base();
641 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
642 output.burst_count = instr.burst_count();
643 output.array_size = instr.array_size();
644 output.comp_mask = instr.comp_mask();
645 output.op = instr.op(m_shader->bc.gfx_level);
646
647 if (r600_bytecode_add_output(m_bc, &output)) {
648 R600_ASM_ERR("shader_from_nir: Error creating stream output instruction\n");
649 m_result = false;
650 }
651 }
652
653 void
visit(const MemRingOutInstr & instr)654 AssamblerVisitor::visit(const MemRingOutInstr& instr)
655 {
656 struct r600_bytecode_output output;
657 memset(&output, 0, sizeof(struct r600_bytecode_output));
658
659 output.gpr = instr.value().sel();
660 output.type = instr.type();
661 output.elem_size = 3;
662 output.comp_mask = 0xf;
663 output.burst_count = 1;
664 output.op = instr.op();
665 if (instr.type() == MemRingOutInstr::mem_write_ind ||
666 instr.type() == MemRingOutInstr::mem_write_ind_ack) {
667 output.index_gpr = instr.index_reg();
668 output.array_size = 0xfff;
669 }
670 output.array_base = instr.array_base();
671
672 if (r600_bytecode_add_output(m_bc, &output)) {
673 R600_ASM_ERR("shader_from_nir: Error creating mem ring write instruction\n");
674 m_result = false;
675 }
676 }
677
678 void
visit(const EmitVertexInstr & instr)679 AssamblerVisitor::visit(const EmitVertexInstr& instr)
680 {
681 int r = r600_bytecode_add_cfinst(m_bc, instr.op());
682 if (!r)
683 m_bc->cf_last->count = instr.stream();
684 else
685 m_result = false;
686 assert(m_bc->cf_last->count < 4);
687 }
688
689 void
visit(const FetchInstr & fetch_instr)690 AssamblerVisitor::visit(const FetchInstr& fetch_instr)
691 {
692 bool use_tc =
693 fetch_instr.has_fetch_flag(FetchInstr::use_tc) || (m_bc->gfx_level == CAYMAN);
694
695 auto clear_flags = use_tc ? sf_vtx : sf_tex;
696
697 clear_states(clear_flags | sf_alu);
698
699 if (fetch_instr.has_fetch_flag(FetchInstr::wait_ack))
700 emit_wait_ack();
701
702
703 if (!use_tc &&
704 vtx_fetch_results.find(fetch_instr.src().sel()) != vtx_fetch_results.end()) {
705 m_bc->force_add_cf = 1;
706 vtx_fetch_results.clear();
707 }
708
709 if (fetch_instr.has_fetch_flag(FetchInstr::use_tc) &&
710 tex_fetch_results.find(fetch_instr.src().sel()) != tex_fetch_results.end()) {
711 m_bc->force_add_cf = 1;
712 tex_fetch_results.clear();
713 }
714
715 if (use_tc)
716 tex_fetch_results.insert(fetch_instr.dst().sel());
717 else
718 vtx_fetch_results.insert(fetch_instr.dst().sel());
719
720 struct r600_bytecode_vtx vtx;
721 memset(&vtx, 0, sizeof(vtx));
722 vtx.op = fetch_instr.opcode();
723 vtx.buffer_id = fetch_instr.resource_id();
724 vtx.fetch_type = fetch_instr.fetch_type();
725 vtx.src_gpr = fetch_instr.src().sel();
726 vtx.src_sel_x = fetch_instr.src().chan();
727 vtx.mega_fetch_count = fetch_instr.mega_fetch_count();
728 vtx.dst_gpr = fetch_instr.dst().sel();
729 vtx.dst_sel_x = fetch_instr.dest_swizzle(0); /* SEL_X */
730 vtx.dst_sel_y = fetch_instr.dest_swizzle(1); /* SEL_Y */
731 vtx.dst_sel_z = fetch_instr.dest_swizzle(2); /* SEL_Z */
732 vtx.dst_sel_w = fetch_instr.dest_swizzle(3); /* SEL_W */
733 vtx.use_const_fields = fetch_instr.has_fetch_flag(FetchInstr::use_const_field);
734 vtx.data_format = fetch_instr.data_format();
735 vtx.num_format_all = fetch_instr.num_format(); /* NUM_FORMAT_SCALED */
736 vtx.format_comp_all = fetch_instr.has_fetch_flag(FetchInstr::format_comp_signed);
737 vtx.endian = fetch_instr.endian_swap();
738 vtx.buffer_index_mode = fetch_instr.resource_index_mode();
739 vtx.offset = fetch_instr.src_offset();
740 vtx.indexed = fetch_instr.has_fetch_flag(FetchInstr::indexed);
741 vtx.uncached = fetch_instr.has_fetch_flag(FetchInstr::uncached);
742 vtx.elem_size = fetch_instr.elm_size();
743 vtx.array_base = fetch_instr.array_base();
744 vtx.array_size = fetch_instr.array_size();
745 vtx.srf_mode_all = fetch_instr.has_fetch_flag(FetchInstr::srf_mode);
746
747 if (fetch_instr.has_fetch_flag(FetchInstr::use_tc)) {
748 if ((r600_bytecode_add_vtx_tc(m_bc, &vtx))) {
749 R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
750 m_result = false;
751 }
752
753 } else {
754 if ((r600_bytecode_add_vtx(m_bc, &vtx))) {
755 R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
756 m_result = false;
757 }
758 }
759
760 m_bc->cf_last->vpm =
761 (m_bc->type == PIPE_SHADER_FRAGMENT) && fetch_instr.has_fetch_flag(FetchInstr::vpm);
762 m_bc->cf_last->barrier = 1;
763 }
764
765 void
visit(const WriteTFInstr & instr)766 AssamblerVisitor::visit(const WriteTFInstr& instr)
767 {
768 struct r600_bytecode_gds gds;
769
770 auto& value = instr.value();
771
772 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
773 gds.src_gpr = value.sel();
774 gds.src_sel_x = value[0]->chan();
775 gds.src_sel_y = value[1]->chan();
776 gds.src_sel_z = 4;
777 gds.dst_sel_x = 7;
778 gds.dst_sel_y = 7;
779 gds.dst_sel_z = 7;
780 gds.dst_sel_w = 7;
781 gds.op = FETCH_OP_TF_WRITE;
782
783 if (r600_bytecode_add_gds(m_bc, &gds) != 0) {
784 m_result = false;
785 return;
786 }
787
788 if (value[2]->chan() != 7) {
789 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
790 gds.src_gpr = value.sel();
791 gds.src_sel_x = value[2]->chan();
792 gds.src_sel_y = value[3]->chan();
793 gds.src_sel_z = 4;
794 gds.dst_sel_x = 7;
795 gds.dst_sel_y = 7;
796 gds.dst_sel_z = 7;
797 gds.dst_sel_w = 7;
798 gds.op = FETCH_OP_TF_WRITE;
799
800 if (r600_bytecode_add_gds(m_bc, &gds)) {
801 m_result = false;
802 return;
803 }
804 }
805 }
806
807 void
visit(const RatInstr & instr)808 AssamblerVisitor::visit(const RatInstr& instr)
809 {
810 struct r600_bytecode_gds gds;
811
812 /* The instruction writes to the retuen buffer location, and
813 * the value will actually be read back, so make sure all previous writes
814 * have been finished */
815 if (m_ack_suggested /*&& instr.has_instr_flag(Instr::ack_rat_return_write)*/)
816 emit_wait_ack();
817
818 int rat_idx = instr.resource_id();
819
820 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
821
822 r600_bytecode_add_cfinst(m_bc, instr.cf_opcode());
823 auto cf = m_bc->cf_last;
824 cf->rat.id = rat_idx + m_shader->rat_base;
825 cf->rat.inst = instr.rat_op();
826 cf->rat.index_mode = instr.resource_index_mode();
827 cf->output.type = instr.need_ack() ? 3 : 1;
828 cf->output.gpr = instr.data_gpr();
829 cf->output.index_gpr = instr.index_gpr();
830 cf->output.comp_mask = instr.comp_mask();
831 cf->output.burst_count = instr.burst_count();
832 assert(instr.data_swz(0) == PIPE_SWIZZLE_X);
833 if (cf->rat.inst != RatInstr::STORE_TYPED) {
834 assert(instr.data_swz(1) == PIPE_SWIZZLE_Y ||
835 instr.data_swz(1) == PIPE_SWIZZLE_MAX);
836 assert(instr.data_swz(2) == PIPE_SWIZZLE_Z ||
837 instr.data_swz(2) == PIPE_SWIZZLE_MAX);
838 }
839
840 cf->vpm = m_bc->type == PIPE_SHADER_FRAGMENT;
841 cf->barrier = 1;
842 cf->mark = instr.need_ack();
843 cf->output.elem_size = instr.elm_size();
844
845 m_ack_suggested |= instr.need_ack();
846 }
847
848 void
clear_states(const uint32_t & states)849 AssamblerVisitor::clear_states(const uint32_t& states)
850 {
851 if (states & sf_vtx)
852 vtx_fetch_results.clear();
853
854 if (states & sf_tex)
855 tex_fetch_results.clear();
856
857 if (states & sf_alu) {
858 m_last_op_was_barrier = false;
859 m_last_addr = nullptr;
860 }
861 }
862
863 void
visit(const Block & block)864 AssamblerVisitor::visit(const Block& block)
865 {
866 if (block.empty())
867 return;
868
869 if (block.has_instr_flag(Instr::force_cf)) {
870 m_bc->force_add_cf = 1;
871 m_bc->ar_loaded = 0;
872 m_last_addr = nullptr;
873 }
874 sfn_log << SfnLog::assembly << "Translate block size: " << block.size()
875 << " new_cf:" << m_bc->force_add_cf << "\n";
876
877 for (const auto& i : block) {
878 sfn_log << SfnLog::assembly << "Translate " << *i << " ";
879 i->accept(*this);
880 sfn_log << SfnLog::assembly << (m_result ? "good" : "fail") << "\n";
881
882 if (!m_result)
883 break;
884 }
885 }
886
887 void
visit(const IfInstr & instr)888 AssamblerVisitor::visit(const IfInstr& instr)
889 {
890 int elems = m_callstack.push(FC_PUSH_VPM);
891 bool needs_workaround = false;
892
893 if (m_bc->gfx_level == CAYMAN && m_bc->stack.loop > 1)
894 needs_workaround = true;
895
896 if (m_bc->gfx_level == EVERGREEN && m_bc->family != CHIP_HEMLOCK &&
897 m_bc->family != CHIP_CYPRESS && m_bc->family != CHIP_JUNIPER) {
898 unsigned dmod1 = (elems - 1) % m_bc->stack.entry_size;
899 unsigned dmod2 = (elems) % m_bc->stack.entry_size;
900
901 if (elems && (!dmod1 || !dmod2))
902 needs_workaround = true;
903 }
904
905 auto pred = instr.predicate();
906 auto [addr, dummy0, dummy1] = pred->indirect_addr();
907 {
908 }
909 assert(!dummy1);
910 if (addr) {
911 if (!m_last_addr || !m_bc->ar_loaded || !m_last_addr->equal_to(*addr)) {
912 m_bc->ar_reg = addr->sel();
913 m_bc->ar_chan = addr->chan();
914 m_last_addr = addr;
915 m_bc->ar_loaded = 0;
916
917 r600_load_ar(m_bc, true);
918 }
919 }
920
921 if (needs_workaround) {
922 r600_bytecode_add_cfinst(m_bc, CF_OP_PUSH);
923 m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
924 r600_bytecode_add_cfinst(m_bc, CF_OP_ALU);
925 pred->set_cf_type(cf_alu);
926 }
927
928 clear_states(sf_tex | sf_vtx);
929 pred->accept(*this);
930
931 r600_bytecode_add_cfinst(m_bc, CF_OP_JUMP);
932 clear_states(sf_all);
933
934 m_jump_tracker.push(m_bc->cf_last, jt_if);
935 }
936
937 void
visit(const ControlFlowInstr & instr)938 AssamblerVisitor::visit(const ControlFlowInstr& instr)
939 {
940 clear_states(sf_all);
941 switch (instr.cf_type()) {
942 case ControlFlowInstr::cf_else:
943 emit_else();
944 break;
945 case ControlFlowInstr::cf_endif:
946 emit_endif();
947 break;
948 case ControlFlowInstr::cf_loop_begin: {
949 bool use_vpm = m_shader->processor_type == PIPE_SHADER_FRAGMENT &&
950 instr.has_instr_flag(Instr::vpm) &&
951 !instr.has_instr_flag(Instr::helper);
952 emit_loop_begin(use_vpm);
953 break;
954 }
955 case ControlFlowInstr::cf_loop_end:
956 emit_loop_end();
957 break;
958 case ControlFlowInstr::cf_loop_break:
959 emit_loop_break();
960 break;
961 case ControlFlowInstr::cf_loop_continue:
962 emit_loop_cont();
963 break;
964 case ControlFlowInstr::cf_wait_ack: {
965 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
966 if (!r) {
967 m_bc->cf_last->cf_addr = 0;
968 m_bc->cf_last->barrier = 1;
969 m_ack_suggested = false;
970 } else {
971 m_result = false;
972 }
973 } break;
974 default:
975 unreachable("Unknown CF instruction type");
976 }
977 }
978
979 void
visit(const GDSInstr & instr)980 AssamblerVisitor::visit(const GDSInstr& instr)
981 {
982 struct r600_bytecode_gds gds;
983
984 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
985
986 gds.op = ds_opcode_map.at(instr.opcode());
987 gds.uav_id = instr.resource_id();
988 gds.uav_index_mode = instr.resource_index_mode();
989 gds.src_gpr = instr.src().sel();
990
991 gds.src_sel_x = instr.src()[0]->chan() < 7 ? instr.src()[0]->chan() : 4;
992 gds.src_sel_y = instr.src()[1]->chan() < 7 ? instr.src()[1]->chan() : 4;
993 gds.src_sel_z = instr.src()[2]->chan() < 7 ? instr.src()[2]->chan() : 4;
994
995 gds.dst_sel_x = 7;
996 gds.dst_sel_y = 7;
997 gds.dst_sel_z = 7;
998 gds.dst_sel_w = 7;
999
1000 if (instr.dest()) {
1001 gds.dst_gpr = instr.dest()->sel();
1002 switch (instr.dest()->chan()) {
1003 case 0:
1004 gds.dst_sel_x = 0;
1005 break;
1006 case 1:
1007 gds.dst_sel_y = 0;
1008 break;
1009 case 2:
1010 gds.dst_sel_z = 0;
1011 break;
1012 case 3:
1013 gds.dst_sel_w = 0;
1014 }
1015 }
1016
1017 gds.src_gpr2 = 0;
1018 gds.alloc_consume = m_bc->gfx_level < CAYMAN ? 1 : 0; // Not Cayman
1019
1020 int r = r600_bytecode_add_gds(m_bc, &gds);
1021 if (r) {
1022 m_result = false;
1023 return;
1024 }
1025 m_bc->cf_last->vpm = PIPE_SHADER_FRAGMENT == m_bc->type;
1026 m_bc->cf_last->barrier = 1;
1027 }
1028
1029 void
visit(const LDSAtomicInstr & instr)1030 AssamblerVisitor::visit(const LDSAtomicInstr& instr)
1031 {
1032 (void)instr;
1033 unreachable("LDSAtomicInstr must be lowered to ALUInstr");
1034 }
1035
1036 void
visit(const LDSReadInstr & instr)1037 AssamblerVisitor::visit(const LDSReadInstr& instr)
1038 {
1039 (void)instr;
1040 unreachable("LDSReadInstr must be lowered to ALUInstr");
1041 }
1042
1043 EBufferIndexMode
emit_index_reg(const VirtualValue & addr,unsigned idx)1044 AssamblerVisitor::emit_index_reg(const VirtualValue& addr, unsigned idx)
1045 {
1046 assert(idx < 2);
1047
1048 if (!m_bc->index_loaded[idx] || m_loop_nesting ||
1049 m_bc->index_reg[idx] != (unsigned)addr.sel() ||
1050 m_bc->index_reg_chan[idx] != (unsigned)addr.chan()) {
1051 struct r600_bytecode_alu alu;
1052
1053 // Make sure MOVA is not last instr in clause
1054
1055 if (!m_bc->cf_last || (m_bc->cf_last->ndw >> 1) >= 110)
1056 m_bc->force_add_cf = 1;
1057
1058 if (m_bc->gfx_level != CAYMAN) {
1059
1060 EAluOp idxop = idx ? op1_set_cf_idx1 : op1_set_cf_idx0;
1061
1062 memset(&alu, 0, sizeof(alu));
1063 alu.op = opcode_map.at(op1_mova_int);
1064 alu.dst.chan = 0;
1065 alu.src[0].sel = addr.sel();
1066 alu.src[0].chan = addr.chan();
1067 alu.last = 1;
1068 sfn_log << SfnLog::assembly << " mova_int, ";
1069 int r = r600_bytecode_add_alu(m_bc, &alu);
1070 if (r)
1071 return bim_invalid;
1072
1073 alu.op = opcode_map.at(idxop);
1074 alu.dst.chan = 0;
1075 alu.src[0].sel = 0;
1076 alu.src[0].chan = 0;
1077 alu.last = 1;
1078 sfn_log << SfnLog::assembly << "op1_set_cf_idx" << idx;
1079 r = r600_bytecode_add_alu(m_bc, &alu);
1080 if (r)
1081 return bim_invalid;
1082 } else {
1083 memset(&alu, 0, sizeof(alu));
1084 alu.op = opcode_map.at(op1_mova_int);
1085 alu.dst.sel = idx == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
1086 alu.dst.chan = 0;
1087 alu.src[0].sel = addr.sel();
1088 alu.src[0].chan = addr.chan();
1089 alu.last = 1;
1090 sfn_log << SfnLog::assembly << " mova_int, ";
1091 int r = r600_bytecode_add_alu(m_bc, &alu);
1092 if (r)
1093 return bim_invalid;
1094 }
1095
1096 m_bc->ar_loaded = 0;
1097 m_bc->index_reg[idx] = addr.sel();
1098 m_bc->index_reg_chan[idx] = addr.chan();
1099 m_bc->index_loaded[idx] = true;
1100 m_bc->force_add_cf = 1;
1101 sfn_log << SfnLog::assembly << "\n";
1102 }
1103 return idx == 0 ? bim_zero : bim_one;
1104 }
1105
1106 void
emit_else()1107 AssamblerVisitor::emit_else()
1108 {
1109 r600_bytecode_add_cfinst(m_bc, CF_OP_ELSE);
1110 m_bc->cf_last->pop_count = 1;
1111 m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_if);
1112 }
1113
1114 void
emit_endif()1115 AssamblerVisitor::emit_endif()
1116 {
1117 m_callstack.pop(FC_PUSH_VPM);
1118
1119 unsigned force_pop = m_bc->force_add_cf;
1120 if (!force_pop) {
1121 int alu_pop = 3;
1122 if (m_bc->cf_last) {
1123 if (m_bc->cf_last->op == CF_OP_ALU)
1124 alu_pop = 0;
1125 else if (m_bc->cf_last->op == CF_OP_ALU_POP_AFTER)
1126 alu_pop = 1;
1127 }
1128 alu_pop += 1;
1129 if (alu_pop == 1) {
1130 m_bc->cf_last->op = CF_OP_ALU_POP_AFTER;
1131 m_bc->force_add_cf = 1;
1132 } else {
1133 force_pop = 1;
1134 }
1135 }
1136
1137 if (force_pop) {
1138 r600_bytecode_add_cfinst(m_bc, CF_OP_POP);
1139 m_bc->cf_last->pop_count = 1;
1140 m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
1141 }
1142
1143 m_result &= m_jump_tracker.pop(m_bc->cf_last, jt_if);
1144 }
1145
1146 void
emit_loop_begin(bool vpm)1147 AssamblerVisitor::emit_loop_begin(bool vpm)
1148 {
1149 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_START_DX10);
1150 m_bc->cf_last->vpm = vpm && m_bc->type == PIPE_SHADER_FRAGMENT;
1151 m_jump_tracker.push(m_bc->cf_last, jt_loop);
1152 m_callstack.push(FC_LOOP);
1153 ++m_loop_nesting;
1154 }
1155
1156 void
emit_loop_end()1157 AssamblerVisitor::emit_loop_end()
1158 {
1159 if (m_ack_suggested) {
1160 emit_wait_ack();
1161 m_ack_suggested = false;
1162 }
1163
1164 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_END);
1165 m_callstack.pop(FC_LOOP);
1166 assert(m_loop_nesting);
1167 --m_loop_nesting;
1168 m_result |= m_jump_tracker.pop(m_bc->cf_last, jt_loop);
1169 }
1170
1171 void
emit_loop_break()1172 AssamblerVisitor::emit_loop_break()
1173 {
1174 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_BREAK);
1175 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
1176 }
1177
1178 void
emit_loop_cont()1179 AssamblerVisitor::emit_loop_cont()
1180 {
1181 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_CONTINUE);
1182 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
1183 }
1184
1185 bool
copy_dst(r600_bytecode_alu_dst & dst,const Register & d,bool write)1186 AssamblerVisitor::copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write)
1187 {
1188 if (write && d.sel() > g_clause_local_end) {
1189 R600_ASM_ERR("shader_from_nir: Don't support more then 123 GPRs + 4 clause "
1190 "local, but try using %d\n",
1191 d.sel());
1192 m_result = false;
1193 return false;
1194 }
1195
1196 dst.sel = d.sel();
1197 dst.chan = d.chan();
1198
1199 if (m_last_addr && m_last_addr->equal_to(d))
1200 m_last_addr = nullptr;
1201
1202 for (int i = 0; i < 2; ++i) {
1203 /* Force emitting index register, if we didn't emit it yet, because
1204 * the register value will change now */
1205 if (dst.sel == m_bc->index_reg[i] && dst.chan == m_bc->index_reg_chan[i])
1206 m_bc->index_loaded[i] = false;
1207 }
1208
1209 return true;
1210 }
1211
1212 void
emit_wait_ack()1213 AssamblerVisitor::emit_wait_ack()
1214 {
1215 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
1216 if (!r) {
1217 m_bc->cf_last->cf_addr = 0;
1218 m_bc->cf_last->barrier = 1;
1219 m_ack_suggested = false;
1220 } else
1221 m_result = false;
1222 }
1223
1224 class EncodeSourceVisitor : public ConstRegisterVisitor {
1225 public:
1226 EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc);
1227 void visit(const Register& value) override;
1228 void visit(const LocalArray& value) override;
1229 void visit(const LocalArrayValue& value) override;
1230 void visit(const UniformValue& value) override;
1231 void visit(const LiteralConstant& value) override;
1232 void visit(const InlineConstant& value) override;
1233
1234 r600_bytecode_alu_src& src;
1235 r600_bytecode *m_bc;
1236 PVirtualValue m_buffer_offset{nullptr};
1237 };
1238
1239 PVirtualValue
copy_src(r600_bytecode_alu_src & src,const VirtualValue & s)1240 AssamblerVisitor::copy_src(r600_bytecode_alu_src& src, const VirtualValue& s)
1241 {
1242
1243 EncodeSourceVisitor visitor(src, m_bc);
1244 src.sel = s.sel();
1245 src.chan = s.chan();
1246
1247 if (s.sel() >= g_clause_local_start && s.sel() < g_clause_local_end ) {
1248 assert(m_bc->cf_last);
1249 int clidx = 4 * (s.sel() - g_clause_local_start) + s.chan();
1250 /* Ensure that the clause local register was already written */
1251 assert(m_bc->cf_last->clause_local_written & (1 << clidx));
1252 }
1253
1254 s.accept(visitor);
1255 return visitor.m_buffer_offset;
1256 }
1257
EncodeSourceVisitor(r600_bytecode_alu_src & s,r600_bytecode * bc)1258 EncodeSourceVisitor::EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc):
1259 src(s),
1260 m_bc(bc)
1261 {
1262 }
1263
1264 void
visit(const Register & value)1265 EncodeSourceVisitor::visit(const Register& value)
1266 {
1267 assert(value.sel() < g_clause_local_end && "Only have 123 reisters + 4 clause local");
1268 }
1269
1270 void
visit(const LocalArray & value)1271 EncodeSourceVisitor::visit(const LocalArray& value)
1272 {
1273 (void)value;
1274 unreachable("An array can't be a source register");
1275 }
1276
1277 void
visit(const LocalArrayValue & value)1278 EncodeSourceVisitor::visit(const LocalArrayValue& value)
1279 {
1280 src.rel = value.addr() ? 1 : 0;
1281 }
1282
1283 void
visit(const UniformValue & value)1284 EncodeSourceVisitor::visit(const UniformValue& value)
1285 {
1286 assert(value.sel() >= 512 && "Uniform values must have a sel >= 512");
1287 m_buffer_offset = value.buf_addr();
1288 src.kc_bank = value.kcache_bank();
1289 }
1290
1291 void
visit(const LiteralConstant & value)1292 EncodeSourceVisitor::visit(const LiteralConstant& value)
1293 {
1294 src.value = value.value();
1295 }
1296
1297 void
visit(const InlineConstant & value)1298 EncodeSourceVisitor::visit(const InlineConstant& value)
1299 {
1300 (void)value;
1301 }
1302
1303 const std::map<EAluOp, int> opcode_map = {
1304
1305 {op2_add, ALU_OP2_ADD },
1306 {op2_mul, ALU_OP2_MUL },
1307 {op2_mul_ieee, ALU_OP2_MUL_IEEE },
1308 {op2_max, ALU_OP2_MAX },
1309 {op2_min, ALU_OP2_MIN },
1310 {op2_max_dx10, ALU_OP2_MAX_DX10 },
1311 {op2_min_dx10, ALU_OP2_MIN_DX10 },
1312 {op2_sete, ALU_OP2_SETE },
1313 {op2_setgt, ALU_OP2_SETGT },
1314 {op2_setge, ALU_OP2_SETGE },
1315 {op2_setne, ALU_OP2_SETNE },
1316 {op2_sete_dx10, ALU_OP2_SETE_DX10 },
1317 {op2_setgt_dx10, ALU_OP2_SETGT_DX10 },
1318 {op2_setge_dx10, ALU_OP2_SETGE_DX10 },
1319 {op2_setne_dx10, ALU_OP2_SETNE_DX10 },
1320 {op1_fract, ALU_OP1_FRACT },
1321 {op1_trunc, ALU_OP1_TRUNC },
1322 {op1_ceil, ALU_OP1_CEIL },
1323 {op1_rndne, ALU_OP1_RNDNE },
1324 {op1_floor, ALU_OP1_FLOOR },
1325 {op2_ashr_int, ALU_OP2_ASHR_INT },
1326 {op2_lshr_int, ALU_OP2_LSHR_INT },
1327 {op2_lshl_int, ALU_OP2_LSHL_INT },
1328 {op1_mov, ALU_OP1_MOV },
1329 {op0_nop, ALU_OP0_NOP },
1330 {op2_mul_64, ALU_OP2_MUL_64 },
1331 {op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32 },
1332 {op1v_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64 },
1333 {op2_prede_int, ALU_OP2_PRED_SETE_INT },
1334 {op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT },
1335 {op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT },
1336 {op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT },
1337 {op2_pred_setgt_uint, ALU_OP2_PRED_SETGT_UINT },
1338 {op2_pred_setge_uint, ALU_OP2_PRED_SETGE_UINT },
1339 {op2_pred_sete, ALU_OP2_PRED_SETE },
1340 {op2_pred_setgt, ALU_OP2_PRED_SETGT },
1341 {op2_pred_setge, ALU_OP2_PRED_SETGE },
1342 {op2_pred_setne, ALU_OP2_PRED_SETNE },
1343 {op0_pred_set_clr, ALU_OP0_PRED_SET_CLR },
1344 {op1_pred_set_restore, ALU_OP1_PRED_SET_RESTORE },
1345 {op2_pred_sete_push, ALU_OP2_PRED_SETE_PUSH },
1346 {op2_pred_setgt_push, ALU_OP2_PRED_SETGT_PUSH },
1347 {op2_pred_setge_push, ALU_OP2_PRED_SETGE_PUSH },
1348 {op2_pred_setne_push, ALU_OP2_PRED_SETNE_PUSH },
1349 {op2_kille, ALU_OP2_KILLE },
1350 {op2_killgt, ALU_OP2_KILLGT },
1351 {op2_killge, ALU_OP2_KILLGE },
1352 {op2_killne, ALU_OP2_KILLNE },
1353 {op2_and_int, ALU_OP2_AND_INT },
1354 {op2_or_int, ALU_OP2_OR_INT },
1355 {op2_xor_int, ALU_OP2_XOR_INT },
1356 {op1_not_int, ALU_OP1_NOT_INT },
1357 {op2_add_int, ALU_OP2_ADD_INT },
1358 {op2_sub_int, ALU_OP2_SUB_INT },
1359 {op2_max_int, ALU_OP2_MAX_INT },
1360 {op2_min_int, ALU_OP2_MIN_INT },
1361 {op2_max_uint, ALU_OP2_MAX_UINT },
1362 {op2_min_uint, ALU_OP2_MIN_UINT },
1363 {op2_sete_int, ALU_OP2_SETE_INT },
1364 {op2_setgt_int, ALU_OP2_SETGT_INT },
1365 {op2_setge_int, ALU_OP2_SETGE_INT },
1366 {op2_setne_int, ALU_OP2_SETNE_INT },
1367 {op2_setgt_uint, ALU_OP2_SETGT_UINT },
1368 {op2_setge_uint, ALU_OP2_SETGE_UINT },
1369 {op2_killgt_uint, ALU_OP2_KILLGT_UINT },
1370 {op2_killge_uint, ALU_OP2_KILLGE_UINT },
1371 {op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT },
1372 {op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT },
1373 {op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT },
1374 {op2_kille_int, ALU_OP2_KILLE_INT },
1375 {op2_killgt_int, ALU_OP2_KILLGT_INT },
1376 {op2_killge_int, ALU_OP2_KILLGE_INT },
1377 {op2_killne_int, ALU_OP2_KILLNE_INT },
1378 {op2_pred_sete_push_int, ALU_OP2_PRED_SETE_PUSH_INT },
1379 {op2_pred_setgt_push_int, ALU_OP2_PRED_SETGT_PUSH_INT },
1380 {op2_pred_setge_push_int, ALU_OP2_PRED_SETGE_PUSH_INT },
1381 {op2_pred_setne_push_int, ALU_OP2_PRED_SETNE_PUSH_INT },
1382 {op2_pred_setlt_push_int, ALU_OP2_PRED_SETLT_PUSH_INT },
1383 {op2_pred_setle_push_int, ALU_OP2_PRED_SETLE_PUSH_INT },
1384 {op1_flt_to_int, ALU_OP1_FLT_TO_INT },
1385 {op1_bfrev_int, ALU_OP1_BFREV_INT },
1386 {op2_addc_uint, ALU_OP2_ADDC_UINT },
1387 {op2_subb_uint, ALU_OP2_SUBB_UINT },
1388 {op0_group_barrier, ALU_OP0_GROUP_BARRIER },
1389 {op0_group_seq_begin, ALU_OP0_GROUP_SEQ_BEGIN },
1390 {op0_group_seq_end, ALU_OP0_GROUP_SEQ_END },
1391 {op2_set_mode, ALU_OP2_SET_MODE },
1392 {op1_set_cf_idx0, ALU_OP0_SET_CF_IDX0 },
1393 {op1_set_cf_idx1, ALU_OP0_SET_CF_IDX1 },
1394 {op2_set_lds_size, ALU_OP2_SET_LDS_SIZE },
1395 {op1_exp_ieee, ALU_OP1_EXP_IEEE },
1396 {op1_log_clamped, ALU_OP1_LOG_CLAMPED },
1397 {op1_log_ieee, ALU_OP1_LOG_IEEE },
1398 {op1_recip_clamped, ALU_OP1_RECIP_CLAMPED },
1399 {op1_recip_ff, ALU_OP1_RECIP_FF },
1400 {op1_recip_ieee, ALU_OP1_RECIP_IEEE },
1401 {op1_recipsqrt_clamped, ALU_OP1_RECIPSQRT_CLAMPED },
1402 {op1_recipsqrt_ff, ALU_OP1_RECIPSQRT_FF },
1403 {op1_recipsqrt_ieee1, ALU_OP1_RECIPSQRT_IEEE },
1404 {op1_sqrt_ieee, ALU_OP1_SQRT_IEEE },
1405 {op1_sin, ALU_OP1_SIN },
1406 {op1_cos, ALU_OP1_COS },
1407 {op2_mullo_int, ALU_OP2_MULLO_INT },
1408 {op2_mulhi_int, ALU_OP2_MULHI_INT },
1409 {op2_mullo_uint, ALU_OP2_MULLO_UINT },
1410 {op2_mulhi_uint, ALU_OP2_MULHI_UINT },
1411 {op1_recip_int, ALU_OP1_RECIP_INT },
1412 {op1_recip_uint, ALU_OP1_RECIP_UINT },
1413 {op1_recip_64, ALU_OP2_RECIP_64 },
1414 {op1_recip_clamped_64, ALU_OP2_RECIP_CLAMPED_64 },
1415 {op1_recipsqrt_64, ALU_OP2_RECIPSQRT_64 },
1416 {op1_recipsqrt_clamped_64, ALU_OP2_RECIPSQRT_CLAMPED_64 },
1417 {op1_sqrt_64, ALU_OP2_SQRT_64 },
1418 {op1_flt_to_uint, ALU_OP1_FLT_TO_UINT },
1419 {op1_int_to_flt, ALU_OP1_INT_TO_FLT },
1420 {op1_uint_to_flt, ALU_OP1_UINT_TO_FLT },
1421 {op2_bfm_int, ALU_OP2_BFM_INT },
1422 {op1_flt32_to_flt16, ALU_OP1_FLT32_TO_FLT16 },
1423 {op1_flt16_to_flt32, ALU_OP1_FLT16_TO_FLT32 },
1424 {op1_ubyte0_flt, ALU_OP1_UBYTE0_FLT },
1425 {op1_ubyte1_flt, ALU_OP1_UBYTE1_FLT },
1426 {op1_ubyte2_flt, ALU_OP1_UBYTE2_FLT },
1427 {op1_ubyte3_flt, ALU_OP1_UBYTE3_FLT },
1428 {op1_bcnt_int, ALU_OP1_BCNT_INT },
1429 {op1_ffbh_uint, ALU_OP1_FFBH_UINT },
1430 {op1_ffbl_int, ALU_OP1_FFBL_INT },
1431 {op1_ffbh_int, ALU_OP1_FFBH_INT },
1432 {op1_flt_to_uint4, ALU_OP1_FLT_TO_UINT4 },
1433 {op2_dot_ieee, ALU_OP2_DOT_IEEE },
1434 {op1_flt_to_int_rpi, ALU_OP1_FLT_TO_INT_RPI },
1435 {op1_flt_to_int_floor, ALU_OP1_FLT_TO_INT_FLOOR },
1436 {op2_mulhi_uint24, ALU_OP2_MULHI_UINT24 },
1437 {op1_mbcnt_32hi_int, ALU_OP1_MBCNT_32HI_INT },
1438 {op1_offset_to_flt, ALU_OP1_OFFSET_TO_FLT },
1439 {op2_mul_uint24, ALU_OP2_MUL_UINT24 },
1440 {op1_bcnt_accum_prev_int, ALU_OP1_BCNT_ACCUM_PREV_INT },
1441 {op1_mbcnt_32lo_accum_prev_int, ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT},
1442 {op2_sete_64, ALU_OP2_SETE_64 },
1443 {op2_setne_64, ALU_OP2_SETNE_64 },
1444 {op2_setgt_64, ALU_OP2_SETGT_64 },
1445 {op2_setge_64, ALU_OP2_SETGE_64 },
1446 {op2_min_64, ALU_OP2_MIN_64 },
1447 {op2_max_64, ALU_OP2_MAX_64 },
1448 {op2_dot4, ALU_OP2_DOT4 },
1449 {op2_dot4_ieee, ALU_OP2_DOT4_IEEE },
1450 {op2_cube, ALU_OP2_CUBE },
1451 {op1_max4, ALU_OP1_MAX4 },
1452 {op1_frexp_64, ALU_OP1_FREXP_64 },
1453 {op1_ldexp_64, ALU_OP2_LDEXP_64 },
1454 {op1_fract_64, ALU_OP1_FRACT_64 },
1455 {op2_pred_setgt_64, ALU_OP2_PRED_SETGT_64 },
1456 {op2_pred_sete_64, ALU_OP2_PRED_SETE_64 },
1457 {op2_pred_setge_64, ALU_OP2_PRED_SETGE_64 },
1458 {op2_add_64, ALU_OP2_ADD_64 },
1459 {op1_mova_int, ALU_OP1_MOVA_INT },
1460 {op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32 },
1461 {op1_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64 },
1462 {op2_sad_accum_prev_uint, ALU_OP2_SAD_ACCUM_PREV_UINT },
1463 {op2_dot, ALU_OP2_DOT },
1464 {op1_mul_prev, ALU_OP1_MUL_PREV },
1465 {op1_mul_ieee_prev, ALU_OP1_MUL_IEEE_PREV },
1466 {op1_add_prev, ALU_OP1_ADD_PREV },
1467 {op2_muladd_prev, ALU_OP2_MULADD_PREV },
1468 {op2_muladd_ieee_prev, ALU_OP2_MULADD_IEEE_PREV },
1469 {op2_interp_xy, ALU_OP2_INTERP_XY },
1470 {op2_interp_zw, ALU_OP2_INTERP_ZW },
1471 {op2_interp_x, ALU_OP2_INTERP_X },
1472 {op2_interp_z, ALU_OP2_INTERP_Z },
1473 {op0_store_flags, ALU_OP1_STORE_FLAGS },
1474 {op1_load_store_flags, ALU_OP1_LOAD_STORE_FLAGS },
1475 {op0_lds_1a, ALU_OP2_LDS_1A },
1476 {op0_lds_1a1d, ALU_OP2_LDS_1A1D },
1477 {op0_lds_2a, ALU_OP2_LDS_2A },
1478 {op1_interp_load_p0, ALU_OP1_INTERP_LOAD_P0 },
1479 {op1_interp_load_p10, ALU_OP1_INTERP_LOAD_P10 },
1480 {op1_interp_load_p20, ALU_OP1_INTERP_LOAD_P20 },
1481 {op3_bfe_uint, ALU_OP3_BFE_UINT },
1482 {op3_bfe_int, ALU_OP3_BFE_INT },
1483 {op3_bfi_int, ALU_OP3_BFI_INT },
1484 {op3_fma, ALU_OP3_FMA },
1485 {op3_cndne_64, ALU_OP3_CNDNE_64 },
1486 {op3_fma_64, ALU_OP3_FMA_64 },
1487 {op3_lerp_uint, ALU_OP3_LERP_UINT },
1488 {op3_bit_align_int, ALU_OP3_BIT_ALIGN_INT },
1489 {op3_byte_align_int, ALU_OP3_BYTE_ALIGN_INT },
1490 {op3_sad_accum_uint, ALU_OP3_SAD_ACCUM_UINT },
1491 {op3_sad_accum_hi_uint, ALU_OP3_SAD_ACCUM_HI_UINT },
1492 {op3_muladd_uint24, ALU_OP3_MULADD_UINT24 },
1493 {op3_lds_idx_op, ALU_OP3_LDS_IDX_OP },
1494 {op3_muladd, ALU_OP3_MULADD },
1495 {op3_muladd_m2, ALU_OP3_MULADD_M2 },
1496 {op3_muladd_m4, ALU_OP3_MULADD_M4 },
1497 {op3_muladd_d2, ALU_OP3_MULADD_D2 },
1498 {op3_muladd_ieee, ALU_OP3_MULADD_IEEE },
1499 {op3_cnde, ALU_OP3_CNDE },
1500 {op3_cndgt, ALU_OP3_CNDGT },
1501 {op3_cndge, ALU_OP3_CNDGE },
1502 {op3_cnde_int, ALU_OP3_CNDE_INT },
1503 {op3_cndgt_int, ALU_OP3_CNDGT_INT },
1504 {op3_cndge_int, ALU_OP3_CNDGE_INT },
1505 {op3_mul_lit, ALU_OP3_MUL_LIT },
1506 };
1507
1508 const std::map<ESDOp, int> ds_opcode_map = {
1509 {DS_OP_ADD, FETCH_OP_GDS_ADD },
1510 {DS_OP_SUB, FETCH_OP_GDS_SUB },
1511 {DS_OP_RSUB, FETCH_OP_GDS_RSUB },
1512 {DS_OP_INC, FETCH_OP_GDS_INC },
1513 {DS_OP_DEC, FETCH_OP_GDS_DEC },
1514 {DS_OP_MIN_INT, FETCH_OP_GDS_MIN_INT },
1515 {DS_OP_MAX_INT, FETCH_OP_GDS_MAX_INT },
1516 {DS_OP_MIN_UINT, FETCH_OP_GDS_MIN_UINT },
1517 {DS_OP_MAX_UINT, FETCH_OP_GDS_MAX_UINT },
1518 {DS_OP_AND, FETCH_OP_GDS_AND },
1519 {DS_OP_OR, FETCH_OP_GDS_OR },
1520 {DS_OP_XOR, FETCH_OP_GDS_XOR },
1521 {DS_OP_MSKOR, FETCH_OP_GDS_MSKOR },
1522 {DS_OP_WRITE, FETCH_OP_GDS_WRITE },
1523 {DS_OP_WRITE_REL, FETCH_OP_GDS_WRITE_REL },
1524 {DS_OP_WRITE2, FETCH_OP_GDS_WRITE2 },
1525 {DS_OP_CMP_STORE, FETCH_OP_GDS_CMP_STORE },
1526 {DS_OP_CMP_STORE_SPF, FETCH_OP_GDS_CMP_STORE_SPF },
1527 {DS_OP_BYTE_WRITE, FETCH_OP_GDS_BYTE_WRITE },
1528 {DS_OP_SHORT_WRITE, FETCH_OP_GDS_SHORT_WRITE },
1529 {DS_OP_ADD_RET, FETCH_OP_GDS_ADD_RET },
1530 {DS_OP_SUB_RET, FETCH_OP_GDS_SUB_RET },
1531 {DS_OP_RSUB_RET, FETCH_OP_GDS_RSUB_RET },
1532 {DS_OP_INC_RET, FETCH_OP_GDS_INC_RET },
1533 {DS_OP_DEC_RET, FETCH_OP_GDS_DEC_RET },
1534 {DS_OP_MIN_INT_RET, FETCH_OP_GDS_MIN_INT_RET },
1535 {DS_OP_MAX_INT_RET, FETCH_OP_GDS_MAX_INT_RET },
1536 {DS_OP_MIN_UINT_RET, FETCH_OP_GDS_MIN_UINT_RET },
1537 {DS_OP_MAX_UINT_RET, FETCH_OP_GDS_MAX_UINT_RET },
1538 {DS_OP_AND_RET, FETCH_OP_GDS_AND_RET },
1539 {DS_OP_OR_RET, FETCH_OP_GDS_OR_RET },
1540 {DS_OP_XOR_RET, FETCH_OP_GDS_XOR_RET },
1541 {DS_OP_MSKOR_RET, FETCH_OP_GDS_MSKOR_RET },
1542 {DS_OP_XCHG_RET, FETCH_OP_GDS_XCHG_RET },
1543 {DS_OP_XCHG_REL_RET, FETCH_OP_GDS_XCHG_REL_RET },
1544 {DS_OP_XCHG2_RET, FETCH_OP_GDS_XCHG2_RET },
1545 {DS_OP_CMP_XCHG_RET, FETCH_OP_GDS_CMP_XCHG_RET },
1546 {DS_OP_CMP_XCHG_SPF_RET, FETCH_OP_GDS_CMP_XCHG_SPF_RET },
1547 {DS_OP_READ_RET, FETCH_OP_GDS_READ_RET },
1548 {DS_OP_READ_REL_RET, FETCH_OP_GDS_READ_REL_RET },
1549 {DS_OP_READ2_RET, FETCH_OP_GDS_READ2_RET },
1550 {DS_OP_READWRITE_RET, FETCH_OP_GDS_READWRITE_RET },
1551 {DS_OP_BYTE_READ_RET, FETCH_OP_GDS_BYTE_READ_RET },
1552 {DS_OP_UBYTE_READ_RET, FETCH_OP_GDS_UBYTE_READ_RET },
1553 {DS_OP_SHORT_READ_RET, FETCH_OP_GDS_SHORT_READ_RET },
1554 {DS_OP_USHORT_READ_RET, FETCH_OP_GDS_USHORT_READ_RET },
1555 {DS_OP_ATOMIC_ORDERED_ALLOC_RET, FETCH_OP_GDS_ATOMIC_ORDERED_ALLOC},
1556 {DS_OP_INVALID, 0 },
1557 };
1558
1559 } // namespace r600
1560