xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/bifrost/bi_pack.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2020 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "bi_quirks.h"
25 #include "compiler.h"
26 
27 /* This file contains the final passes of the compiler. Running after
28  * scheduling and RA, the IR is now finalized, so we need to emit it to actual
29  * bits on the wire (as well as fixup branches) */
30 
31 static uint64_t
bi_pack_header(bi_clause * clause,bi_clause * next_1,bi_clause * next_2)32 bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
33 {
34    /* next_dependencies are the union of the dependencies of successors'
35     * dependencies */
36 
37    unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
38    dependency_wait |= next_2 ? next_2->dependencies : 0;
39 
40    /* Signal barriers (slot #7) immediately. This is not optimal but good
41     * enough. Doing better requires extending the IR and scheduler.
42     */
43    if (clause->message_type == BIFROST_MESSAGE_BARRIER)
44       dependency_wait |= BITFIELD_BIT(7);
45 
46    bool staging_barrier = next_1 ? next_1->staging_barrier : false;
47    staging_barrier |= next_2 ? next_2->staging_barrier : 0;
48 
49    struct bifrost_header header = {
50       .flow_control = (next_1 == NULL && next_2 == NULL) ? BIFROST_FLOW_END
51                                                          : clause->flow_control,
52       .terminate_discarded_threads = clause->td,
53       .next_clause_prefetch = clause->next_clause_prefetch && next_1,
54       .staging_barrier = staging_barrier,
55       .staging_register = clause->staging_register,
56       .dependency_wait = dependency_wait,
57       .dependency_slot = clause->scoreboard_id,
58       .message_type = clause->message_type,
59       .next_message_type = next_1 ? next_1->message_type : 0,
60       .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE,
61    };
62 
63    uint64_t u = 0;
64    memcpy(&u, &header, sizeof(header));
65    return u;
66 }
67 
68 /* Assigns a slot for reading, before anything is written */
69 
/* Assign a register read slot for a single source, before anything is
 * written. Reuses an already-assigned slot when possible; otherwise claims
 * slot 0/1, falling back to slot 2 (as a read) when slot 3 is unused. */
static void
bi_assign_slot_read(bi_registers *regs, bi_index src)
{
   /* We only assign for registers */
   if (src.type != BI_INDEX_REGISTER)
      return;

   /* Check if we already assigned the slot */
   for (unsigned i = 0; i <= 1; ++i) {
      if (regs->slot[i] == src.value && regs->enabled[i])
         return;
   }

   /* Slot 2 may already service this read as well */
   if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ)
      return;

   /* Assign it now */

   /* Prefer the dedicated read slots 0/1 */
   for (unsigned i = 0; i <= 1; ++i) {
      if (!regs->enabled[i]) {
         regs->slot[i] = src.value;
         regs->enabled[i] = true;
         return;
      }
   }

   /* Only claim slot 2 for a read while slot 3 is idle (slot 2/3 usage is
    * encoded jointly -- see bi_pack_register_mode) */
   if (!regs->slot23.slot3) {
      regs->slot[2] = src.value;
      regs->slot23.slot2 = BIFROST_OP_READ;
      return;
   }

   /* The scheduler should have guaranteed reads fit in the slots */
   bi_print_slots(regs, stderr);
   unreachable("Failed to find a free slot for src");
}
105 
/* Assign register access slots for a tuple: reads come from the current
 * tuple ('now'), writes come from the previous tuple ('prev'), whose
 * results land while this tuple executes. Returns the filled-in slots. */
static bi_registers
bi_assign_slots(bi_tuple *now, bi_tuple *prev)
{
   /* We assign slots for the main register mechanism. Special ops
    * use the data registers, which has its own mechanism entirely
    * and thus gets skipped over here. */

   bool read_dreg = now->add && bi_opcode_props[now->add->op].sr_read;
   bool write_dreg = prev->add && bi_opcode_props[prev->add->op].sr_write;

   /* First, assign reads */

   if (now->fma)
      bi_foreach_src(now->fma, src)
         bi_assign_slot_read(&now->regs, (now->fma)->src[src]);

   if (now->add) {
      bi_foreach_src(now->add, src) {
         /* This is not a real source, we shouldn't assign a
          * slot for it.
          */
         if (now->add->op == BI_OPCODE_BLEND && src == 4)
            continue;

         /* src 0 of a staging read comes from the data registers, not a
          * regular slot */
         if (!(src == 0 && read_dreg))
            bi_assign_slot_read(&now->regs, (now->add)->src[src]);
      }
   }

   /* Next, assign writes. Staging writes are assigned separately, but
    * +ATEST wants its destination written to both a staging register
    * _and_ a regular write, because it may not generate a message */

   if (prev->add && prev->add->nr_dests &&
       (!write_dreg || prev->add->op == BI_OPCODE_ATEST)) {
      bi_index idx = prev->add->dest[0];

      if (idx.type == BI_INDEX_REGISTER) {
         /* ADD writes preferentially take slot 3 */
         now->regs.slot[3] = idx.value;
         now->regs.slot23.slot3 = BIFROST_OP_WRITE;
      }
   }

   if (prev->fma && prev->fma->nr_dests) {
      bi_index idx = prev->fma->dest[0];

      if (idx.type == BI_INDEX_REGISTER) {
         if (now->regs.slot23.slot3) {
            /* Scheduler constraint: cannot read 3 and write 2 */
            assert(!now->regs.slot23.slot2);
            now->regs.slot[2] = idx.value;
            now->regs.slot23.slot2 = BIFROST_OP_WRITE;
         } else {
            /* Slot 3 free: take it and flag the write as FMA's */
            now->regs.slot[3] = idx.value;
            now->regs.slot23.slot3 = BIFROST_OP_WRITE;
            now->regs.slot23.slot3_fma = true;
         }
      }
   }

   return now->regs;
}
168 
/* Derive the packed register mode from the slot 2/3 configuration. The
 * mode is the index of the matching entry in bifrost_reg_ctrl_lut. */
static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)
{
   /* Handle idle as a special case */
   if (!(r.slot23.slot2 | r.slot23.slot3))
      return r.first_instruction ? BIFROST_IDLE_1 : BIFROST_IDLE;

   /* Otherwise, use the LUT */
   for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
      if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
         return i;
   }

   /* No LUT entry matches: the slot assignment was invalid */
   bi_print_slots(&r, stderr);
   unreachable("Invalid slot assignment");
}
185 
/* Pack the per-tuple register control word (struct bifrost_regs). The
 * 5-bit register mode is squeezed into a 4-bit ctrl field; the decoder
 * recovers the missing bit from first_instruction and from whether
 * reg2 == reg3, which is why r2/r3 equality is forced below. */
static uint64_t
bi_pack_registers(bi_registers regs)
{
   enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
   struct bifrost_regs s = {0};
   uint64_t packed = 0;

   /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4
    * for first instruction and adds 16 when reg 2 == reg 3 */

   unsigned ctrl;
   bool r2_equals_r3 = false;

   if (regs.first_instruction) {
      /* Bit 3 implicitly must be clear for first instructions.
       * The affected patterns all write both ADD/FMA, but that
       * is forbidden for the last instruction (whose writes are
       * encoded by the first), so this does not add additional
       * encoding constraints */
      assert(!(mode & 0x8));

      /* Move bit 4 to bit 3, since bit 3 is clear */
      ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);

      /* If we can let r2 equal r3, we have to or the hardware raises
       * INSTR_INVALID_ENC (it's unclear why). */
      if (!(regs.slot23.slot2 && regs.slot23.slot3))
         r2_equals_r3 = true;
   } else {
      /* We force r2=r3 or not for the upper bit */
      ctrl = (mode & 0xF);
      r2_equals_r3 = (mode & 0x10);
   }

   if (regs.enabled[1]) {
      /* Gotta save that bit!~ Required by the 63-x trick */
      assert(regs.slot[1] > regs.slot[0]);
      assert(regs.enabled[0]);

      /* Do the 63-x trick, see docs/disasm */
      if (regs.slot[0] > 31) {
         regs.slot[0] = 63 - regs.slot[0];
         regs.slot[1] = 63 - regs.slot[1];
      }

      assert(regs.slot[0] <= 31);
      assert(regs.slot[1] <= 63);

      s.ctrl = ctrl;
      s.reg1 = regs.slot[1];
      s.reg0 = regs.slot[0];
   } else {
      /* slot 1 disabled, so set to zero and use slot 1 for ctrl */
      s.ctrl = 0;
      s.reg1 = ctrl << 2;

      if (regs.enabled[0]) {
         /* Bit 0 upper bit of slot 0 */
         s.reg1 |= (regs.slot[0] >> 5);

         /* Rest of slot 0 in usual spot */
         s.reg0 = (regs.slot[0] & 0b11111);
      } else {
         /* Bit 1 set if slot 0 also disabled */
         s.reg1 |= (1 << 1);
      }
   }

   /* Force r2 =/!= r3 as needed */
   if (r2_equals_r3) {
      assert(regs.slot[3] == regs.slot[2] ||
             !(regs.slot23.slot2 && regs.slot23.slot3));

      /* Mirror whichever of slot 2/3 is actually used into the other */
      if (regs.slot23.slot2)
         regs.slot[3] = regs.slot[2];
      else
         regs.slot[2] = regs.slot[3];
   } else if (!regs.first_instruction) {
      /* Enforced by the encoding anyway */
      assert(regs.slot[2] != regs.slot[3]);
   }

   s.reg2 = regs.slot[2];
   s.reg3 = regs.slot[3];
   s.fau_idx = regs.fau_idx;

   /* Reinterpret the bitfield struct as raw bits via memcpy */
   memcpy(&packed, &s, sizeof(s));
   return packed;
}
275 
276 /* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
277  * this up at pack time. (Scheduling doesn't care.) */
278 
279 static void
bi_flip_slots(bi_registers * regs)280 bi_flip_slots(bi_registers *regs)
281 {
282    if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
283       unsigned temp = regs->slot[0];
284       regs->slot[0] = regs->slot[1];
285       regs->slot[1] = temp;
286    }
287 }
288 
289 static inline enum bifrost_packed_src
bi_get_src_slot(bi_registers * regs,unsigned reg)290 bi_get_src_slot(bi_registers *regs, unsigned reg)
291 {
292    if (regs->slot[0] == reg && regs->enabled[0])
293       return BIFROST_SRC_PORT0;
294    else if (regs->slot[1] == reg && regs->enabled[1])
295       return BIFROST_SRC_PORT1;
296    else if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
297       return BIFROST_SRC_PORT2;
298    else
299       unreachable("Tried to access register with no port");
300 }
301 
302 static inline enum bifrost_packed_src
bi_get_src_new(bi_instr * ins,bi_registers * regs,unsigned s)303 bi_get_src_new(bi_instr *ins, bi_registers *regs, unsigned s)
304 {
305    if (!ins || s >= ins->nr_srcs)
306       return 0;
307 
308    bi_index src = ins->src[s];
309 
310    if (src.type == BI_INDEX_REGISTER)
311       return bi_get_src_slot(regs, src.value);
312    else if (src.type == BI_INDEX_PASS)
313       return src.value;
314    else {
315       /* TODO make safer */
316       return BIFROST_SRC_STAGE;
317    }
318 }
319 
/* Pack one tuple: assign register slots, encode the register control word
 * and the FMA/ADD halves, and record the clause's staging register if the
 * ADD op reads or writes one. Returns the 78-bit packed tuple split into
 * lo/hi words. 'prev' supplies the writes landing during this tuple. */
static struct bi_packed_tuple
bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev,
              bool first_tuple, gl_shader_stage stage)
{
   bi_assign_slots(tuple, prev);
   tuple->regs.fau_idx = tuple->fau_idx;
   tuple->regs.first_instruction = first_tuple;

   /* Enforce slot[1] > slot[0], required by the register encoding */
   bi_flip_slots(&tuple->regs);

   bool sr_read = tuple->add && bi_opcode_props[(tuple->add)->op].sr_read;

   uint64_t reg = bi_pack_registers(tuple->regs);

   /* FMA sources map directly; ADD sources skip past the staging read
    * (src 0) when present, hence the sr_read offset below */
   uint64_t fma =
      bi_pack_fma(tuple->fma, bi_get_src_new(tuple->fma, &tuple->regs, 0),
                  bi_get_src_new(tuple->fma, &tuple->regs, 1),
                  bi_get_src_new(tuple->fma, &tuple->regs, 2),
                  bi_get_src_new(tuple->fma, &tuple->regs, 3));

   uint64_t add = bi_pack_add(
      tuple->add, bi_get_src_new(tuple->add, &tuple->regs, sr_read + 0),
      bi_get_src_new(tuple->add, &tuple->regs, sr_read + 1),
      bi_get_src_new(tuple->add, &tuple->regs, sr_read + 2), 0);

   if (tuple->add) {
      bi_instr *add = tuple->add;

      bool sr_write =
         bi_opcode_props[add->op].sr_write && !bi_is_null(add->dest[0]);

      if (sr_read && !bi_is_null(add->src[0])) {
         assert(add->src[0].type == BI_INDEX_REGISTER);
         clause->staging_register = add->src[0].value;

         /* Ops that both read and write staging must use one register */
         if (sr_write)
            assert(bi_is_equiv(add->src[0], add->dest[0]));
      } else if (sr_write) {
         assert(add->dest[0].type == BI_INDEX_REGISTER);
         clause->staging_register = add->dest[0].value;
      }
   }

   /* Layout: register word in bits 0..34, FMA at 35, ADD starting at 58
    * and spilling into the hi word */
   struct bi_packed_tuple packed = {
      .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
      .hi = add >> 6,
   };

   return packed;
}
369 
370 /* A block contains at most one PC-relative constant, from a terminal branch.
371  * Find the last instruction and if it is a relative branch, fix up the
372  * PC-relative constant to contain the absolute offset. This occurs at pack
373  * time instead of schedule time because the number of quadwords between each
374  * block is not known until after all other passes have finished.
375  */
376 
static void
bi_assign_branch_offset(bi_context *ctx, bi_block *block)
{
   if (list_is_empty(&block->clauses))
      return;

   /* Only the block's final clause can hold the terminal branch */
   bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link);
   bi_instr *br = bi_last_instr_in_clause(clause);

   if (!br->branch_target)
      return;

   /* Put it in the high place */
   int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
   int32_t bytes = qwords * 16;

   /* Copy so we can toy with the sign without undefined behaviour */
   uint32_t raw = 0;
   memcpy(&raw, &bytes, sizeof(raw));

   /* Clear off top bits for A1/B1 bits */
   raw &= ~0xF0000000;

   /* Put in top 32-bits of the clause constant reserved for PC-relative
    * addressing (pcrel_idx) */
   assert(clause->pcrel_idx < 8);
   clause->constants[clause->pcrel_idx] |= ((uint64_t)raw) << 32ull;
}
404 
/* Emit one quadword of embedded constants for a clause. word_idx selects
 * which pair of 64-bit constants to emit; ec0_packed means constant 0 was
 * already folded into the clause body as EC0, offsetting the index. The
 * low 4 bits of each constant are dropped here (cf. EC0 handling). */
static void
bi_pack_constants(unsigned tuple_count, uint64_t *constants, unsigned word_idx,
                  unsigned constant_words, bool ec0_packed,
                  struct util_dynarray *emission)
{
   unsigned index = (word_idx << 1) + ec0_packed;

   /* Do more constants follow */
   bool more = (word_idx + 1) < constant_words;

   /* Indexed first by tuple count and second by constant word number,
    * indicates the position in the clause */
   unsigned pos_lookup[8][3] = {
      {0}, {1}, {3}, {2, 5}, {4, 8}, {7, 11, 14}, {6, 10, 13}, {9, 12},
   };

   /* Compute the pos, and check everything is reasonable */
   assert((tuple_count - 1) < 8);
   assert(word_idx < 3);
   unsigned pos = pos_lookup[tuple_count - 1][word_idx];
   assert(pos != 0 || (tuple_count == 1 && word_idx == 0));

   struct bifrost_fmt_constant quad = {
      .pos = pos,
      .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL,
      .imm_1 = constants[index + 0] >> 4,
      .imm_2 = constants[index + 1] >> 4,
   };

   util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
}
436 
437 uint8_t
bi_pack_literal(enum bi_clause_subword literal)438 bi_pack_literal(enum bi_clause_subword literal)
439 {
440    assert(literal >= BI_CLAUSE_SUBWORD_LITERAL_0);
441    assert(literal <= BI_CLAUSE_SUBWORD_LITERAL_7);
442 
443    return (literal - BI_CLAUSE_SUBWORD_LITERAL_0);
444 }
445 
446 static inline uint8_t
bi_clause_upper(unsigned val,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)447 bi_clause_upper(unsigned val, struct bi_packed_tuple *tuples,
448                 ASSERTED unsigned tuple_count)
449 {
450    assert(val < tuple_count);
451 
452    /* top 3-bits of 78-bits is tuple >> 75 == (tuple >> 64) >> 11 */
453    struct bi_packed_tuple tuple = tuples[val];
454    return (tuple.hi >> 11);
455 }
456 
457 uint8_t
bi_pack_upper(enum bi_clause_subword upper,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)458 bi_pack_upper(enum bi_clause_subword upper, struct bi_packed_tuple *tuples,
459               ASSERTED unsigned tuple_count)
460 {
461    assert(upper >= BI_CLAUSE_SUBWORD_UPPER_0);
462    assert(upper <= BI_CLAUSE_SUBWORD_UPPER_7);
463 
464    return bi_clause_upper(upper - BI_CLAUSE_SUBWORD_UPPER_0, tuples,
465                           tuple_count);
466 }
467 
/* Extract nbits bits starting at bit 'offset' of the selected tuple's
 * 78-bit encoding, stored as a lo/hi pair of 64-bit words. */
uint64_t
bi_pack_tuple_bits(enum bi_clause_subword idx, struct bi_packed_tuple *tuples,
                   ASSERTED unsigned tuple_count, unsigned offset,
                   unsigned nbits)
{
   assert(idx >= BI_CLAUSE_SUBWORD_TUPLE_0);
   assert(idx <= BI_CLAUSE_SUBWORD_TUPLE_7);

   unsigned val = (idx - BI_CLAUSE_SUBWORD_TUPLE_0);
   assert(val < tuple_count);

   struct bi_packed_tuple tuple = tuples[val];

   assert(offset + nbits < 78);
   assert(nbits <= 64);

   /* (X >> start) & m
    * = (((hi << 64) | lo) >> start) & m
    * = (((hi << 64) >> start) | (lo >> start)) & m
    * = { ((hi << (64 - start)) | (lo >> start)) & m if start <= 64
    *   { ((hi >> (start - 64)) | (lo >> start)) & m if start >= 64
    * = { ((hi << (64 - start)) & m) | ((lo >> start) & m) if start <= 64
    *   { ((hi >> (start - 64)) & m) | ((lo >> start) & m) if start >= 64
    *
    * By setting m = 2^64 - 1, we justify doing the respective shifts as
    * 64-bit integers. Zero special cased to avoid undefined behaviour.
    */

   /* offset == 0 would make (64 - offset) a 64-bit shift: UB in C, so the
    * hi contribution is special-cased to 0 */
   uint64_t lo = (tuple.lo >> offset);
   uint64_t hi = (offset == 0)   ? 0
                 : (offset > 64) ? (tuple.hi >> (offset - 64))
                                 : (tuple.hi << (64 - offset));

   return (lo | hi) & ((1ULL << nbits) - 1);
}
503 
504 static inline uint16_t
bi_pack_lu(enum bi_clause_subword word,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)505 bi_pack_lu(enum bi_clause_subword word, struct bi_packed_tuple *tuples,
506            ASSERTED unsigned tuple_count)
507 {
508    return (word >= BI_CLAUSE_SUBWORD_UPPER_0)
509              ? bi_pack_upper(word, tuples, tuple_count)
510              : bi_pack_literal(word);
511 }
512 
513 uint8_t
bi_pack_sync(enum bi_clause_subword t1,enum bi_clause_subword t2,enum bi_clause_subword t3,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,bool z)514 bi_pack_sync(enum bi_clause_subword t1, enum bi_clause_subword t2,
515              enum bi_clause_subword t3, struct bi_packed_tuple *tuples,
516              ASSERTED unsigned tuple_count, bool z)
517 {
518    uint8_t sync = (bi_pack_lu(t3, tuples, tuple_count) << 0) |
519                   (bi_pack_lu(t2, tuples, tuple_count) << 3);
520 
521    if (t1 == BI_CLAUSE_SUBWORD_Z)
522       sync |= z << 6;
523    else
524       sync |= bi_pack_literal(t1) << 6;
525 
526    return sync;
527 }
528 
529 static inline uint64_t
bi_pack_t_ec(enum bi_clause_subword word,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t ec0)530 bi_pack_t_ec(enum bi_clause_subword word, struct bi_packed_tuple *tuples,
531              ASSERTED unsigned tuple_count, uint64_t ec0)
532 {
533    if (word == BI_CLAUSE_SUBWORD_CONSTANT)
534       return ec0;
535    else
536       return bi_pack_tuple_bits(word, tuples, tuple_count, 0, 60);
537 }
538 
/* Pack a 30-bit subword, which may hold the low bits of the clause
 * header, reserved zeroes, the middle 30 bits of EC0, or 30 bits of a
 * tuple starting at the given 15-bit subword position. */
static uint32_t
bi_pack_subwords_56(enum bi_clause_subword t, struct bi_packed_tuple *tuples,
                    ASSERTED unsigned tuple_count, uint64_t header,
                    uint64_t ec0, unsigned tuple_subword)
{
   switch (t) {
   case BI_CLAUSE_SUBWORD_HEADER:
      /* Low 30 bits of the packed header */
      return (header & ((1 << 30) - 1));
   case BI_CLAUSE_SUBWORD_RESERVED:
      return 0;
   case BI_CLAUSE_SUBWORD_CONSTANT:
      /* Bits 15..44 of EC0 */
      return (ec0 >> 15) & ((1 << 30) - 1);
   default:
      return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 30);
   }
}
555 
/* Pack a 15-bit subword. Depending on the slot's role in the clause
 * format this is header bits, the M0 field, part of EC0, one or two
 * tuple "upper" fields, or 15 bits of a tuple body. */
static uint16_t
bi_pack_subword(enum bi_clause_subword t, unsigned format,
                struct bi_packed_tuple *tuples, ASSERTED unsigned tuple_count,
                uint64_t header, uint64_t ec0, unsigned m0,
                unsigned tuple_subword)
{
   switch (t) {
   case BI_CLAUSE_SUBWORD_HEADER:
      /* High bits of the packed header (low 30 go via subwords_56) */
      return header >> 30;
   case BI_CLAUSE_SUBWORD_M:
      return m0;
   case BI_CLAUSE_SUBWORD_CONSTANT:
      /* Formats 5 and 10 take the low 15 bits of EC0; others take the
       * bits above the 15+30 already placed elsewhere */
      return (format == 5 || format == 10) ? (ec0 & ((1 << 15) - 1))
                                           : (ec0 >> (15 + 30));
   case BI_CLAUSE_SUBWORD_UPPER_23:
      /* Paired upper fields of tuples 2 and 3 */
      return (bi_clause_upper(2, tuples, tuple_count) << 12) |
             (bi_clause_upper(3, tuples, tuple_count) << 9);
   case BI_CLAUSE_SUBWORD_UPPER_56:
      /* Paired upper fields of tuples 5 and 6 */
      return (bi_clause_upper(5, tuples, tuple_count) << 12) |
             (bi_clause_upper(6, tuples, tuple_count) << 9);
   case BI_CLAUSE_SUBWORD_UPPER_0 ... BI_CLAUSE_SUBWORD_UPPER_7:
      return bi_pack_upper(t, tuples, tuple_count) << 12;
   default:
      return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 15);
   }
}
582 
/* Pack and emit one 128-bit quadword of a clause, according to the clause
 * format at 'index' in bi_clause_formats. EC0 is 60-bits (bottom 4
 * already shifted off). */
void
bi_pack_format(struct util_dynarray *emission, unsigned index,
               struct bi_packed_tuple *tuples, ASSERTED unsigned tuple_count,
               uint64_t header, uint64_t ec0, unsigned m0, bool z)
{
   struct bi_clause_format format = bi_clause_formats[index];

   uint8_t sync = bi_pack_sync(format.tag_1, format.tag_2, format.tag_3, tuples,
                               tuple_count, z);

   uint64_t s0_s3 = bi_pack_t_ec(format.s0_s3, tuples, tuple_count, ec0);

   uint16_t s4 = bi_pack_subword(format.s4, format.format, tuples, tuple_count,
                                 header, ec0, m0, 4);

   /* Formats 2 and 7 place the s5/s6 tuple bits at subword 0, not 3 */
   uint32_t s5_s6 =
      bi_pack_subwords_56(format.s5_s6, tuples, tuple_count, header, ec0,
                          (format.format == 2 || format.format == 7) ? 0 : 3);

   uint64_t s7 = bi_pack_subword(format.s7, format.format, tuples, tuple_count,
                                 header, ec0, m0, 2);

   /* Now that subwords are packed, split into 64-bit halves and emit */
   uint64_t lo = sync | ((s0_s3 & ((1ull << 56) - 1)) << 8);
   uint64_t hi = (s0_s3 >> 56) | ((uint64_t)s4 << 4) | ((uint64_t)s5_s6 << 19) |
                 ((uint64_t)s7 << 49);

   util_dynarray_append(emission, uint64_t, lo);
   util_dynarray_append(emission, uint64_t, hi);
}
614 
/* Pack and emit a full clause: pack each tuple, then emit the clause's
 * quadwords according to the format tables, then emit any remaining
 * embedded constants. next_1/next_2 are the successor clauses used for
 * header scoreboarding. */
static void
bi_pack_clause(bi_context *ctx, bi_clause *clause, bi_clause *next_1,
               bi_clause *next_2, struct util_dynarray *emission,
               gl_shader_stage stage)
{
   struct bi_packed_tuple ins[8] = {0};

   for (unsigned i = 0; i < clause->tuple_count; ++i) {
      /* The "previous" tuple of tuple 0 wraps around to the last tuple,
       * whose writes are encoded by the first */
      unsigned prev = ((i == 0) ? clause->tuple_count : i) - 1;
      ins[i] = bi_pack_tuple(clause, &clause->tuples[i], &clause->tuples[prev],
                             i == 0, stage);

      bi_instr *add = clause->tuples[i].add;

      /* Different GPUs support different forms of the CLPER.i32
       * instruction. Check we use the right one for the target.
       */
      if (add && add->op == BI_OPCODE_CLPER_OLD_I32)
         assert(ctx->quirks & BIFROST_LIMITED_CLPER);
      else if (add && add->op == BI_OPCODE_CLPER_I32)
         assert(!(ctx->quirks & BIFROST_LIMITED_CLPER));
   }

   bool ec0_packed = bi_ec0_packed(clause->tuple_count);

   /* If EC0 is packed inline, constant 0 must exist (it may be zero) */
   if (ec0_packed)
      clause->constant_count = MAX2(clause->constant_count, 1);

   /* Remaining constants are emitted two per quadword */
   unsigned constant_quads =
      DIV_ROUND_UP(clause->constant_count - (ec0_packed ? 1 : 0), 2);

   uint64_t header = bi_pack_header(clause, next_1, next_2);
   uint64_t ec0 = (clause->constants[0] >> 4);
   unsigned m0 = (clause->pcrel_idx == 0) ? 4 : 0;

   /* Number of quadwords a clause of N tuples occupies */
   unsigned counts[8] = {
      1, 2, 3, 3, 4, 5, 5, 6,
   };

   /* Format-table indices for each quadword, per tuple count */
   unsigned indices[8][6] = {
      {1},          {0, 2},           {0, 3, 4},        {0, 3, 6},
      {0, 3, 7, 8}, {0, 3, 5, 9, 10}, {0, 3, 5, 9, 11}, {0, 3, 5, 9, 12, 13},
   };

   unsigned count = counts[clause->tuple_count - 1];

   for (unsigned pos = 0; pos < count; ++pos) {
      ASSERTED unsigned idx = indices[clause->tuple_count - 1][pos];
      assert(bi_clause_formats[idx].pos == pos);
      assert((bi_clause_formats[idx].tag_1 == BI_CLAUSE_SUBWORD_Z) ==
             (pos == count - 1));

      /* Whether to end the clause immediately after the last tuple */
      bool z = (constant_quads == 0);

      bi_pack_format(emission, indices[clause->tuple_count - 1][pos], ins,
                     clause->tuple_count, header, ec0, m0, z);
   }

   /* Pack the remaining constants */

   for (unsigned pos = 0; pos < constant_quads; ++pos) {
      bi_pack_constants(clause->tuple_count, clause->constants, pos,
                        constant_quads, ec0_packed, emission);
   }
}
681 
/* After a clause containing a BLEND op is emitted, record the current
 * emission offset as that render target's blend return offset, so the
 * driver knows where execution resumes after the blend shader. */
static void
bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
                          const bi_clause *clause)
{
   /* No need to collect return addresses when we're in a blend shader. */
   if (ctx->inputs->is_blend)
      return;

   /* Only the ADD op of the clause's final tuple is considered */
   const bi_tuple *tuple = &clause->tuples[clause->tuple_count - 1];
   const bi_instr *ins = tuple->add;

   if (!ins || ins->op != BI_OPCODE_BLEND)
      return;

   /* The FAU index, relative to BLEND_0, selects the blend slot */
   unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0;
   assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend));
   /* Each slot's return offset may only be recorded once */
   assert(!ctx->info.bifrost->blend[loc].return_offset);
   ctx->info.bifrost->blend[loc].return_offset =
      util_dynarray_num_elements(emission, uint8_t);
   /* Return offsets must be 8-byte aligned */
   assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7));
}
703 
704 /*
705  * The second register destination of TEXC_DUAL is encoded into the texture
706  * operation descriptor during register allocation. It's dropped as late as
707  * possible (instruction packing) so the register remains recorded in the IR,
708  * for clause scoreboarding and so on.
709  */
/* Rewrite TEXC_DUAL to plain TEXC, dropping the second destination (see
 * the comment above: it was already folded into the texture operation
 * descriptor during RA). */
static void
bi_lower_texc_dual(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, I) {
      if (I->op == BI_OPCODE_TEXC_DUAL) {
         /* In hardware, TEXC has 1 destination */
         I->op = BI_OPCODE_TEXC;
         bi_drop_dests(I, 1);
      }
   }
}
721 
/* Pack the whole program: lower TEXC_DUAL, fix up branch offsets, then
 * emit each clause of each block into 'emission'. Returns the size in
 * bytes of the final clause emitted (previous_size is deliberately reset
 * before each clause). */
unsigned
bi_pack(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned previous_size = emission->size;

   bi_lower_texc_dual(ctx);

   bi_foreach_block(ctx, block) {
      bi_assign_branch_offset(ctx, block);

      bi_foreach_clause_in_block(block, clause) {
         bool is_last = (clause->link.next == &block->clauses);

         /* Get the succeeding clauses, either two successors of
          * the block for the last clause in the block or just
          * the next clause within the block */

         bi_clause *next = NULL, *next_2 = NULL;

         if (is_last) {
            next = bi_next_clause(ctx, block->successors[0], NULL);
            next_2 = bi_next_clause(ctx, block->successors[1], NULL);
         } else {
            next = bi_next_clause(ctx, block, clause);
         }

         /* Snapshot so the return value covers only this clause */
         previous_size = emission->size;

         bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage);

         /* Blend return addresses are only collected for clauses with a
          * following clause in the same block */
         if (!is_last)
            bi_collect_blend_ret_addr(ctx, emission, clause);
      }
   }

   return emission->size - previous_size;
}
759