xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/bifrost/bi_schedule.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <[email protected]>
25  */
26 
27 #include "bi_builder.h"
28 #include "compiler.h"
29 
30 /* Arguments common to worklist, passed by value for convenience */
31 
32 struct bi_worklist {
33    /* # of instructions in the block */
34    unsigned count;
35 
36    /* Instructions in the block */
37    bi_instr **instructions;
38 
39    /* Bitset of instructions in the block ready for scheduling */
40    BITSET_WORD *worklist;
41 
42    /* The backwards dependency graph. dep_counts[i] is the number of
43     * unscheduled instructions that must still be scheduled after (before)
44     * instruction i. dependents[i] is the set of instructions that must be
45     * scheduled before (after) instruction i. */
46    unsigned *dep_counts;
47    BITSET_WORD **dependents;
48 };
49 
50 /* State of a single tuple and clause under construction */
51 
52 struct bi_reg_state {
53    /* Number of register writes */
54    unsigned nr_writes;
55 
56    /* Register reads, expressed as (equivalence classes of)
57     * sources. Only 3 reads are allowed, but up to 2 may spill as
58     * "forced" for the next scheduled tuple, provided such a tuple
59     * can be constructed */
60    bi_index reads[5];
61    unsigned nr_reads;
62 
63    /* The previous tuple scheduled (= the next tuple executed in the
64     * program) may require certain writes, in order to bypass the register
65     * file and use a temporary passthrough for the value. Up to 2 such
66     * constraints are architecturally satisfiable */
67    unsigned forced_count;
68    bi_index forceds[2];
69 };
70 
71 struct bi_tuple_state {
72    /* Is this the last tuple in the clause */
73    bool last;
74 
75    /* Scheduled ADD instruction, or null if none */
76    bi_instr *add;
77 
78    /* Reads for previous (succeeding) tuple */
79    bi_index prev_reads[5];
80    unsigned nr_prev_reads;
81    bi_tuple *prev;
82 
83    /* Register slot state for current tuple */
84    struct bi_reg_state reg;
85 
86    /* Constants are shared in the tuple. If constant_count is nonzero, it
87     * is the number of embedded constants in use. Otherwise, fau is the slot
88     * read from FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads
89     * zero, but within a tuple, that should be encoded as constant_count != 0
90     * and constants[0] = constants[1] = 0 */
91    unsigned constant_count;
92 
93    union {
94       uint32_t constants[2];
95       enum bir_fau fau;
96    };
97 
98    unsigned pcrel_idx;
99 };
100 
101 struct bi_const_state {
102    unsigned constant_count;
103    bool pcrel; /* applies to first const */
104    uint32_t constants[2];
105 
106    /* Index of the constant into the clause */
107    unsigned word_idx;
108 };
109 
110 enum bi_ftz_state {
111    /* No flush-to-zero state assigned yet */
112    BI_FTZ_STATE_NONE,
113 
114    /* Never flush-to-zero */
115    BI_FTZ_STATE_DISABLE,
116 
117    /* Always flush-to-zero */
118    BI_FTZ_STATE_ENABLE,
119 };
120 
121 /* At this point, pseudoinstructions have been lowered so sources/destinations
122  * are limited to what's physically supported.
123  */
124 #define BI_MAX_PHYS_SRCS  4
125 #define BI_MAX_PHYS_DESTS 2
126 
127 struct bi_clause_state {
128    /* Has a message-passing instruction already been assigned? */
129    bool message;
130 
131    /* Indices already accessed, this needs to be tracked to avoid hazards
132     * around message-passing instructions */
133    unsigned access_count;
134    bi_index accesses[(BI_MAX_PHYS_SRCS + BI_MAX_PHYS_DESTS) * 16];
135 
136    unsigned tuple_count;
137    struct bi_const_state consts[8];
138 
139    /* Numerical state of the clause */
140    enum bi_ftz_state ftz;
141 };
142 
143 /* Determines message type by checking the table and a few special cases. Only
144  * case missing is tilebuffer instructions that access depth/stencil, which
145  * require a Z_STENCIL message (to implement
146  * ARM_shader_framebuffer_fetch_depth_stencil) */
147 
148 static enum bifrost_message_type
149 bi_message_type_for_instr(bi_instr *ins)
150 {
151    enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
152    bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
153 
154    if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
155       return BIFROST_MESSAGE_Z_STENCIL;
156 
157    if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
158       return BIFROST_MESSAGE_ATTRIBUTE;
159 
160    return msg;
161 }
162 
163 /* Attribute, texture, and UBO load (attribute message) instructions support
164  * bindless, so just check the message type */
165 
166 ASSERTED static bool
167 bi_supports_dtsel(bi_instr *ins)
168 {
169    switch (bi_message_type_for_instr(ins)) {
170    case BIFROST_MESSAGE_ATTRIBUTE:
171       return ins->op != BI_OPCODE_LD_GCLK_U64;
172    case BIFROST_MESSAGE_TEX:
173       return true;
174    default:
175       return false;
176    }
177 }
178 
179 /* Adds an edge to the dependency graph */
180 
181 static void
182 bi_push_dependency(unsigned parent, unsigned child, BITSET_WORD **dependents,
183                    unsigned *dep_counts)
184 {
185    if (!BITSET_TEST(dependents[parent], child)) {
186       BITSET_SET(dependents[parent], child);
187       dep_counts[child]++;
188    }
189 }
190 
191 static void
192 add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
193                BITSET_WORD **dependents, unsigned *dep_counts)
194 {
195    assert(index < 64);
196    util_dynarray_foreach(table + index, unsigned, parent)
197       bi_push_dependency(*parent, child, dependents, dep_counts);
198 }
199 
200 static void
201 mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
202 {
203    assert(index < 64);
204    util_dynarray_append(&table[index], unsigned, parent);
205 }
206 
207 static bool
208 bi_is_sched_barrier(bi_instr *I)
209 {
210    switch (I->op) {
211    case BI_OPCODE_BARRIER:
212    case BI_OPCODE_DISCARD_F32:
213       return true;
214    default:
215       return false;
216    }
217 }
218 
219 static void
220 bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
221 {
222    struct util_dynarray last_read[64], last_write[64];
223 
224    for (unsigned i = 0; i < 64; ++i) {
225       util_dynarray_init(&last_read[i], NULL);
226       util_dynarray_init(&last_write[i], NULL);
227    }
228 
229    /* Initialize dependency graph */
230    for (unsigned i = 0; i < st.count; ++i) {
231       st.dependents[i] = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
232 
233       st.dep_counts[i] = 0;
234    }
235 
236    unsigned prev_msg = ~0;
237 
238    /* Populate dependency graph */
239    for (signed i = st.count - 1; i >= 0; --i) {
240       bi_instr *ins = st.instructions[i];
241 
242       bi_foreach_src(ins, s) {
243          if (ins->src[s].type != BI_INDEX_REGISTER)
244             continue;
245          unsigned count = bi_count_read_registers(ins, s);
246 
247          for (unsigned c = 0; c < count; ++c)
248             add_dependency(last_write, ins->src[s].value + c, i, st.dependents,
249                            st.dep_counts);
250       }
251 
252       /* Keep message-passing ops in order. (This pass only cares
253        * about bundling; reordering of message-passing instructions
254        * happens during earlier scheduling.) */
255 
256       if (bi_message_type_for_instr(ins)) {
257          if (prev_msg != ~0)
258             bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
259 
260          prev_msg = i;
261       }
262 
263       /* Handle schedule barriers, adding all the deps */
264       if (inorder || bi_is_sched_barrier(ins)) {
265          for (unsigned j = 0; j < st.count; ++j) {
266             if (i == j)
267                continue;
268 
269             bi_push_dependency(MAX2(i, j), MIN2(i, j), st.dependents,
270                                st.dep_counts);
271          }
272       }
273 
274       bi_foreach_dest(ins, d) {
275          assert(ins->dest[d].type == BI_INDEX_REGISTER);
276          unsigned dest = ins->dest[d].value;
277 
278          unsigned count = bi_count_write_registers(ins, d);
279 
280          for (unsigned c = 0; c < count; ++c) {
281             add_dependency(last_read, dest + c, i, st.dependents,
282                            st.dep_counts);
283             add_dependency(last_write, dest + c, i, st.dependents,
284                            st.dep_counts);
285             mark_access(last_write, dest + c, i);
286          }
287       }
288 
289       /* Blend shaders are allowed to clobber R0-R15. Treat these
290        * registers like extra destinations for scheduling purposes.
291        */
292       if (ins->op == BI_OPCODE_BLEND && !is_blend) {
293          for (unsigned c = 0; c < 16; ++c) {
294             add_dependency(last_read, c, i, st.dependents, st.dep_counts);
295             add_dependency(last_write, c, i, st.dependents, st.dep_counts);
296             mark_access(last_write, c, i);
297          }
298       }
299 
300       bi_foreach_src(ins, s) {
301          if (ins->src[s].type != BI_INDEX_REGISTER)
302             continue;
303 
304          unsigned count = bi_count_read_registers(ins, s);
305 
306          for (unsigned c = 0; c < count; ++c)
307             mark_access(last_read, ins->src[s].value + c, i);
308       }
309    }
310 
311    /* If there is a branch, all instructions depend on it, as interblock
312     * execution must be purely in-order */
313 
314    bi_instr *last = st.instructions[st.count - 1];
315    if (last->branch_target || last->op == BI_OPCODE_JUMP) {
316       for (signed i = st.count - 2; i >= 0; --i)
317          bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
318    }
319 
320    /* Free the intermediate structures */
321    for (unsigned i = 0; i < 64; ++i) {
322       util_dynarray_fini(&last_read[i]);
323       util_dynarray_fini(&last_write[i]);
324    }
325 }
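/* Worked example of the construction above (illustrative only): for the
 * straight-line sequence
 *
 *    i0: FADD.f32 r0, r1, r2
 *    i1: FADD.f32 r3, r0, r0      <- reads r0, written by i0
 *    i2: FADD.f32 r4, r3, r1      <- reads r3, written by i1
 *
 * the bottom-up walk records dependents[i2] = {i1} and dependents[i1] = {i0},
 * so dep_counts = {1, 1, 0}. Only i2 starts on the worklist: clauses are
 * filled from the end of the block towards the start, with i1 and then i0
 * becoming ready as their uses are scheduled.
 */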
326 
327 /* Scheduler pseudoinstruction lowerings to enable instruction pairings.
328  * Currently only supports CUBEFACE -> *CUBEFACE1/+CUBEFACE2
329  */
330 
331 static bi_instr *
332 bi_lower_cubeface(bi_context *ctx, struct bi_clause_state *clause,
333                   struct bi_tuple_state *tuple)
334 {
335    bi_instr *pinstr = tuple->add;
336    bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
337    bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0], pinstr->src[0],
338                                          pinstr->src[1], pinstr->src[2]);
339 
340    pinstr->op = BI_OPCODE_CUBEFACE2;
341    pinstr->dest[0] = pinstr->dest[1];
342    bi_drop_dests(pinstr, 1);
343 
344    pinstr->src[0] = cubeface1->dest[0];
345    bi_drop_srcs(pinstr, 1);
346 
347    return cubeface1;
348 }
349 
350 /* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
351  * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
352  * arguments (rbase, address lo, address hi, rbase) */
353 
354 static bi_instr *
355 bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause,
356                 struct bi_tuple_state *tuple)
357 {
358    bi_instr *pinstr = tuple->add;
359    bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
360    bi_instr *atom_c = bi_atom_c_return_i32(&b, pinstr->src[1], pinstr->src[2],
361                                            pinstr->src[0], pinstr->atom_opc);
362 
363    if (bi_is_null(pinstr->dest[0]))
364       atom_c->op = BI_OPCODE_ATOM_C_I32;
365 
366    bi_instr *atom_cx =
367       bi_atom_cx_to(&b, pinstr->dest[0], pinstr->src[0], pinstr->src[1],
368                     pinstr->src[2], pinstr->src[0], pinstr->sr_count);
369    tuple->add = atom_cx;
370    bi_remove_instruction(pinstr);
371 
372    return atom_c;
373 }
374 
375 static bi_instr *
376 bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause,
377                  struct bi_tuple_state *tuple)
378 {
379    bi_instr *pinstr = tuple->add;
380    bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
381    bi_instr *atom_c = bi_atom_c1_return_i32(&b, pinstr->src[0], pinstr->src[1],
382                                             pinstr->atom_opc);
383 
384    if (bi_is_null(pinstr->dest[0]))
385       atom_c->op = BI_OPCODE_ATOM_C1_I32;
386 
387    bi_instr *atom_cx =
388       bi_atom_cx_to(&b, pinstr->dest[0], bi_null(), pinstr->src[0],
389                     pinstr->src[1], bi_dontcare(&b), pinstr->sr_count);
390    tuple->add = atom_cx;
391    bi_remove_instruction(pinstr);
392 
393    return atom_c;
394 }
395 
396 static bi_instr *
397 bi_lower_seg_add(bi_context *ctx, struct bi_clause_state *clause,
398                  struct bi_tuple_state *tuple)
399 {
400    bi_instr *pinstr = tuple->add;
401    bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
402 
403    bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
404                                  pinstr->preserve_null, pinstr->seg);
405 
406    pinstr->op = BI_OPCODE_SEG_ADD;
407    pinstr->src[0] = pinstr->src[1];
408    bi_drop_srcs(pinstr, 1);
409 
410    assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
411    pinstr->dest[0].value += 1;
412 
413    return fma;
414 }
415 
416 static bi_instr *
417 bi_lower_dtsel(bi_context *ctx, struct bi_clause_state *clause,
418                struct bi_tuple_state *tuple)
419 {
420    bi_instr *add = tuple->add;
421    bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
422 
423    bi_instr *dtsel =
424       bi_dtsel_imm_to(&b, bi_temp(b.shader), add->src[0], add->table);
425    assert(add->nr_srcs >= 1);
426    add->src[0] = dtsel->dest[0];
427 
428    assert(bi_supports_dtsel(add));
429    return dtsel;
430 }
431 
432 /* Flatten linked list to array for O(1) indexing */
433 
434 static bi_instr **
435 bi_flatten_block(bi_block *block, unsigned *len)
436 {
437    if (list_is_empty(&block->instructions))
438       return NULL;
439 
440    *len = list_length(&block->instructions);
441    bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
442 
443    unsigned i = 0;
444 
445    bi_foreach_instr_in_block(block, ins)
446       instructions[i++] = ins;
447 
448    return instructions;
449 }
450 
451 /* The worklist tracks instructions without outstanding dependencies. For
452  * debug, in-order scheduling may be forced, fully serializing the
453  * dependency graph. */
454 
455 static struct bi_worklist
456 bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
457 {
458    struct bi_worklist st = {};
459    st.instructions = bi_flatten_block(block, &st.count);
460 
461    if (!st.count)
462       return st;
463 
464    st.dependents = calloc(st.count, sizeof(st.dependents[0]));
465    st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
466 
467    bi_create_dependency_graph(st, inorder, is_blend);
468    st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
469 
470    for (unsigned i = 0; i < st.count; ++i) {
471       if (st.dep_counts[i] == 0)
472          BITSET_SET(st.worklist, i);
473    }
474 
475    return st;
476 }
477 
478 static void
479 bi_free_worklist(struct bi_worklist st)
480 {
481    free(st.dep_counts);
482    free(st.dependents);
483    free(st.instructions);
484    free(st.worklist);
485 }
486 
487 static void
488 bi_update_worklist(struct bi_worklist st, unsigned idx)
489 {
490    assert(st.dep_counts[idx] == 0);
491 
492    if (!st.dependents[idx])
493       return;
494 
495    /* Iterate each dependent to remove one dependency (`done`),
496     * adding dependents to the worklist where possible. */
497 
498    unsigned i;
499    BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
500       assert(st.dep_counts[i] != 0);
501       unsigned new_deps = --st.dep_counts[i];
502 
503       if (new_deps == 0)
504          BITSET_SET(st.worklist, i);
505    }
506 
507    free(st.dependents[idx]);
508 }
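/* Minimal sketch of how the worklist helpers compose (hypothetical consumer
 * for illustration; pick_ready_index() and emit() are placeholders, and the
 * real clause builder below layers tuple/clause legality on top of this):
 *
 *    struct bi_worklist st = bi_initialize_worklist(block, false, false);
 *
 *    for (unsigned left = st.count; left > 0; --left) {
 *       unsigned i = pick_ready_index(st);   // e.g. via bi_choose_index
 *       bi_instr *ins = st.instructions[i];
 *
 *       BITSET_CLEAR(st.worklist, i);
 *       bi_update_worklist(st, i);           // may ready further instructions
 *       emit(ins);                           // place into the clause
 *    }
 *
 *    bi_free_worklist(st);
 */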
509 
510 /* Scheduler predicates */
511 
512 /* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
513 static bool
514 bi_can_iaddc(bi_instr *ins)
515 {
516    return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
517            ins->src[0].swizzle == BI_SWIZZLE_H01 &&
518            ins->src[1].swizzle == BI_SWIZZLE_H01);
519 }
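/* For example (illustrative), a plain 32-bit add with default swizzles,
 *
 *    +IADD.u32 r0, r1, r2
 *
 * may be placed on the FMA unit as
 *
 *    *IADDC.i32 r0, r1, r2, #0
 *
 * using a zero carry-in; bi_take_instr() performs that rewrite when it
 * actually assigns the instruction to FMA.
 */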
520 
521 /*
522  * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
523  * encodings are permitted by swapping operands; however, this scheme fails if
524  * both operands are equal. Test for this case.
525  */
526 static bool
527 bi_impacted_abs(bi_instr *I)
528 {
529    return I->src[0].abs && I->src[1].abs &&
530           bi_is_word_equiv(I->src[0], I->src[1]);
531 }
532 
533 bool
534 bi_can_fma(bi_instr *ins)
535 {
536    /* +IADD.i32 -> *IADDC.i32 */
537    if (bi_can_iaddc(ins))
538       return true;
539 
540    /* +MUX -> *CSEL */
541    if (bi_can_replace_with_csel(ins))
542       return true;
543 
544    /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
545    if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
546       return false;
547 
548    /* TODO: some additional fp16 constraints */
549    return bi_opcode_props[ins->op].fma;
550 }
551 
552 static bool
553 bi_impacted_fadd_widens(bi_instr *I)
554 {
555    enum bi_swizzle swz0 = I->src[0].swizzle;
556    enum bi_swizzle swz1 = I->src[1].swizzle;
557 
558    return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
559           (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
560           (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
561 }
562 
563 bool
564 bi_can_add(bi_instr *ins)
565 {
566    /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
567    if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
568       return false;
569 
570    /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
571    if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
572       return false;
573 
574    /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
575    if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
576       return false;
577 
578    /* TODO: some additional fp16 constraints */
579    return bi_opcode_props[ins->op].add;
580 }
581 
582 /* Architecturally, no single instruction has a "not last" constraint. However,
583  * pseudoinstructions writing multiple destinations (expanding to multiple
584  * paired instructions) can run afoul of the "no two writes on the last clause"
585  * constraint, so we check for that here.
586  *
587  * Exception to the exception: TEXC_DUAL, which writes to multiple sets of
588  * staging registers. Staging registers bypass the usual register write
589  * mechanism so this restriction does not apply.
590  */
591 
592 static bool
593 bi_must_not_last(bi_instr *ins)
594 {
595    return (ins->nr_dests >= 2) && (ins->op != BI_OPCODE_TEXC_DUAL);
596 }
597 
598 /* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
599  * treat it as a message-passing instruction for the purpose of scheduling
600  * despite passing no logical message. Otherwise invalid encoding faults may
601  * be raised for unknown reasons (possibly an erratum).
602  */
603 
604 bool
605 bi_must_message(bi_instr *ins)
606 {
607    return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
608           (ins->op == BI_OPCODE_DISCARD_F32);
609 }
610 
611 static bool
612 bi_fma_atomic(enum bi_opcode op)
613 {
614    switch (op) {
615    case BI_OPCODE_ATOM_C_I32:
616    case BI_OPCODE_ATOM_C_I64:
617    case BI_OPCODE_ATOM_C1_I32:
618    case BI_OPCODE_ATOM_C1_I64:
619    case BI_OPCODE_ATOM_C1_RETURN_I32:
620    case BI_OPCODE_ATOM_C1_RETURN_I64:
621    case BI_OPCODE_ATOM_C_RETURN_I32:
622    case BI_OPCODE_ATOM_C_RETURN_I64:
623    case BI_OPCODE_ATOM_POST_I32:
624    case BI_OPCODE_ATOM_POST_I64:
625    case BI_OPCODE_ATOM_PRE_I64:
626       return true;
627    default:
628       return false;
629    }
630 }
631 
632 bool
633 bi_reads_zero(bi_instr *ins)
634 {
635    return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
636 }
637 
638 bool
639 bi_reads_temps(bi_instr *ins, unsigned src)
640 {
641    switch (ins->op) {
642    /* Cannot permute a temporary */
643    case BI_OPCODE_CLPER_I32:
644    case BI_OPCODE_CLPER_OLD_I32:
645       return src != 0;
646 
647    /* ATEST isn't supposed to be restricted, but in practice it always
648     * wants to source its coverage mask input (source 0) from register 60,
649     * which won't work properly if we put the input in a temp. This
650     * requires workarounds in both RA and clause scheduling.
651     */
652    case BI_OPCODE_ATEST:
653       return src != 0;
654 
655    case BI_OPCODE_IMULD:
656       return false;
657    default:
658       return true;
659    }
660 }
661 
662 static bool
663 bi_impacted_t_modifiers(bi_instr *I, unsigned src)
664 {
665    assert(src < I->nr_srcs);
666    enum bi_swizzle swizzle = I->src[src].swizzle;
667 
668    switch (I->op) {
669    case BI_OPCODE_F16_TO_F32:
670    case BI_OPCODE_F16_TO_S32:
671    case BI_OPCODE_F16_TO_U32:
672    case BI_OPCODE_MKVEC_V2I16:
673    case BI_OPCODE_S16_TO_F32:
674    case BI_OPCODE_S16_TO_S32:
675    case BI_OPCODE_U16_TO_F32:
676    case BI_OPCODE_U16_TO_U32:
677       return (swizzle != BI_SWIZZLE_H00);
678 
679    case BI_OPCODE_BRANCH_F32:
680    case BI_OPCODE_LOGB_F32:
681    case BI_OPCODE_ILOGB_F32:
682    case BI_OPCODE_FADD_F32:
683    case BI_OPCODE_FCMP_F32:
684    case BI_OPCODE_FREXPE_F32:
685    case BI_OPCODE_FREXPM_F32:
686    case BI_OPCODE_FROUND_F32:
687       return (swizzle != BI_SWIZZLE_H01);
688 
689    case BI_OPCODE_IADD_S32:
690    case BI_OPCODE_IADD_U32:
691    case BI_OPCODE_ISUB_S32:
692    case BI_OPCODE_ISUB_U32:
693    case BI_OPCODE_IADD_V4S8:
694    case BI_OPCODE_IADD_V4U8:
695    case BI_OPCODE_ISUB_V4S8:
696    case BI_OPCODE_ISUB_V4U8:
697       return (src == 1) && (swizzle != BI_SWIZZLE_H01);
698 
699    case BI_OPCODE_S8_TO_F32:
700    case BI_OPCODE_S8_TO_S32:
701    case BI_OPCODE_U8_TO_F32:
702    case BI_OPCODE_U8_TO_U32:
703       return (swizzle != BI_SWIZZLE_B0000);
704 
705    case BI_OPCODE_V2S8_TO_V2F16:
706    case BI_OPCODE_V2S8_TO_V2S16:
707    case BI_OPCODE_V2U8_TO_V2F16:
708    case BI_OPCODE_V2U8_TO_V2U16:
709       return (swizzle != BI_SWIZZLE_B0022);
710 
711    case BI_OPCODE_IADD_V2S16:
712    case BI_OPCODE_IADD_V2U16:
713    case BI_OPCODE_ISUB_V2S16:
714    case BI_OPCODE_ISUB_V2U16:
715       return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
716 
717 #if 0
718         /* Restriction on IADD in 64-bit clauses on G72 */
719         case BI_OPCODE_IADD_S64:
720         case BI_OPCODE_IADD_U64:
721                 return (src == 1) && (swizzle != BI_SWIZZLE_D0);
722 #endif
723 
724    default:
725       return false;
726    }
727 }
728 
729 bool
730 bi_reads_t(bi_instr *ins, unsigned src)
731 {
732    /* Branch offset cannot come from passthrough */
733    if (bi_opcode_props[ins->op].branch)
734       return src != 2;
735 
736    /* Table can never read passthrough */
737    if (bi_opcode_props[ins->op].table)
738       return false;
739 
740    /* Staging register reads may happen before the succeeding register
741     * block encodes a write, so effectively there is no passthrough */
742    if (bi_is_staging_src(ins, src))
743       return false;
744 
745    /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
746     * same-cycle temporaries. Check the list for these hazards. */
747    if (bi_impacted_t_modifiers(ins, src))
748       return false;
749 
750    /* Descriptor must not come from a passthrough */
751    switch (ins->op) {
752    case BI_OPCODE_LD_CVT:
753    case BI_OPCODE_LD_TILE:
754    case BI_OPCODE_ST_CVT:
755    case BI_OPCODE_ST_TILE:
756    case BI_OPCODE_TEXC:
757    case BI_OPCODE_TEXC_DUAL:
758       return src != 2;
759    case BI_OPCODE_BLEND:
760       return src != 2 && src != 3;
761 
762    /* +JUMP can't read the offset from T */
763    case BI_OPCODE_JUMP:
764       return false;
765 
766    /* Else, just check if we can read any temps */
767    default:
768       return bi_reads_temps(ins, src);
769    }
770 }
771 
772 /* Counts the number of 64-bit constants required by a clause. TODO: We
773  * might want to account for merging; right now we overestimate, but
774  * that's probably fine most of the time */
775 
776 static unsigned
777 bi_nconstants(struct bi_clause_state *clause)
778 {
779    unsigned count_32 = 0;
780 
781    for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
782       count_32 += clause->consts[i].constant_count;
783 
784    return DIV_ROUND_UP(count_32, 2);
785 }
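/* Worked example (illustrative): per-tuple constant counts of {2, 1, 0, 1}
 * give count_32 = 4, i.e. DIV_ROUND_UP(4, 2) = 2 64-bit constant words,
 * before any merging. */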
786 
787 /* Would there be space for constants if we added one tuple? */
788 
789 static bool
790 bi_space_for_more_constants(struct bi_clause_state *clause)
791 {
792    return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
793 }
794 
795 /* Updates the FAU assignment for a tuple. A valid FAU assignment must be
796  * possible (as a precondition), though not necessarily on the selected unit;
797  * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
798  * bi_instr_schedulable */
799 
800 static bool
801 bi_update_fau(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
802               bi_instr *instr, bool fma, bool destructive)
803 {
804    /* Maintain our own constants, for nondestructive mode */
805    uint32_t copied_constants[2], copied_count;
806    unsigned *constant_count = &tuple->constant_count;
807    uint32_t *constants = tuple->constants;
808    enum bir_fau fau = tuple->fau;
809 
810    if (!destructive) {
811       memcpy(copied_constants, tuple->constants,
812              (*constant_count) * sizeof(constants[0]));
813       copied_count = tuple->constant_count;
814 
815       constant_count = &copied_count;
816       constants = copied_constants;
817    }
818 
819    bi_foreach_src(instr, s) {
820       bi_index src = instr->src[s];
821 
822       if (src.type == BI_INDEX_FAU) {
823          bool no_constants = *constant_count == 0;
824          bool no_other_fau = (fau == src.value) || !fau;
825          bool mergable = no_constants && no_other_fau;
826 
827          if (destructive) {
828             assert(mergable);
829             tuple->fau = src.value;
830          } else if (!mergable) {
831             return false;
832          }
833 
834          fau = src.value;
835       } else if (src.type == BI_INDEX_CONSTANT) {
836          /* No need to reserve space if we have a fast 0 */
837          if (src.value == 0 && fma && bi_reads_zero(instr))
838             continue;
839 
840          /* If there is a branch target, #0 by convention is the
841           * PC-relative offset to the target */
842          bool pcrel = instr->branch_target && src.value == 0;
843          bool found = false;
844 
845          for (unsigned i = 0; i < *constant_count; ++i) {
846             found |= (constants[i] == src.value) && (i != tuple->pcrel_idx);
847          }
848 
849          /* pcrel constants are unique, so don't match */
850          if (found && !pcrel)
851             continue;
852 
853          bool no_fau = (*constant_count > 0) || !fau;
854          bool mergable = no_fau && ((*constant_count) < 2);
855 
856          if (destructive) {
857             assert(mergable);
858 
859             if (pcrel)
860                tuple->pcrel_idx = *constant_count;
861          } else if (!mergable)
862             return false;
863 
864          constants[(*constant_count)++] = src.value;
865       }
866    }
867 
868    /* Constants per clause may be limited by tuple count */
869    bool room_for_constants =
870       (*constant_count == 0) || bi_space_for_more_constants(clause);
871 
872    if (destructive)
873       assert(room_for_constants);
874    else if (!room_for_constants)
875       return false;
876 
877    return true;
878 }
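/* Sketch of the merge rules enforced above (illustrative): within one tuple
 * the FAU slot and embedded constants are mutually exclusive, and at most two
 * distinct 32-bit constants fit. So a candidate reading a uniform merges into
 * a tuple with no constants yet, but fails the non-destructive test against a
 * tuple that already embeds a constant; likewise a third distinct constant is
 * rejected and must wait for another tuple. */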
879 
880 /* Given an in-progress tuple, a candidate new instruction to add to the tuple,
881  * and a source (index) from that candidate, determine whether this source is
882  * "new", in the sense of requiring an additional read slot. That is, checks
883  * whether the specified source reads from the register file via a read slot
884  * (determined by its type and placement) and whether the source was already
885  * specified by a prior read slot (to avoid double counting) */
886 
887 static bool
888 bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
889 {
890    assert(src_idx < instr->nr_srcs);
891    bi_index src = instr->src[src_idx];
892 
893    /* Only consider sources which come from the register file */
894    if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
895       return false;
896 
897    /* Staging register reads bypass the usual register file mechanism */
898    if (bi_is_staging_src(instr, src_idx))
899       return false;
900 
901    /* If a source is already read in the tuple, it is already counted */
902    for (unsigned t = 0; t < reg->nr_reads; ++t)
903       if (bi_is_word_equiv(src, reg->reads[t]))
904          return false;
905 
906    /* If a source is read in _this instruction_, it is already counted */
907    for (unsigned t = 0; t < src_idx; ++t)
908       if (bi_is_word_equiv(src, instr->src[t]))
909          return false;
910 
911    return true;
912 }
913 
914 /* Given two tuples in source order, count the number of register reads of the
915  * successor, determined as the number of unique words accessed that aren't
916  * written by the predecessor (since those are tempable).
917  */
918 
919 static unsigned
920 bi_count_succ_reads(bi_index t0, bi_index t1, bi_index *succ_reads,
921                     unsigned nr_succ_reads)
922 {
923    unsigned reads = 0;
924 
925    for (unsigned i = 0; i < nr_succ_reads; ++i) {
926       bool unique = true;
927 
928       for (unsigned j = 0; j < i; ++j)
929          if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
930             unique = false;
931 
932       if (!unique)
933          continue;
934 
935       if (bi_is_word_equiv(succ_reads[i], t0))
936          continue;
937 
938       if (bi_is_word_equiv(succ_reads[i], t1))
939          continue;
940 
941       reads++;
942    }
943 
944    return reads;
945 }
946 
947 /* Not all instructions can read from the staging passthrough (as determined by
948  * reads_t); check if a given pair of instructions has such a restriction. Note
949  * we also use this mechanism to prevent data races around staging register
950  * reads, so we allow the input source to potentially be vector-valued */
951 
952 static bool
953 bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
954 {
955    bi_foreach_src(add, s) {
956       bi_index src = add->src[s];
957 
958       if (src.type != BI_INDEX_REGISTER)
959          continue;
960 
961       unsigned count = bi_count_read_registers(add, s);
962       bool read = false;
963 
964       for (unsigned d = 0; d < count; ++d)
965          read |= bi_is_equiv(fma, bi_register(src.value + d));
966 
967       if (read && !bi_reads_t(add, s))
968          return true;
969    }
970 
971    return false;
972 }
973 
974 /* Likewise for cross-tuple passthrough (reads_temps) */
975 
976 static bool
977 bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
978 {
979    if (ins->nr_dests == 0)
980       return false;
981 
982    bi_foreach_instr_in_tuple(succ, pins) {
983       bi_foreach_src(pins, s) {
984          if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
985              !bi_reads_temps(pins, s))
986             return true;
987       }
988    }
989 
990    return false;
991 }
992 
993 /* Is a register written other than via the staging mechanism? ATEST is special,
994  * writing to both a staging register and a regular register (fixed packing).
995  * BLEND is special since it has to write r48 the normal way even if it never
996  * gets read. This depends on liveness analysis, as a register is not needed
997  * for a write that will be discarded after one tuple. */
998 
999 static unsigned
1000 bi_write_count(bi_instr *instr, uint64_t live_after_temp)
1001 {
1002    if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
1003       return 1;
1004 
1005    unsigned count = 0;
1006 
1007    bi_foreach_dest(instr, d) {
1008       if (d == 0 && bi_opcode_props[instr->op].sr_write)
1009          continue;
1010 
1011       assert(instr->dest[0].type == BI_INDEX_REGISTER);
1012       if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
1013          count++;
1014    }
1015 
1016    return count;
1017 }
1018 
1019 /*
1020  * Test if an instruction requires flush-to-zero mode. Currently only supported
1021  * for f16<-->f32 conversions to implement fquantize16
1022  */
1023 static bool
1024 bi_needs_ftz(bi_instr *I)
1025 {
1026    return (I->op == BI_OPCODE_F16_TO_F32 ||
1027            I->op == BI_OPCODE_V2F32_TO_V2F16) &&
1028           I->ftz;
1029 }
1030 
1031 /*
1032  * Test if an instruction would be numerically incompatible with the clause. At
1033  * present we only consider flush-to-zero modes.
1034  */
1035 static bool
1036 bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
1037 {
1038    return (clause->ftz != BI_FTZ_STATE_NONE) &&
1039           ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
1040 }
1041 
1042 /* Instruction placement entails two questions: what subset of instructions in
1043  * the block can legally be scheduled? and of those which is the best? That is,
1044  * we seek to maximize a cost function on a subset of the worklist satisfying a
1045  * particular predicate. The necessary predicate is determined entirely by
1046  * Bifrost's architectural limitations and is described in the accompanying
1047  * whitepaper. The cost function is a heuristic. */
1048 
1049 static bool
1050 bi_instr_schedulable(bi_instr *instr, struct bi_clause_state *clause,
1051                      struct bi_tuple_state *tuple, uint64_t live_after_temp,
1052                      bool fma)
1053 {
1054    /* The units must match */
1055    if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
1056       return false;
1057 
1058    /* There can only be one message-passing instruction per clause */
1059    if (bi_must_message(instr) && clause->message)
1060       return false;
1061 
1062    /* Some instructions have placement requirements */
1063    if (bi_opcode_props[instr->op].last && !tuple->last)
1064       return false;
1065 
1066    if (bi_must_not_last(instr) && tuple->last)
1067       return false;
1068 
1069    /* Numerical properties must be compatible with the clause */
1070    if (bi_numerically_incompatible(clause, instr))
1071       return false;
1072 
1073    /* Message-passing instructions are not guaranteed to write within the
1074     * same clause (most likely they will not), so if a later instruction
1075     * in the clause accesses the destination, the message-passing
1076     * instruction can't be scheduled */
1077    if (bi_opcode_props[instr->op].sr_write) {
1078       bi_foreach_dest(instr, d) {
1079          unsigned nr = bi_count_write_registers(instr, d);
1080          assert(instr->dest[d].type == BI_INDEX_REGISTER);
1081          unsigned reg = instr->dest[d].value;
1082 
1083          for (unsigned i = 0; i < clause->access_count; ++i) {
1084             bi_index idx = clause->accesses[i];
1085             for (unsigned d = 0; d < nr; ++d) {
1086                if (bi_is_equiv(bi_register(reg + d), idx))
1087                   return false;
1088             }
1089          }
1090       }
1091    }
1092 
1093    if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
1094       unsigned nr = bi_count_read_registers(instr, 0);
1095       assert(instr->src[0].type == BI_INDEX_REGISTER);
1096       unsigned reg = instr->src[0].value;
1097 
1098       for (unsigned i = 0; i < clause->access_count; ++i) {
1099          bi_index idx = clause->accesses[i];
1100          for (unsigned d = 0; d < nr; ++d) {
1101             if (bi_is_equiv(bi_register(reg + d), idx))
1102                return false;
1103          }
1104       }
1105    }
1106 
1107    /* If FAU is already assigned, we may not disrupt that. Do a
1108     * non-disruptive test update */
1109    if (!bi_update_fau(clause, tuple, instr, fma, false))
1110       return false;
1111 
1112    /* If this choice of FMA would force a staging passthrough, the ADD
1113     * instruction must support such a passthrough */
1114    if (tuple->add && instr->nr_dests &&
1115        bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1116       return false;
1117 
1118    /* If this choice of destination would force a cross-tuple passthrough, the
1119     * next tuple must support that */
1120    if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1121       return false;
1122 
1123    /* Register file writes are limited */
1124    unsigned total_writes = tuple->reg.nr_writes;
1125    total_writes += bi_write_count(instr, live_after_temp);
1126 
1127    /* Last tuple in a clause can only write a single value */
1128    if (tuple->last && total_writes > 1)
1129       return false;
1130 
1131    /* Register file reads are limited, so count unique */
1132 
1133    unsigned unique_new_srcs = 0;
1134 
1135    bi_foreach_src(instr, s) {
1136       if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1137          unique_new_srcs++;
1138    }
1139 
1140    unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1141 
1142    bool can_spill_to_moves = (!tuple->add);
1143    can_spill_to_moves &=
1144       (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1145    can_spill_to_moves &= (clause->tuple_count < 7);
1146 
1147    /* However, we can get an extra 1 or 2 sources by inserting moves */
1148    if (total_srcs > (can_spill_to_moves ? 4 : 3))
1149       return false;
1150 
1151    /* Count effective reads for the successor */
1152    unsigned succ_reads = 0;
1153 
1154    if (instr->nr_dests) {
1155       bool has_t1 = tuple->add && tuple->add->nr_dests;
1156       succ_reads = bi_count_succ_reads(instr->dest[0],
1157                                        has_t1 ? tuple->add->dest[0] : bi_null(),
1158                                        tuple->prev_reads, tuple->nr_prev_reads);
1159    }
1160 
1161    /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1162    if ((signed)total_writes > (4 - (signed)succ_reads))
1163       return false;
1164 
1165    return true;
1166 }
1167 
1168 static signed
1169 bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1170 {
1171    signed cost = 0;
1172 
1173    /* Instructions that can schedule to either FMA or to ADD should be
1174     * deprioritized since they're easier to reschedule elsewhere */
1175    if (bi_can_fma(instr) && bi_can_add(instr))
1176       cost++;
1177 
1178    /* Message-passing instructions impose constraints on the registers
1179     * later in the clause, so schedule them as late within a clause as
1180     * possible (<==> prioritize them since we're backwards <==> decrease
1181     * cost) */
1182    if (bi_must_message(instr))
1183       cost--;
1184 
1185    /* Last instructions are big constraints (XXX: no effect on shader-db) */
1186    if (bi_opcode_props[instr->op].last)
1187       cost -= 2;
1188 
1189    return cost;
1190 }
1191 
1192 static unsigned
1193 bi_choose_index(struct bi_worklist st, struct bi_clause_state *clause,
1194                 struct bi_tuple_state *tuple, uint64_t live_after_temp,
1195                 bool fma)
1196 {
1197    unsigned i, best_idx = ~0;
1198    signed best_cost = INT_MAX;
1199 
1200    BITSET_FOREACH_SET(i, st.worklist, st.count) {
1201       bi_instr *instr = st.instructions[i];
1202 
1203       if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1204          continue;
1205 
1206       signed cost = bi_instr_cost(instr, tuple);
1207 
1208       /* Tie break in favour of later instructions, under the
1209        * assumption this promotes temporary usage (reducing pressure
1210        * on the register file). This is a side effect of a prepass
1211        * scheduling for pressure. */
1212 
1213       if (cost <= best_cost) {
1214          best_idx = i;
1215          best_cost = cost;
1216       }
1217    }
1218 
1219    return best_idx;
1220 }
1221 
1222 static void
1223 bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1224              bi_instr *instr, uint64_t live_after_temp, bool fma)
1225 {
1226    bi_update_fau(clause, tuple, instr, fma, true);
1227 
1228    assert(clause->access_count + instr->nr_srcs + instr->nr_dests <=
1229           ARRAY_SIZE(clause->accesses));
1230 
1231    memcpy(clause->accesses + clause->access_count, instr->src,
1232           sizeof(instr->src[0]) * instr->nr_srcs);
1233    clause->access_count += instr->nr_srcs;
1234 
1235    memcpy(clause->accesses + clause->access_count, instr->dest,
1236           sizeof(instr->dest[0]) * instr->nr_dests);
1237    clause->access_count += instr->nr_dests;
1238 
1239    tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1240 
1241    bi_foreach_src(instr, s) {
1242       if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1243          tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1244    }
1245 
1246    /* This could be optimized to allow pairing integer instructions with
1247     * special flush-to-zero instructions, but punting on this until we have
1248     * a workload that cares.
1249     */
1250    clause->ftz =
1251       bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE : BI_FTZ_STATE_DISABLE;
1252 }
1253 
1254 /* Choose the best instruction and pop it off the worklist. Returns NULL if no
1255  * instruction is available. This function is destructive. */
1256 
1257 static bi_instr *
1258 bi_take_instr(bi_context *ctx, struct bi_worklist st,
1259               struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1260               uint64_t live_after_temp, bool fma)
1261 {
1262    if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1263       return bi_lower_cubeface(ctx, clause, tuple);
1264    else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32)
1265       return bi_lower_atom_c(ctx, clause, tuple);
1266    else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32)
1267       return bi_lower_atom_c1(ctx, clause, tuple);
1268    else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1269       return bi_lower_seg_add(ctx, clause, tuple);
1270    else if (tuple->add && tuple->add->table)
1271       return bi_lower_dtsel(ctx, clause, tuple);
1272 
1273    /* TODO: Optimize these moves */
1274    if (!fma && tuple->nr_prev_reads > 3) {
1275       /* Only spill by one source for now */
1276       assert(tuple->nr_prev_reads == 4);
1277 
1278       /* Pick a source to spill */
1279       bi_index src = tuple->prev_reads[0];
1280 
1281       /* Schedule the spill */
1282       bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1283       bi_instr *mov = bi_mov_i32_to(&b, src, src);
1284       bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1285       return mov;
1286    }
1287 
1288 #ifndef NDEBUG
1289    /* Don't pair instructions if debugging */
1290    if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1291       return NULL;
1292 #endif
1293 
1294    unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1295 
1296    if (idx >= st.count)
1297       return NULL;
1298 
1299    /* Update state to reflect taking the instruction */
1300    bi_instr *instr = st.instructions[idx];
1301 
1302    BITSET_CLEAR(st.worklist, idx);
1303    bi_update_worklist(st, idx);
1304    bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1305 
1306    /* Fixups */
1307    bi_builder b = bi_init_builder(ctx, bi_before_instr(instr));
1308 
1309    if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1310       assert(bi_can_iaddc(instr));
1311       bi_instr *iaddc = bi_iaddc_i32_to(&b, instr->dest[0], instr->src[0],
1312                                         instr->src[1], bi_zero());
1313 
1314       bi_remove_instruction(instr);
1315       instr = iaddc;
1316    } else if (fma && bi_can_replace_with_csel(instr)) {
1317       bi_instr *csel = bi_csel_from_mux(&b, instr, false);
1318 
1319       bi_remove_instruction(instr);
1320       instr = csel;
1321    }
1322 
1323    return instr;
1324 }
1325 
1326 /* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1327  * to a passthrough register. If except_sr is true, the staging sources are
1328  * skipped, so staging register reads are not accidentally encoded as
1329  * passthrough (which is impossible) */
1330 
1331 static void
1332 bi_use_passthrough(bi_instr *ins, bi_index old, enum bifrost_packed_src new,
1333                    bool except_sr)
1334 {
1335    /* Optional for convenience */
1336    if (!ins)
1337       return;
1338 
1339    assert(!bi_is_null(old));
1340 
1341    bi_foreach_src(ins, i) {
1342       if ((i == 0 || i == 4) && except_sr)
1343          continue;
1344 
1345       if (bi_is_word_equiv(ins->src[i], old)) {
1346          ins->src[i].type = BI_INDEX_PASS;
1347          ins->src[i].value = new;
1348          ins->src[i].offset = 0;
1349       }
1350    }
1351 }
1352 
1353 /* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1354  * intertuple passthroughs where necessary. Passthroughs are allowed as a
1355  * post-condition of scheduling. Note we rewrite ADD first, FMA second --
1356  * opposite the order of execution. This is deliberate -- if both FMA and ADD
1357  * write to the same logical register, the next executed tuple will get the
1358  * latter result. There's no interference issue under the assumption of correct
1359  * register allocation. */
1360 
1361 static void
1362 bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1363 {
1364    bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1365 
1366    if (prec.add && prec.add->nr_dests) {
1367       bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD,
1368                          false);
1369       bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD,
1370                          sr_read);
1371    }
1372 
1373    if (prec.fma && prec.fma->nr_dests) {
1374       bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA,
1375                          false);
1376       bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA,
1377                          sr_read);
1378    }
1379 }
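/* For instance (illustrative): if the preceding tuple's FMA writes r4 and the
 * succeeding tuple's ADD reads r4 as a non-staging source, that read is
 * rewritten to the BIFROST_SRC_PASS_FMA passthrough instead of going through
 * the register file; staging sources are skipped when sr_read is set. */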
1380 
1381 static void
1382 bi_rewrite_fau_to_pass(bi_tuple *tuple)
1383 {
1384    bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1385       if (ins->src[s].type != BI_INDEX_FAU)
1386          continue;
1387 
1388       bi_index pass = bi_passthrough(ins->src[s].offset ? BIFROST_SRC_FAU_HI
1389                                                         : BIFROST_SRC_FAU_LO);
1390 
1391       bi_replace_src(ins, s, pass);
1392    }
1393 }
1394 
1395 static void
1396 bi_rewrite_zero(bi_instr *ins, bool fma)
1397 {
1398    bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1399 
1400    bi_foreach_src(ins, s) {
1401       bi_index src = ins->src[s];
1402 
1403       if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1404          bi_replace_src(ins, s, zero);
1405    }
1406 }
1407 
1408 /* Assumes #0 to {T, FAU} rewrite has already occurred */
1409 
1410 static void
1411 bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1412 {
1413    bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1414       if (ins->src[s].type != BI_INDEX_CONSTANT)
1415          continue;
1416 
1417       uint32_t cons = ins->src[s].value;
1418 
1419       ASSERTED bool lo = (cons == (constant & 0xffffffff));
1420       bool hi = (cons == (constant >> 32ull));
1421 
1422       /* PC offsets always live in the upper half, set to zero by
1423        * convention before pack time. (This is safe, since if you
1424        * wanted to compare against zero, you would use a BRANCHZ
1425        * instruction instead.) */
1426       if (cons == 0 && ins->branch_target != NULL) {
1427          assert(pcrel);
1428          hi = true;
1429          lo = false;
1430       } else if (pcrel) {
1431          hi = false;
1432       }
1433 
1434       assert(lo || hi);
1435 
1436       bi_replace_src(
1437          ins, s, bi_passthrough(hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO));
1438    }
1439 }
1440 
1441 /* Constructs a constant state given a tuple state. This has the
1442  * postcondition that pcrel applies to the first constant by convention,
1443  * and PC-relative constants will be #0 by convention here, so swap to
1444  * match if needed */
1445 
1446 static struct bi_const_state
1447 bi_get_const_state(struct bi_tuple_state *tuple)
1448 {
1449    struct bi_const_state consts = {
1450       .constant_count = tuple->constant_count,
1451       .constants[0] = tuple->constants[0],
1452       .constants[1] = tuple->constants[1],
1453       .pcrel = tuple->add && tuple->add->branch_target,
1454    };
1455 
1456    /* pcrel applies to the first constant by convention, and
1457     * PC-relative constants will be #0 by convention here, so swap
1458     * to match if needed */
1459    if (consts.pcrel && consts.constants[0]) {
1460       assert(consts.constant_count == 2);
1461       assert(consts.constants[1] == 0);
1462 
1463       consts.constants[1] = consts.constants[0];
1464       consts.constants[0] = 0;
1465    }
1466 
1467    return consts;
1468 }
1469 
1470 /* Merges constants in a clause, satisfying the following rules, assuming no
1471  * more than one tuple has pcrel:
1472  *
1473  * 1. If a tuple has two constants, they must be packed together. If one is
1474  * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
1475  * (PC << 32)]. Otherwise choose an arbitrary order.
1476  *
1477  * 2. If a tuple has one constant, it may be shared with an existing
1478  * pair that already contains that constant, or it may be combined with another
1479  * (distinct) tuple of a single constant.
1480  *
1481  * This guarantees a packing is possible. The next routine handles modification
1482  * related swapping, to satisfy format 12 and the lack of modification for
1483  * tuple count 5/8 in EC0.
1484  */
1485 
1486 static uint64_t
1487 bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1488 {
1489    /* At this point in the constant merge algorithm, pcrel constants are
1490     * treated as zero, so pcrel implies at least one constant is zero */
1491    assert(!pcrel || (c0 == 0 || c1 == 0));
1492 
1493    /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1494    uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1495    uint32_t lo = (c0 == hi) ? c1 : c0;
1496 
1497    /* Merge in the selected order */
1498    return lo | (((uint64_t)hi) << 32ull);
1499 }
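/* Worked examples (illustrative):
 *
 *    bi_merge_u32(0x10, 0x20, false) == 0x0000002000000010
 *       (the larger non-pcrel constant lands in the high half)
 *
 *    bi_merge_u32(0x00, 0x30, true)  == 0x0000000000000030
 *       (the pcrel placeholder 0 is forced into the high half, leaving the
 *        real constant in the low half)
 */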
1500 
1501 static unsigned
1502 bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1503                uint64_t *merged, unsigned *pcrel_pair)
1504 {
1505    unsigned merge_count = 0;
1506 
1507    for (unsigned t = 0; t < tuple_count; ++t) {
1508       if (consts[t].constant_count != 2)
1509          continue;
1510 
1511       unsigned idx = ~0;
1512       uint64_t val = bi_merge_u32(consts[t].constants[0],
1513                                   consts[t].constants[1], consts[t].pcrel);
1514 
1515       /* Skip the pcrel pair if assigned, because if one is assigned,
1516        * this one is not pcrel by uniqueness so it's a mismatch */
1517       for (unsigned s = 0; s < merge_count; ++s) {
1518          if (merged[s] == val && (*pcrel_pair) != s) {
1519             idx = s;
1520             break;
1521          }
1522       }
1523 
1524       if (idx == ~0) {
1525          idx = merge_count++;
1526          merged[idx] = val;
1527 
1528          if (consts[t].pcrel)
1529             (*pcrel_pair) = idx;
1530       }
1531 
1532       consts[t].word_idx = idx;
1533    }
1534 
1535    return merge_count;
1536 }
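
/* For instance, with illustrative values: if tuple 0 and tuple 3 both carry
 * the constants {0x10, 0x20}, they merge to the same word, so
 * merged[0] == 0x0000002000000010 and consts[0].word_idx == consts[3].word_idx
 * == 0, while a pair flagged in *pcrel_pair is skipped during matching, so no
 * other tuple ever shares the PC-relative placeholder. */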
1537 
1538 static unsigned
1539 bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1540                  uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1541 {
1542    bool pending = false, pending_pcrel = false;
1543    uint32_t pending_single = 0;
1544 
1545    for (unsigned t = 0; t < tuple_count; ++t) {
1546       if (consts[t].constant_count != 1)
1547          continue;
1548 
1549       uint32_t val = consts[t].constants[0];
1550       unsigned idx = ~0;
1551 
1552       /* Try to match, but don't match pcrel with non-pcrel, even
1553        * though we can merge a pcrel with a non-pcrel single */
1554       for (unsigned i = 0; i < pair_count; ++i) {
1555          bool lo = ((pairs[i] & 0xffffffff) == val);
1556          bool hi = ((pairs[i] >> 32) == val);
1557          bool match = (lo || hi);
1558          match &= ((*pcrel_pair) != i);
1559          if (match && !consts[t].pcrel) {
1560             idx = i;
1561             break;
1562          }
1563       }
1564 
1565       if (idx == ~0) {
1566          idx = pair_count;
1567 
1568          if (pending && pending_single != val) {
1569             assert(!(pending_pcrel && consts[t].pcrel));
1570             bool pcrel = pending_pcrel || consts[t].pcrel;
1571 
1572             if (pcrel)
1573                *pcrel_pair = idx;
1574 
1575             pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1576 
1577             pending = pending_pcrel = false;
1578          } else {
1579             pending = true;
1580             pending_pcrel = consts[t].pcrel;
1581             pending_single = val;
1582          }
1583       }
1584 
1585       consts[t].word_idx = idx;
1586    }
1587 
1588    /* Shift so it works whether pending_pcrel is set or not */
1589    if (pending) {
1590       if (pending_pcrel)
1591          *pcrel_pair = pair_count;
1592 
1593       pairs[pair_count++] = ((uint64_t)pending_single) << 32ull;
1594    }
1595 
1596    return pair_count;
1597 }
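
/* Sketch with illustrative values: given pairs[] = { 0x0000002000000010 } from
 * the pair pass, a tuple reading only 0x20 matches the high half and reuses
 * word 0; two leftover singles 0x30 and 0x40 are combined into the new pair
 * 0x0000004000000030; a final unpaired single 0x50 is flushed at the end as
 * 0x50 << 32. */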
1598 
1599 static unsigned
1600 bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs,
1601                    unsigned *pcrel_idx)
1602 {
1603    unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1604    return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1605 }
1606 
1607 /* Swap two constants at word i and i+1 by swapping their actual positions and
1608  * swapping all references so the meaning of the clause is preserved */
1609 
1610 static void
1611 bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1612 {
1613    uint64_t tmp_pair = pairs[i + 0];
1614    pairs[i + 0] = pairs[i + 1];
1615    pairs[i + 1] = tmp_pair;
1616 
1617    for (unsigned t = 0; t < 8; ++t) {
1618       if (consts[t].word_idx == i)
1619          consts[t].word_idx = (i + 1);
1620       else if (consts[t].word_idx == (i + 1))
1621          consts[t].word_idx = i;
1622    }
1623 }
1624 
1625 /* Given merged constants, one of which might be PC-relative, fix up the M
1626  * values so the PC-relative constant (if it exists) has the M1=4 modification
1627  * and other constants are used as-is (which might require swapping) */
1628 
1629 static unsigned
1630 bi_apply_constant_modifiers(struct bi_const_state *consts, uint64_t *pairs,
1631                             unsigned *pcrel_idx, unsigned tuple_count,
1632                             unsigned constant_count)
1633 {
1634    unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1635 
1636    /* Clauses with these tuple counts lack an M field for the packed EC0,
1637     * so EC0 cannot be PC-relative, which might require swapping (and
1638     * possibly adding an unused constant) to fit */
1639 
1640    if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1641       constant_count = MAX2(constant_count, 2);
1642       *pcrel_idx = 1;
1643       bi_swap_constants(consts, pairs, 0);
1644    }
1645 
1646    /* EC0 might be packed free, after that constants are packed in pairs
1647     * (with clause format 12), with M1 values computed from the pair */
1648 
1649    for (unsigned i = start; i < constant_count; i += 2) {
1650       bool swap = false;
1651       bool last = (i + 1) == constant_count;
1652 
1653       unsigned A1 = (pairs[i] >> 60);
1654       unsigned B1 = (pairs[i + 1] >> 60);
1655 
1656       if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1657          /* PC-relative constant must be E0, not E1 */
1658          swap = (*pcrel_idx == (i + 1));
1659 
1660          /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1661           * equivalent to A = (B + 4) mod 16 and that we can
1662           * control A */
1663          unsigned B = swap ? A1 : B1;
1664          unsigned A = (B + 4) & 0xF;
1665          pairs[*pcrel_idx] |= ((uint64_t)A) << 60;
1666 
1667          /* Swapped if swap set, identity if swap not set */
1668          *pcrel_idx = i;
1669       } else {
1670          /* Compute M1 value if we don't swap */
1671          unsigned M1 = (16 + A1 - B1) & 0xF;
1672 
1673          /* For M1 = 0 or M1 >= 8, the constants can be used
1674           * unchanged. Otherwise 0 < (A1 - B1) % 16 < 8, which
1675           * implies (B1 - A1) % 16 >= 8, so swapping lets them
1676           * be used unchanged */
1677          swap = (M1 != 0) && (M1 < 8);
1678 
1679          /* However, we can't swap the last constant, so we
1680           * force M1 = 0 instead for this case */
1681          if (last && swap) {
1682             pairs[i + 1] |= pairs[i] & (0xfull << 60);
1683             swap = false;
1684          }
1685       }
1686 
1687       if (swap) {
1688          assert(!last);
1689          bi_swap_constants(consts, pairs, i);
1690       }
1691    }
1692 
1693    return constant_count;
1694 }
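
/* Worked arithmetic for the modifier logic above, with illustrative nibbles:
 * A1 = 0x3, B1 = 0xA gives M1 = (16 + 0x3 - 0xA) & 0xF = 9 >= 8, so the pair
 * is used unchanged; A1 = 0xA, B1 = 0x3 gives M1 = 7, so the two words are
 * swapped to become usable unchanged. For the PC-relative word, A is set to
 * (B + 4) & 0xF so that (A - B) mod 16 = 4, i.e. M1 = 4. */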
1695 
1696 /* Schedule a single clause. If no instructions remain, return NULL. */
1697 
1698 static bi_clause *
1699 bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st,
1700                    uint64_t *live)
1701 {
1702    struct bi_clause_state clause_state = {0};
1703    bi_clause *clause = rzalloc(ctx, bi_clause);
1704    bi_tuple *tuple = NULL;
1705 
1706    const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1707 
1708    /* TODO: Decide flow control better */
1709    clause->flow_control = BIFROST_FLOW_NBTB;
1710 
1711    /* The last clause can only write one instruction, so initialize that */
1712    struct bi_reg_state reg_state = {};
1713    bi_index prev_reads[5] = {bi_null()};
1714    unsigned nr_prev_reads = 0;
1715 
1716    /* We need to track future liveness. The main *live set tracks what is
1717     * live at the current point in the program we are scheduling, but to
1718     * determine temp eligibility, we instead want what will be live after
1719     * the next tuple in the program. If you scheduled forwards, you'd need
1720     * a crystal ball for this. Luckily we schedule backwards, so we just
1721     * delay updates to the live_after_temp by an extra tuple. */
1722    uint64_t live_after_temp = *live;
1723    uint64_t live_next_tuple = live_after_temp;
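
   /* Concretely, as an illustrative trace: with tuples T1 T2 T3 in program
    * order (scheduled backwards as T3, then T2, then T1), candidates for T2
    * are tested against live_after_temp = liveness after T3, while *live at
    * that point already holds the liveness just after T2; live_next_tuple
    * carries the in-between value for one iteration to implement that
    * one-tuple delay. */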
1724 
1725    do {
1726       struct bi_tuple_state tuple_state = {
1727          .last = (clause->tuple_count == 0),
1728          .reg = reg_state,
1729          .nr_prev_reads = nr_prev_reads,
1730          .prev = tuple,
1731          .pcrel_idx = ~0,
1732       };
1733 
1734       assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1735       memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1736 
1737       unsigned idx = max_tuples - clause->tuple_count - 1;
1738 
1739       tuple = &clause->tuples[idx];
1740 
1741       if (clause->message && bi_opcode_props[clause->message->op].sr_read &&
1742           !bi_is_null(clause->message->src[0])) {
1743          unsigned nr = bi_count_read_registers(clause->message, 0);
1744          live_after_temp |=
1745             (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1746       }
1747 
1748       /* Since we schedule backwards, we schedule ADD first */
1749       tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state,
1750                                       live_after_temp, false);
1751       tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state,
1752                                  live_after_temp, true);
1753       tuple->add = tuple_state.add;
1754 
1755       /* Update liveness from the new instructions */
1756       if (tuple->add)
1757          *live = bi_postra_liveness_ins(*live, tuple->add);
1758 
1759       if (tuple->fma)
1760          *live = bi_postra_liveness_ins(*live, tuple->fma);
1761 
1762       /* Rotate in the new per-tuple liveness */
1763       live_after_temp = live_next_tuple;
1764       live_next_tuple = *live;
1765 
1766       /* We may have a message, but only one per clause */
1767       if (tuple->add && bi_must_message(tuple->add)) {
1768          assert(!clause_state.message);
1769          clause_state.message = true;
1770 
1771          clause->message_type = bi_message_type_for_instr(tuple->add);
1772          clause->message = tuple->add;
1773 
1774          /* We don't need to set dependencies for blend shaders
1775           * because the BLEND instruction in the fragment
1776           * shader should have already done the wait */
1777          if (!ctx->inputs->is_blend) {
1778             switch (tuple->add->op) {
1779             case BI_OPCODE_ATEST:
1780                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1781                break;
1782             case BI_OPCODE_LD_TILE:
1783             case BI_OPCODE_ST_TILE:
1784                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1785                break;
1786             case BI_OPCODE_BLEND:
1787                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1788                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1789                break;
1790             default:
1791                break;
1792             }
1793          }
1794       }
1795 
1796       clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1797 
1798       /* Before merging constants, eliminate zeroes, otherwise the
1799        * merging will fight over the #0 that never gets read (and is
1800        * never marked as read by update_fau) */
1801       if (tuple->fma && bi_reads_zero(tuple->fma))
1802          bi_rewrite_zero(tuple->fma, true);
1803 
1804       /* Rewrite away FAU, constant write is deferred */
1805       if (!tuple_state.constant_count) {
1806          tuple->fau_idx = tuple_state.fau;
1807          bi_rewrite_fau_to_pass(tuple);
1808       }
1809 
1810       /* Use passthrough register for cross-stage accesses. Since
1811        * there are just FMA and ADD stages, that means we rewrite to
1812        * passthrough the sources of the ADD that read from the
1813        * destination of the FMA */
1814 
1815       if (tuple->fma && tuple->fma->nr_dests) {
1816          bi_use_passthrough(tuple->add, tuple->fma->dest[0], BIFROST_SRC_STAGE,
1817                             false);
1818       }
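
      /* E.g., as an illustration: if the FMA just scheduled writes r4 and the
       * ADD in this tuple reads r4, the ADD's matching sources are rewritten
       * from r4 to the BIFROST_SRC_STAGE passthrough, so the value flows from
       * FMA to ADD inside the tuple without a register-file round trip. */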
1819 
1820       /* Don't add an empty tuple, unless the worklist has nothing
1821        * but a (pseudo)instruction failing to schedule due to a "not
1822        * last instruction" constraint */
1823 
1824       int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1825       bool not_last = (some_instruction > 0) &&
1826                       bi_must_not_last(st.instructions[some_instruction - 1]);
1827 
1828       bool insert_empty = tuple_state.last && not_last;
1829 
1830       if (!(tuple->fma || tuple->add || insert_empty))
1831          break;
1832 
1833       clause->tuple_count++;
1834 
1835       /* Adding enough tuples might overflow constants */
1836       if (!bi_space_for_more_constants(&clause_state))
1837          break;
1838 
1839 #ifndef NDEBUG
1840       /* Don't schedule more than 1 tuple if debugging */
1841       if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1842          break;
1843 #endif
1844 
1845       /* Link through the register state */
1846       STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1847       memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1848       nr_prev_reads = tuple_state.reg.nr_reads;
1849       clause_state.tuple_count++;
1850    } while (clause->tuple_count < 8);
1851 
1852    /* Don't schedule an empty clause */
1853    if (!clause->tuple_count)
1854       return NULL;
1855 
1856    /* Before merging, rewrite away any tuples that read only zero */
1857    for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1858       bi_tuple *tuple = &clause->tuples[i];
1859       struct bi_const_state *st = &clause_state.consts[i];
1860 
1861       if (st->constant_count == 0 || st->constants[0] || st->constants[1] ||
1862           st->pcrel)
1863          continue;
1864 
1865       bi_foreach_instr_in_tuple(tuple, ins)
1866          bi_rewrite_zero(ins, false);
1867 
1868       /* Constant has been demoted to FAU, so don't pack it separately */
1869       st->constant_count = 0;
1870 
1871       /* Default */
1872       assert(tuple->fau_idx == BIR_FAU_ZERO);
1873    }
1874 
1875    uint64_t constant_pairs[8] = {0};
1876    unsigned pcrel_idx = ~0;
1877    unsigned constant_words =
1878       bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1879 
1880    constant_words = bi_apply_constant_modifiers(
1881       clause_state.consts, constant_pairs, &pcrel_idx, clause->tuple_count,
1882       constant_words);
1883 
1884    clause->pcrel_idx = pcrel_idx;
1885 
1886    for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1887       bi_tuple *tuple = &clause->tuples[i];
1888 
1889       /* If no constants, leave FAU as it is, possibly defaulting to 0 */
1890       if (clause_state.consts[i].constant_count == 0)
1891          continue;
1892 
1893       /* FAU is already handled */
1894       assert(!tuple->fau_idx);
1895 
1896       unsigned word_idx = clause_state.consts[i].word_idx;
1897       assert(word_idx <= 8);
1898 
1899       /* We could try to merge regardless of bottom bits as well, but
1900        * that's probably diminishing returns */
1901       uint64_t pair = constant_pairs[word_idx];
1902       unsigned lo = pair & 0xF;
1903 
1904       tuple->fau_idx = bi_constant_field(word_idx) | lo;
1905       bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
1906    }
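
   /* As a concrete illustration: a tuple whose constant landed in word_idx 1
    * of a pair equal to 0x000000200000001c gets
    * fau_idx = bi_constant_field(1) | 0xc, and bi_rewrite_constants_to_pass
    * then points its constant sources at that 64-bit pair. */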
1907 
1908    clause->constant_count = constant_words;
1909    memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
1910 
1911    /* Branches must be last, so this can be factored out */
1912    bi_instr *last = clause->tuples[max_tuples - 1].add;
1913    clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
1914    clause->block = block;
1915 
1916    clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);
1917 
1918    /* We emitted in reverse, into the back of the tuples array, so
1919     * move it up front for easy indexing */
1920    memmove(clause->tuples, clause->tuples + (max_tuples - clause->tuple_count),
1921            clause->tuple_count * sizeof(clause->tuples[0]));
1922 
1923    /* Use passthrough register for cross-tuple accesses. Note this is
1924     * after the memmove, so this is forwards. Skip the first tuple since
1925     * there is nothing before it to passthrough */
1926 
1927    for (unsigned t = 1; t < clause->tuple_count; ++t)
1928       bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
1929 
1930    return clause;
1931 }
1932 
1933 static void
1934 bi_schedule_block(bi_context *ctx, bi_block *block)
1935 {
1936    list_inithead(&block->clauses);
1937 
1938    /* Copy list to dynamic array */
1939    struct bi_worklist st = bi_initialize_worklist(
1940       block, bifrost_debug & BIFROST_DBG_INORDER, ctx->inputs->is_blend);
1941 
1942    if (!st.count) {
1943       bi_free_worklist(st);
1944       return;
1945    }
1946 
1947    /* We need to track liveness during scheduling in order to determine whether
1948     * we can use temporary (passthrough) registers */
1949    uint64_t live = block->reg_live_out;
1950 
1951    /* Schedule as many clauses as needed to fill the block */
1952    bi_clause *u = NULL;
1953    while ((u = bi_schedule_clause(ctx, block, st, &live)))
1954       list_add(&u->link, &block->clauses);
1955 
1956    /* Back-to-back bit affects only the last clause of a block,
1957     * the rest are implicitly true */
1958    if (!list_is_empty(&block->clauses)) {
1959       bi_clause *last_clause =
1960          list_last_entry(&block->clauses, bi_clause, link);
1961       if (bi_reconverge_branches(block))
1962          last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
1963    }
1964 
1965    /* Reorder instructions to match the new schedule. First remove
1966     * existing instructions and then recreate the list */
1967 
1968    bi_foreach_instr_in_block_safe(block, ins) {
1969       list_del(&ins->link);
1970    }
1971 
1972    bi_foreach_clause_in_block(block, clause) {
1973       for (unsigned i = 0; i < clause->tuple_count; ++i) {
1974          bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
1975             list_addtail(&ins->link, &block->instructions);
1976          }
1977       }
1978    }
1979 
1980    block->scheduled = true;
1981 
1982 #ifndef NDEBUG
1983    unsigned i;
1984    bool incomplete = false;
1985 
1986    BITSET_FOREACH_SET(i, st.worklist, st.count) {
1987       bi_print_instr(st.instructions[i], stderr);
1988       incomplete = true;
1989    }
1990 
1991    if (incomplete)
1992       unreachable("The above instructions failed to schedule.");
1993 #endif
1994 
1995    bi_free_worklist(st);
1996 }
1997 
1998 static bool
1999 bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants,
2000                  unsigned *cwords, bi_index *fau)
2001 {
2002    assert(s < ins->nr_srcs);
2003    bi_index src = ins->src[s];
2004 
2005    /* Staging registers can't have FAU accesses */
2006    if (bi_is_staging_src(ins, s))
2007       return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
2008 
2009    if (src.type == BI_INDEX_CONSTANT) {
2010       /* Allow fast zero */
2011       if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
2012          return true;
2013 
2014       if (!bi_is_null(*fau))
2015          return false;
2016 
2017       /* Else, try to inline a constant */
2018       for (unsigned i = 0; i < *cwords; ++i) {
2019          if (src.value == constants[i])
2020             return true;
2021       }
2022 
2023       if (*cwords >= 2)
2024          return false;
2025 
2026       constants[(*cwords)++] = src.value;
2027    } else if (src.type == BI_INDEX_FAU) {
2028       if (*cwords != 0)
2029          return false;
2030 
2031       /* Can only read from one pair of FAU words */
2032       if (!bi_is_null(*fau) && (src.value != fau->value))
2033          return false;
2034 
2035       /* If there is a target, we'll need a PC-relative constant */
2036       if (ins->branch_target)
2037          return false;
2038 
2039       *fau = src;
2040    }
2041 
2042    return true;
2043 }
2044 
2045 void
2046 bi_lower_fau(bi_context *ctx)
2047 {
2048    bi_foreach_instr_global_safe(ctx, ins) {
2049       bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
2050 
2051       uint32_t constants[2];
2052       unsigned cwords = 0;
2053       bi_index fau = bi_null();
2054 
2055       /* ATEST must have the ATEST datum encoded, not any other
2056        * uniform. See to it this is the case. */
2057       if (ins->op == BI_OPCODE_ATEST)
2058          fau = ins->src[2];
2059 
2060       /* Dual texturing requires the texture operation descriptor
2061        * encoded as an immediate so we can fix up.
2062        */
2063       if (ins->op == BI_OPCODE_TEXC_DUAL) {
2064          assert(ins->src[3].type == BI_INDEX_CONSTANT);
2065          constants[cwords++] = ins->src[3].value;
2066       }
2067 
2068       /* Phis get split up into moves so are unrestricted */
2069       if (ins->op == BI_OPCODE_PHI)
2070          continue;
2071 
2072       bi_foreach_src(ins, s) {
2073          if (bi_check_fau_src(ins, s, constants, &cwords, &fau))
2074             continue;
2075 
2076          bi_index copy = bi_mov_i32(&b, ins->src[s]);
2077          bi_replace_src(ins, s, copy);
2078       }
2079    }
2080 }
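
/* For example, with an illustrative instruction: three distinct non-zero
 * constant sources #1, #2, #3 exceed the two constant words a tuple can
 * provide, so bi_check_fau_src rejects the third source and the loop above
 * emits a MOV.i32 of #3 into a fresh temporary, rewriting that source to read
 * the copy instead. */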
2081 
2082 /* Only v7 allows specifying a dependency on the tilebuffer for the first
2083  * clause of a shader. v6 requires adding a NOP clause with the dependency. */
2084 
2085 static void
2086 bi_add_nop_for_atest(bi_context *ctx)
2087 {
2088    /* Only needed on v6 */
2089    if (ctx->arch >= 7)
2090       return;
2091 
2092    if (list_is_empty(&ctx->blocks))
2093       return;
2094 
2095    /* Fetch the first clause of the shader */
2096    bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
2097    bi_clause *clause = bi_next_clause(ctx, block, NULL);
2098 
2099    if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
2100                                             (1 << BIFROST_SLOT_ELDEST_COLOUR))))
2101       return;
2102 
2103    /* Add a NOP so we can wait for the dependencies required by the first
2104     * clause */
2105 
2106    bi_instr *I = rzalloc(ctx, bi_instr);
2107    I->op = BI_OPCODE_NOP;
2108 
2109    bi_clause *new_clause = ralloc(ctx, bi_clause);
2110    *new_clause = (bi_clause){
2111       .flow_control = BIFROST_FLOW_NBTB,
2112       .next_clause_prefetch = true,
2113       .block = clause->block,
2114 
2115       .tuple_count = 1,
2116       .tuples[0] =
2117          {
2118             .fma = I,
2119          },
2120    };
2121 
2122    list_add(&new_clause->link, &clause->block->clauses);
2123 }
2124 
2125 void
2126 bi_schedule(bi_context *ctx)
2127 {
2128    /* Fed into both scheduling and DCE */
2129    bi_postra_liveness(ctx);
2130 
2131    bi_foreach_block(ctx, block) {
2132       bi_schedule_block(ctx, block);
2133    }
2134 
2135    bi_opt_dce_post_ra(ctx);
2136    bi_add_nop_for_atest(ctx);
2137 }
2138