xref: /aosp_15_r20/external/mesa3d/src/asahi/compiler/agx_register_allocate.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/bitset.h"
7 #include "util/macros.h"
8 #include "util/u_dynarray.h"
9 #include "util/u_memory.h"
10 #include "util/u_qsort.h"
11 #include "agx_builder.h"
12 #include "agx_compile.h"
13 #include "agx_compiler.h"
14 #include "agx_debug.h"
15 #include "agx_opcodes.h"
16 #include "shader_enums.h"
17 
18 /* SSA-based register allocator */
19 
20 enum ra_class {
21    /* General purpose register */
22    RA_GPR,
23 
24    /* Memory, used to assign stack slots */
25    RA_MEM,
26 
27    /* Keep last */
28    RA_CLASSES,
29 };
30 
31 static inline enum ra_class
32 ra_class_for_index(agx_index idx)
33 {
34    return idx.memory ? RA_MEM : RA_GPR;
35 }
36 
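/*
 * Union-find (disjoint-set) node. SSA values connected by phis are grouped
 * into webs (e.g. b = phi(a, c) places a, b and c in one web), and the
 * allocator tries to give a whole web a single register so phi copies can be
 * coalesced away.
 */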
37 struct phi_web_node {
38    /* Parent index, or circular for root */
39    uint32_t parent;
40 
41    /* If root, assigned register, or ~0 if no register assigned. */
42    uint16_t reg;
43    bool assigned;
44 
45    /* Rank, at most log2(n), so ~5 bits suffice */
46    uint8_t rank;
47 };
48 static_assert(sizeof(struct phi_web_node) == 8, "packed");
49 
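/* Find the representative (root) of x's web, compressing the path on the way. */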
50 static unsigned
51 phi_web_find(struct phi_web_node *web, unsigned x)
52 {
53    if (web[x].parent == x) {
54       /* Root */
55       return x;
56    } else {
57       /* Search up the tree */
58       unsigned root = x;
59       while (web[root].parent != root)
60          root = web[root].parent;
61 
62       /* Compress path. Second pass ensures O(1) memory usage. */
63       while (web[x].parent != x) {
64          unsigned temp = web[x].parent;
65          web[x].parent = root;
66          x = temp;
67       }
68 
69       return root;
70    }
71 }
72 
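/* Merge the webs containing x and y, using union-by-rank to keep trees shallow. */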
73 static void
74 phi_web_union(struct phi_web_node *web, unsigned x, unsigned y)
75 {
76    x = phi_web_find(web, x);
77    y = phi_web_find(web, y);
78 
79    if (x == y)
80       return;
81 
82    /* Union-by-rank: ensure x.rank >= y.rank */
83    if (web[x].rank < web[y].rank) {
84       unsigned temp = x;
85       x = y;
86       y = temp;
87    }
88 
89    web[y].parent = x;
90 
91    /* Increment rank if necessary */
92    if (web[x].rank == web[y].rank) {
93       web[x].rank++;
94    }
95 }
96 
97 struct ra_ctx {
98    agx_context *shader;
99    agx_block *block;
100    agx_instr *instr;
101    uint16_t *ssa_to_reg;
102    uint8_t *ncomps;
103    enum agx_size *sizes;
104    enum ra_class *classes;
105    BITSET_WORD *visited;
106    BITSET_WORD *used_regs[RA_CLASSES];
107 
108    /* Maintained while assigning registers */
109    unsigned *max_reg[RA_CLASSES];
110 
111    /* For affinities */
112    agx_instr **src_to_collect_phi;
113    struct phi_web_node *phi_web;
114 
115    /* If bit i of used_regs is set, and register i is the first consecutive
116     * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
117     * value currently in register i.
118     *
119     * Only for GPRs. We can add reg classes later if we have a use case.
120     */
121    uint32_t reg_to_ssa[AGX_NUM_REGS];
122 
123    /* Maximum number of registers that RA is allowed to use */
124    unsigned bound[RA_CLASSES];
125 };
126 
127 enum agx_size
128 agx_split_width(const agx_instr *I)
129 {
130    enum agx_size width = ~0;
131 
132    agx_foreach_dest(I, d) {
133       if (I->dest[d].type == AGX_INDEX_NULL)
134          continue;
135       else if (width != ~0)
136          assert(width == I->dest[d].size);
137       else
138          width = I->dest[d].size;
139    }
140 
141    assert(width != ~0 && "should have been DCE'd");
142    return width;
143 }
144 
145 /*
146  * Calculate register demand in 16-bit registers, while gathering widths and
147  * classes. Because we allocate in SSA form, this calculation is exact and
148  * runs in linear time. Depends on liveness information.
149  */
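/*
 * Rough illustration with hypothetical values: in a shader with control flow,
 * the nesting counter contributes 1; a live-in 32-bit scalar contributes 2
 * half-registers and a live-in 3x32-bit vector contributes
 * util_next_power_of_two(6) = 8, for a starting demand of 11. Each
 * instruction then subtracts the widths of sources it kills and adds the
 * rounded widths of its destinations, tracking the rolling maximum.
 */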
150 static unsigned
151 agx_calc_register_demand(agx_context *ctx)
152 {
153    uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
154    enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
155 
156    agx_foreach_instr_global(ctx, I) {
157       agx_foreach_ssa_dest(I, d) {
158          unsigned v = I->dest[d].value;
159          assert(widths[v] == 0 && "broken SSA");
160          /* Round up vectors for easier live range splitting */
161          widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
162          classes[v] = ra_class_for_index(I->dest[d]);
163       }
164    }
165 
166    /* Calculate demand at the start of each block based on live-in, then update
167     * for each instruction processed. Calculate rolling maximum.
168     */
169    unsigned max_demand = 0;
170 
171    agx_foreach_block(ctx, block) {
172       unsigned demand = 0;
173 
174       /* RA treats the nesting counter as alive throughout if control flow is
175        * used anywhere. This could be optimized.
176        */
177       if (ctx->any_cf)
178          demand++;
179 
180       if (ctx->any_quad_divergent_shuffle)
181          demand++;
182 
183       if (ctx->has_spill_pcopy_reserved)
184          demand = 8;
185 
186       /* Everything live-in */
187       {
188          int i;
189          BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
190             if (classes[i] == RA_GPR)
191                demand += widths[i];
192          }
193       }
194 
195       max_demand = MAX2(demand, max_demand);
196 
197       /* To handle non-power-of-two vectors, sometimes live range splitting
198        * needs extra registers for 1 instruction. This counter tracks the number
199        * of registers to be freed after 1 extra instruction.
200        */
201       unsigned late_kill_count = 0;
202 
203       agx_foreach_instr_in_block(block, I) {
204          /* Phis happen in parallel and are already accounted for in the live-in
205           * set; just skip them so we don't double count.
206           */
207          if (I->op == AGX_OPCODE_PHI)
208             continue;
209 
210          if (I->op == AGX_OPCODE_PRELOAD) {
211             unsigned size = agx_size_align_16(I->src[0].size);
212             max_demand = MAX2(max_demand, I->src[0].value + size);
213          } else if (I->op == AGX_OPCODE_EXPORT) {
214             unsigned size = agx_size_align_16(I->src[0].size);
215             max_demand = MAX2(max_demand, I->imm + size);
216          }
217 
218          /* Handle late-kill registers from last instruction */
219          demand -= late_kill_count;
220          late_kill_count = 0;
221 
222          /* Kill sources the first time we see them */
223          agx_foreach_src(I, s) {
224             if (!I->src[s].kill)
225                continue;
226             assert(I->src[s].type == AGX_INDEX_NORMAL);
227             if (ra_class_for_index(I->src[s]) != RA_GPR)
228                continue;
229 
230             bool skip = false;
231 
232             for (unsigned backwards = 0; backwards < s; ++backwards) {
233                if (agx_is_equiv(I->src[backwards], I->src[s])) {
234                   skip = true;
235                   break;
236                }
237             }
238 
239             if (!skip)
240                demand -= widths[I->src[s].value];
241          }
242 
243          /* Make destinations live */
244          agx_foreach_ssa_dest(I, d) {
245             if (ra_class_for_index(I->dest[d]) != RA_GPR)
246                continue;
247 
248             /* Live range splits allocate at power-of-two granularity. Round up
249              * destination sizes (temporarily) to powers-of-two.
250              */
251             unsigned real_width = widths[I->dest[d].value];
252             unsigned pot_width = util_next_power_of_two(real_width);
253 
254             demand += pot_width;
255             late_kill_count += (pot_width - real_width);
256          }
257 
258          max_demand = MAX2(demand, max_demand);
259       }
260 
261       demand -= late_kill_count;
262    }
263 
264    free(widths);
265    free(classes);
266    return max_demand;
267 }
268 
269 static bool
270 find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
271                  unsigned align, unsigned *out)
272 {
273    for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
274       if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
275          *out = reg;
276          return true;
277       }
278    }
279 
280    return false;
281 }
282 
283 /*
284  * Search the register file for the best contiguous aligned region of the given
285  * size to evict when shuffling registers. The region must not contain any
286  * register marked in the passed bitset.
287  *
288  * As a hint, this also takes in the set of registers from killed sources passed
289  * to this instruction. These should be deprioritized, since they are more
290  * expensive to use (extra moves to shuffle the contents away).
291  *
292  * Precondition: such a region exists.
293  *
294  * Postcondition: at least one register in the returned region is already free.
295  */
296 static unsigned
297 find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
298                           BITSET_WORD *already_evicted, BITSET_WORD *killed)
299 {
300    assert(util_is_power_of_two_or_zero(size) && "precondition");
301    assert((rctx->bound[cls] % size) == 0 &&
302           "register file size must be aligned to the maximum vector size");
303    assert(cls == RA_GPR);
304 
305    unsigned best_base = ~0;
306    unsigned best_moves = ~0;
307 
308    /* Beginning region evictability condition */
309    bool r0_evictable =
310       !rctx->shader->any_cf && !rctx->shader->has_spill_pcopy_reserved;
311 
312    assert(!(r0_evictable && rctx->shader->any_quad_divergent_shuffle));
313 
314    for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
315       /* The first k registers are preallocated and unevictable, so must be
316        * skipped. By itself, this does not pose a problem. We are allocating n
317        * registers, but this region has at most n-k free. Since there are at
318        * least n free registers total, there are at least k free registers
319        * outside this region. Choose any such free register. The region
320        * containing it has at most n-1 occupied registers. In the worst case,
321        * n-k of those registers are moved to the beginning region and the
322        * remaining (n-1)-(n-k) = k-1 registers are moved to the k-1 free
323        * registers in other regions, given there are k free registers total.
324        * These recursive shuffles work out because everything is power-of-two
325        * sized and naturally aligned, so the sizes shuffled are strictly
326        * descending. So, we do not need extra registers to handle "single
327        * region" unevictability.
328        */
329       if (base == 0 && !r0_evictable)
330          continue;
331 
332       /* Do not evict the same register multiple times. It's not necessary since
333        * we're just shuffling, there are enough free registers elsewhere.
334        */
335       if (BITSET_TEST_RANGE(already_evicted, base, base + size - 1))
336          continue;
337 
338       /* Estimate the number of moves required if we pick this region */
339       unsigned moves = 0;
340       bool any_free = false;
341 
342       for (unsigned reg = base; reg < base + size; ++reg) {
343          /* We need a move for each blocked register (TODO: we only need a
344           * single move for 32-bit pairs, could optimize to use that instead.)
345           */
346          if (BITSET_TEST(rctx->used_regs[cls], reg))
347             moves++;
348          else
349             any_free = true;
350 
351          /* Each clobbered killed register requires a move or a swap. Since
352           * swaps require more instructions, assign a higher cost here. In
353           * practice, 3 is too high but 2 is slightly better than 1.
354           */
355          if (BITSET_TEST(killed, reg))
356             moves += 2;
357       }
358 
359       /* Pick the region requiring fewest moves as a heuristic. Regions with no
360        * free registers are skipped even if the heuristic estimates a lower cost
361        * (due to killed sources), since the recursive splitting algorithm
362        * requires at least one free register.
363        */
364       if (any_free && moves < best_moves) {
365          best_moves = moves;
366          best_base = base;
367       }
368    }
369 
370    assert(best_base < rctx->bound[cls] &&
371           "not enough registers (should have spilled already)");
372    return best_base;
373 }
374 
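/* Record the register (or memory slot) assigned to an SSA value and bump the
 * per-class high-water mark used to report register/stack usage.
 */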
375 static void
376 set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
377 {
378    enum ra_class cls = rctx->classes[ssa];
379 
380    *(rctx->max_reg[cls]) =
381       MAX2(*(rctx->max_reg[cls]), reg + rctx->ncomps[ssa] - 1);
382 
383    rctx->ssa_to_reg[ssa] = reg;
384 }
385 
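/*
 * Allocate a contiguous range for a destination by evicting a partially
 * blocked region: blocking variables are recursively reassigned elsewhere,
 * the required moves are appended to "copies", and "clobbered" is updated so
 * killed sources can be patched up afterwards. Returns the base register.
 */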
386 static unsigned
387 assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
388                        const agx_instr *I, struct util_dynarray *copies,
389                        BITSET_WORD *clobbered, BITSET_WORD *killed,
390                        enum ra_class cls)
391 {
392    assert(cls == RA_GPR);
393 
394    /* Expand the destination to the next power-of-two size. This simplifies
395     * splitting and is accounted for by the demand calculation, so is legal.
396     */
397    unsigned count = util_next_power_of_two(npot_count);
398    assert(align <= count && "still aligned");
399    align = count;
400 
401    /* There's not enough contiguous room in the register file. We need to
402     * shuffle some variables around. Look for a range of the register file
403     * that is partially blocked.
404     */
405    unsigned base =
406       find_best_region_to_evict(rctx, cls, count, clobbered, killed);
407 
408    assert(count <= 16 && "max allocation size (conservative)");
409    BITSET_DECLARE(evict_set, 16) = {0};
410 
411    /* Store the set of blocking registers that need to be evicted */
412    for (unsigned i = 0; i < count; ++i) {
413       if (BITSET_TEST(rctx->used_regs[cls], base + i)) {
414          BITSET_SET(evict_set, i);
415       }
416    }
417 
418    /* We are going to allocate the destination to this range, so it is now fully
419     * used. Mark it as such so we don't reassign here later.
420     */
421    BITSET_SET_RANGE(rctx->used_regs[cls], base, base + count - 1);
422 
423    /* Before overwriting the range, we need to evict blocked variables */
424    for (unsigned i = 0; i < 16; ++i) {
425       /* Look for subranges that need eviction */
426       if (!BITSET_TEST(evict_set, i))
427          continue;
428 
429       unsigned reg = base + i;
430       uint32_t ssa = rctx->reg_to_ssa[reg];
431       uint32_t nr = rctx->ncomps[ssa];
432       unsigned align = agx_size_align_16(rctx->sizes[ssa]);
433 
434       assert(nr >= 1 && "must be assigned");
435       assert(rctx->ssa_to_reg[ssa] == reg &&
436              "variable must start within the range, since vectors are limited");
437 
438       for (unsigned j = 0; j < nr; ++j) {
439          assert(BITSET_TEST(evict_set, i + j) &&
440                 "variable is allocated contiguously and vectors are limited, "
441                 "so evicted in full");
442       }
443 
444       /* Assign a new location for the variable. The recursion terminates
445        * because nr strictly decreases: the gap in the range keeps nr < count.
446        */
447       assert(nr < count && "fully contained in range that's not full");
448       unsigned new_reg = assign_regs_by_copying(rctx, nr, align, I, copies,
449                                                 clobbered, killed, cls);
450 
451       /* Copy the variable over, register by register */
452       for (unsigned i = 0; i < nr; i += align) {
453          assert(cls == RA_GPR);
454 
455          struct agx_copy copy = {
456             .dest = new_reg + i,
457             .src = agx_register(reg + i, rctx->sizes[ssa]),
458          };
459 
460          assert((copy.dest % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
461                 "new dest must be aligned");
462          assert((copy.src.value % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
463                 "src must be aligned");
464          util_dynarray_append(copies, struct agx_copy, copy);
465       }
466 
467       /* Mark down the set of clobbered registers, so that killed sources may be
468        * handled correctly later.
469        */
470       BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);
471 
472       /* Update bookkeeping for this variable */
473       assert(cls == rctx->classes[ssa]);
474       set_ssa_to_reg(rctx, ssa, new_reg);
475       rctx->reg_to_ssa[new_reg] = ssa;
476 
477       /* Skip to the next variable */
478       i += nr - 1;
479    }
480 
481    /* We overallocated for non-power-of-two vectors. Free up the excess now.
482     * This is modelled as late kill in demand calculation.
483     */
484    if (npot_count != count) {
485       BITSET_CLEAR_RANGE(rctx->used_regs[cls], base + npot_count,
486                          base + count - 1);
487    }
488 
489    return base;
490 }
491 
492 static int
493 sort_by_size(const void *a_, const void *b_, void *sizes_)
494 {
495    const enum agx_size *sizes = sizes_;
496    const unsigned *a = a_, *b = b_;
497 
498    return sizes[*b] - sizes[*a];
499 }
500 
501 /*
502  * Allocating a destination of n consecutive registers may require moving those
503  * registers' contents to the locations of killed sources. For the instruction
504  * to read the correct values, the killed sources themselves need to be moved to
505  * the space where the destination will go.
506  *
507  * This is legal because there is no interference between the killed source and
508  * the destination. This is always possible because, after this insertion, the
509  * destination needs to contain the killed sources already overlapping with the
510  * destination (size k) plus the killed sources clobbered to make room for
511  * livethrough sources overlapping with the destination (at most size |dest|-k),
512  * so the total size is at most k + |dest| - k = |dest| and so fits in the dest.
513  * Sorting by alignment may be necessary.
514  */
515 static void
516 insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
517                                    unsigned count, const agx_instr *I,
518                                    struct util_dynarray *copies,
519                                    BITSET_WORD *clobbered)
520 {
521    unsigned vars[16] = {0};
522    unsigned nr_vars = 0;
523 
524    /* Precondition: the nesting counter is not overwritten. Therefore we do not
525     * have to move it.  find_best_region_to_evict knows better than to try.
526     */
527    assert(!(reg == 0 && rctx->shader->any_cf) && "r0l is never moved");
528    assert(!(reg == 1 && rctx->shader->any_quad_divergent_shuffle) &&
529           "r0h is never moved");
530 
531    /* Consider the destination clobbered for the purpose of source collection.
532     * This way, killed sources already in the destination will be preserved
533     * (though possibly compacted).
534     */
535    BITSET_SET_RANGE(clobbered, reg, reg + count - 1);
536 
537    /* Collect killed clobbered sources, if any */
538    agx_foreach_ssa_src(I, s) {
539       unsigned reg = rctx->ssa_to_reg[I->src[s].value];
540 
541       if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
542           BITSET_TEST(clobbered, reg)) {
543 
544          assert(nr_vars < ARRAY_SIZE(vars) &&
545                 "cannot clobber more than max variable size");
546 
547          vars[nr_vars++] = I->src[s].value;
548       }
549    }
550 
551    if (nr_vars == 0)
552       return;
553 
554    assert(I->op != AGX_OPCODE_PHI && "kill bit not set for phis");
555 
556    /* Sort by descending alignment so they are packed with natural alignment */
557    util_qsort_r(vars, nr_vars, sizeof(vars[0]), sort_by_size, rctx->sizes);
558 
559    /* Reassign in the destination region */
560    unsigned base = reg;
561 
562    /* We align vectors to their sizes, so this assertion holds as long as no
563     * instruction has a source whose scalar size is greater than the entire size
564     * of the vector destination. Yet the killed source must fit within this
565     * destination, so the destination must be bigger and therefore have bigger
566     * alignment.
567     */
568    assert((base % agx_size_align_16(rctx->sizes[vars[0]])) == 0 &&
569           "destination alignment >= largest killed source alignment");
570 
571    for (unsigned i = 0; i < nr_vars; ++i) {
572       unsigned var = vars[i];
573       unsigned var_base = rctx->ssa_to_reg[var];
574       unsigned var_count = rctx->ncomps[var];
575       unsigned var_align = agx_size_align_16(rctx->sizes[var]);
576 
577       assert(rctx->classes[var] == RA_GPR && "construction");
578       assert((base % var_align) == 0 && "induction");
579       assert((var_count % var_align) == 0 && "no partial variables");
580 
581       for (unsigned j = 0; j < var_count; j += var_align) {
582          struct agx_copy copy = {
583             .dest = base + j,
584             .src = agx_register(var_base + j, rctx->sizes[var]),
585          };
586 
587          util_dynarray_append(copies, struct agx_copy, copy);
588       }
589 
590       set_ssa_to_reg(rctx, var, base);
591       rctx->reg_to_ssa[base] = var;
592 
593       base += var_count;
594    }
595 
596    assert(base <= reg + count && "no overflow");
597 }
598 
599 /*
600  * When shuffling registers to assign a phi destination, we can't simply insert
601  * the required moves before the phi, since phis happen in parallel along the
602  * edge. Instead, there are two cases:
603  *
604  * 1. The source of the copy is the destination of a phi. Since we are
605  *    emitting shuffle code, there will be no more reads of that destination
606  *    with the old register. Since the phis all happen in parallel and writes
607  *    precede reads, there was no previous read of that destination either. So
608  *    the old destination is dead. Just replace the phi's destination with the
609  *    move's destination instead.
610  *
611  * 2. Otherwise, the source of the copy is a live-in value, since it's
612  *    live when assigning phis at the start of a block but it is not a phi.
613  *    If we move in parallel with the phi, the phi will still read the correct
614  *    old register regardless and the destinations can't alias. So, insert a phi
615  *    to do the copy in parallel along the incoming edges.
616  */
617 static void
618 agx_emit_move_before_phi(agx_context *ctx, agx_block *block,
619                          struct agx_copy *copy)
620 {
621    assert(!copy->dest_mem && !copy->src.memory && "no memory shuffles");
622 
623    /* Look for the phi writing the destination */
624    agx_foreach_phi_in_block(block, phi) {
625       if (agx_is_equiv(phi->dest[0], copy->src) && !phi->dest[0].memory) {
626          phi->dest[0].value = copy->dest;
627          return;
628       }
629    }
630 
631    /* There wasn't such a phi, so it's live-in. Insert a phi instead. */
632    agx_builder b = agx_init_builder(ctx, agx_before_block(block));
633 
634    agx_instr *phi = agx_phi_to(&b, agx_register_like(copy->dest, copy->src),
635                                agx_num_predecessors(block));
636 
637    agx_foreach_src(phi, s) {
638       phi->src[s] = copy->src;
639    }
640 }
641 
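/*
 * Find count consecutive registers or memory slots (count == align) for
 * destination dest_idx of I. Try a simple scan of the register file first;
 * if it is too fragmented, fall back to live range splitting, shuffling
 * blocking values out of the way with parallel copies inserted before I
 * (with special handling when I is a phi).
 */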
642 static unsigned
643 find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
644           unsigned align)
645 {
646    unsigned reg;
647    assert(count == align);
648 
649    enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);
650 
651    if (find_regs_simple(rctx, cls, count, align, &reg)) {
652       return reg;
653    } else {
654       assert(cls == RA_GPR && "no memory live range splits");
655 
656       BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
657       BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
658       struct util_dynarray copies = {0};
659       util_dynarray_init(&copies, NULL);
660 
661       /* Initialize the set of registers killed by this instruction's sources */
662       agx_foreach_ssa_src(I, s) {
663          unsigned v = I->src[s].value;
664 
665          if (BITSET_TEST(rctx->visited, v) && !I->src[s].memory) {
666             unsigned base = rctx->ssa_to_reg[v];
667             unsigned nr = rctx->ncomps[v];
668 
669             assert(base + nr <= AGX_NUM_REGS);
670             BITSET_SET_RANGE(killed, base, base + nr - 1);
671          }
672       }
673 
674       reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
675                                    killed, cls);
676       insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
677                                          clobbered);
678 
679       /* Insert the necessary copies. Phis need special handling since we can't
680        * insert instructions before the phi.
681        */
682       if (I->op == AGX_OPCODE_PHI) {
683          util_dynarray_foreach(&copies, struct agx_copy, copy) {
684             agx_emit_move_before_phi(rctx->shader, rctx->block, copy);
685          }
686       } else {
687          agx_builder b = agx_init_builder(rctx->shader, agx_before_instr(I));
688          agx_emit_parallel_copies(
689             &b, copies.data,
690             util_dynarray_num_elements(&copies, struct agx_copy));
691       }
692 
693       util_dynarray_fini(&copies);
694 
695       /* assign_regs asserts this is cleared, so clear to be reassigned */
696       BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
697       return reg;
698    }
699 }
700 
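/* Scan a predecessor's end-of-block map to find the register that held ssa. */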
701 static uint32_t
702 search_ssa_to_reg_out(struct ra_ctx *ctx, struct agx_block *blk,
703                       enum ra_class cls, unsigned ssa)
704 {
705    for (unsigned reg = 0; reg < ctx->bound[cls]; ++reg) {
706       if (blk->reg_to_ssa_out[cls][reg] == ssa)
707          return reg;
708    }
709 
710    unreachable("variable not defined in block");
711 }
712 
713 /*
714  * Loop over live-in values at the start of the block and mark their registers
715  * as in-use. We process blocks in dominance order, so this handles everything
716  * but loop headers.
717  *
718  * For loop headers, this handles the forward edges but not the back edge.
719  * However, that's okay: we don't want to reserve the registers that are
720  * defined within the loop, because then we'd get a contradiction. Instead we
721  * leave them available and then they become fixed points of a sort.
722  */
723 static void
724 reserve_live_in(struct ra_ctx *rctx)
725 {
726    /* If there are no predecessors, there is nothing live-in */
727    unsigned nr_preds = agx_num_predecessors(rctx->block);
728    if (nr_preds == 0)
729       return;
730 
731    agx_builder b =
732       agx_init_builder(rctx->shader, agx_before_block(rctx->block));
733 
734    int i;
735    BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
736       /* Skip values defined in loops when processing the loop header */
737       if (!BITSET_TEST(rctx->visited, i))
738          continue;
739 
740       unsigned base;
741       enum ra_class cls = rctx->classes[i];
742 
743       /* If we split live ranges, the variable might be defined differently at
744        * the end of each predecessor. Join them together with a phi inserted at
745        * the start of the block.
746        */
747       if (nr_preds > 1) {
748          /* We'll fill in the destination after, to coalesce one of the moves */
749          agx_instr *phi = agx_phi_to(&b, agx_null(), nr_preds);
750          enum agx_size size = rctx->sizes[i];
751 
752          agx_foreach_predecessor(rctx->block, pred) {
753             unsigned pred_idx = agx_predecessor_index(rctx->block, *pred);
754 
755             if ((*pred)->reg_to_ssa_out[cls] == NULL) {
756                /* If this is a loop header, we don't know where the register
757                 * will end up. So, we create a phi conservatively but don't fill
758                 * it in until the end of the loop. Stash the information
759                 * we'll need to fill in the real register later.
760                 */
761                assert(rctx->block->loop_header);
762                phi->src[pred_idx] = agx_get_index(i, size);
763                phi->src[pred_idx].memory = rctx->classes[i] == RA_MEM;
764             } else {
765                /* Otherwise, we can build the phi now */
766                unsigned reg = search_ssa_to_reg_out(rctx, *pred, cls, i);
767                phi->src[pred_idx] = cls == RA_MEM
768                                        ? agx_memory_register(reg, size)
769                                        : agx_register(reg, size);
770             }
771          }
772 
773          /* Pick the phi destination to coalesce a move. Predecessor ordering is
774           * stable, so this means all live-in values get their registers from a
775           * particular predecessor. That means that such a register allocation
776           * is valid here, because it was valid in the predecessor.
777           */
778          phi->dest[0] = phi->src[0];
779          base = phi->dest[0].value;
780       } else {
781          /* If we don't emit a phi, there is already a unique register */
782          assert(nr_preds == 1);
783 
784          agx_block **pred = util_dynarray_begin(&rctx->block->predecessors);
785          /* TODO: Flip logic to eliminate the search */
786          base = search_ssa_to_reg_out(rctx, *pred, cls, i);
787       }
788 
789       set_ssa_to_reg(rctx, i, base);
790 
791       for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
792          BITSET_SET(rctx->used_regs[cls], base + j);
793 
794          if (cls == RA_GPR)
795             rctx->reg_to_ssa[base + j] = i;
796       }
797    }
798 }
799 
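/*
 * Commit a register assignment for SSA value v: update the maps, mark v as
 * visited, mark its registers as used, and remember the choice for v's phi
 * web so other members of the web prefer the same register.
 */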
800 static void
801 assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
802 {
803    enum ra_class cls = ra_class_for_index(v);
804    assert(reg < rctx->bound[cls] && "must not overflow register file");
805    assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
806    set_ssa_to_reg(rctx, v.value, reg);
807 
808    assert(!BITSET_TEST(rctx->visited, v.value) && "SSA violated");
809    BITSET_SET(rctx->visited, v.value);
810 
811    assert(rctx->ncomps[v.value] >= 1);
812    unsigned end = reg + rctx->ncomps[v.value] - 1;
813 
814    assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
815           "no interference");
816    BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);
817 
818    if (cls == RA_GPR)
819       rctx->reg_to_ssa[reg] = v.value;
820 
821    /* Phi webs need to remember which register they're assigned to */
822    struct phi_web_node *node =
823       &rctx->phi_web[phi_web_find(rctx->phi_web, v.value)];
824 
825    if (!node->assigned) {
826       node->reg = reg;
827       node->assigned = true;
828    }
829 }
830 
831 static void
832 agx_set_sources(struct ra_ctx *rctx, agx_instr *I)
833 {
834    assert(I->op != AGX_OPCODE_PHI);
835 
836    agx_foreach_ssa_src(I, s) {
837       assert(BITSET_TEST(rctx->visited, I->src[s].value) && "no phis");
838 
839       unsigned v = rctx->ssa_to_reg[I->src[s].value];
840       agx_replace_src(I, s, agx_register_like(v, I->src[s]));
841    }
842 }
843 
844 static void
845 agx_set_dests(struct ra_ctx *rctx, agx_instr *I)
846 {
847    agx_foreach_ssa_dest(I, s) {
848       unsigned v = rctx->ssa_to_reg[I->dest[s].value];
849       I->dest[s] =
850          agx_replace_index(I->dest[s], agx_register_like(v, I->dest[s]));
851    }
852 }
853 
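/* Where would the collect's destination need to start for source src to stay
 * in place? Returns ~0 if that base would be negative (underflow).
 */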
854 static unsigned
855 affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
856 {
857    unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
858    unsigned src_offset = src * agx_size_align_16(collect->src[src].size);
859 
860    if (src_reg >= src_offset)
861       return src_reg - src_offset;
862    else
863       return ~0;
864 }
865 
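/* Try to place a count-register value on top of ssa's existing assignment so
 * the copy/phi between them can be coalesced. Fails if ssa is not yet
 * assigned (only legal when may_be_unvisited) or its range is occupied.
 */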
866 static bool
867 try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
868                   bool may_be_unvisited, unsigned *out)
869 {
870    assert(ssa.type == AGX_INDEX_NORMAL);
871    if (!BITSET_TEST(rctx->visited, ssa.value)) {
872       assert(may_be_unvisited);
873       return false;
874    }
875 
876    unsigned base = rctx->ssa_to_reg[ssa.value];
877    enum ra_class cls = ra_class_for_index(ssa);
878 
879    if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
880       return false;
881 
882    assert(base + count <= rctx->bound[cls] && "invariant");
883    *out = base;
884    return true;
885 }
886 
887 static unsigned
888 pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
889 {
890    agx_index idx = I->dest[d];
891    enum ra_class cls = ra_class_for_index(idx);
892    assert(idx.type == AGX_INDEX_NORMAL);
893 
894    unsigned count = rctx->ncomps[idx.value];
895    assert(count >= 1);
896 
897    unsigned align = count;
898 
899    /* Try to allocate entire phi webs compatibly */
900    unsigned phi_idx = phi_web_find(rctx->phi_web, idx.value);
901    if (rctx->phi_web[phi_idx].assigned) {
902       unsigned reg = rctx->phi_web[phi_idx].reg;
903       if ((reg % align) == 0 && reg + align < rctx->bound[cls] &&
904           !BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + align - 1))
905          return reg;
906    }
907 
908    /* Try to allocate moves compatibly with their sources */
909    if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_NORMAL &&
910        I->src[0].memory == I->dest[0].memory &&
911        I->src[0].size == I->dest[0].size) {
912 
913       unsigned out;
914       if (try_coalesce_with(rctx, I->src[0], count, false, &out))
915          return out;
916    }
917 
918    /* Try to allocate phis compatibly with their sources */
919    if (I->op == AGX_OPCODE_PHI) {
920       agx_foreach_ssa_src(I, s) {
921          /* Loop headers have phis with a source preceding the definition */
922          /* Loop headers have phis whose back-edge sources aren't defined yet */
923 
924          unsigned out;
925          if (try_coalesce_with(rctx, I->src[s], count, may_be_unvisited, &out))
926             return out;
927       }
928    }
929 
930    /* Try to allocate collects compatibly with their sources */
931    if (I->op == AGX_OPCODE_COLLECT) {
932       agx_foreach_ssa_src(I, s) {
933          assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
934                 "registers assigned in an order compatible with dominance "
935                 "and this is not a phi node, so we have assigned a register");
936 
937          unsigned base = affinity_base_of_collect(rctx, I, s);
938          if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
939             continue;
940 
941          /* Unaligned destinations can happen when dest size > src size */
942          if (base % align)
943             continue;
944 
945          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
946             return base;
947       }
948    }
949 
950    /* Try to coalesce scalar exports */
951    agx_instr *collect_phi = rctx->src_to_collect_phi[idx.value];
952    if (collect_phi && collect_phi->op == AGX_OPCODE_EXPORT) {
953       unsigned reg = collect_phi->imm;
954 
955       if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + align - 1) &&
956           (reg % align) == 0)
957          return reg;
958    }
959 
960    /* Try to coalesce vector exports */
961    if (collect_phi && collect_phi->op == AGX_OPCODE_SPLIT) {
962       if (collect_phi->dest[0].type == AGX_INDEX_NORMAL) {
963          agx_instr *exp = rctx->src_to_collect_phi[collect_phi->dest[0].value];
964          if (exp && exp->op == AGX_OPCODE_EXPORT) {
965             unsigned reg = exp->imm;
966 
967             if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg,
968                                    reg + align - 1) &&
969                 (reg % align) == 0)
970                return reg;
971          }
972       }
973    }
974 
975    /* Try to allocate sources of collects contiguously */
976    if (collect_phi && collect_phi->op == AGX_OPCODE_COLLECT) {
977       agx_instr *collect = collect_phi;
978 
979       assert(count == align && "collect sources are scalar");
980 
981       /* Find our offset in the collect. If our source is repeated in the
982        * collect, this may not be unique. We arbitrarily choose the first.
983        */
984       unsigned our_source = ~0;
985       agx_foreach_ssa_src(collect, s) {
986          if (agx_is_equiv(collect->src[s], idx)) {
987             our_source = s;
988             break;
989          }
990       }
991 
992       assert(our_source < collect->nr_srcs && "source must be in the collect");
993 
994       /* See if we can allocate compatibly with any source of the collect */
995       agx_foreach_ssa_src(collect, s) {
996          if (!BITSET_TEST(rctx->visited, collect->src[s].value))
997             continue;
998 
999          /* Determine where the collect should start relative to the source */
1000          unsigned base = affinity_base_of_collect(rctx, collect, s);
1001          if (base >= rctx->bound[cls])
1002             continue;
1003 
1004          unsigned our_reg = base + (our_source * align);
1005 
1006          /* Don't allocate past the end of the register file */
1007          if ((our_reg + align) > rctx->bound[cls])
1008             continue;
1009 
1010          /* If those registers are free, then choose them */
1011          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
1012                                 our_reg + align - 1))
1013             return our_reg;
1014       }
1015 
1016       unsigned collect_align = rctx->ncomps[collect->dest[0].value];
1017       unsigned offset = our_source * align;
1018 
1019       /* Prefer ranges of the register file that leave room for all sources of
1020        * the collect contiguously.
1021        */
1022       for (unsigned base = 0;
1023            base + (collect->nr_srcs * align) <= rctx->bound[cls];
1024            base += collect_align) {
1025          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
1026                                 base + (collect->nr_srcs * align) - 1))
1027             return base + offset;
1028       }
1029 
1030       /* Try to respect the alignment requirement of the collect destination,
1031        * which may be greater than the sources (e.g. pack_64_2x32_split). Look
1032        * for a register for the source such that the collect base is aligned.
1033        */
1034       if (collect_align > align) {
1035          for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
1036               reg += collect_align) {
1037             if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
1038                return reg;
1039          }
1040       }
1041    }
1042 
1043    /* Try to allocate phi sources compatibly with their phis */
1044    if (collect_phi && collect_phi->op == AGX_OPCODE_PHI) {
1045       agx_instr *phi = collect_phi;
1046       unsigned out;
1047 
1048       agx_foreach_ssa_src(phi, s) {
1049          if (try_coalesce_with(rctx, phi->src[s], count, true, &out))
1050             return out;
1051       }
1052 
1053       /* If we're in a loop, we may have already allocated the phi. Try that. */
1054       if (phi->dest[0].type == AGX_INDEX_REGISTER) {
1055          unsigned base = phi->dest[0].value;
1056 
1057          if (base + count <= rctx->bound[cls] &&
1058              !BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
1059             return base;
1060       }
1061    }
1062 
1063    /* Default to any contiguous sequence of registers */
1064    return find_regs(rctx, I, d, count, align);
1065 }
1066 
1067 /** Assign registers to SSA values in a block. */
1068 
1069 static void
1070 agx_ra_assign_local(struct ra_ctx *rctx)
1071 {
1072    BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
1073    BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
1074    uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
1075 
1076    agx_block *block = rctx->block;
1077    uint8_t *ncomps = rctx->ncomps;
1078    rctx->used_regs[RA_GPR] = used_regs_gpr;
1079    rctx->used_regs[RA_MEM] = used_regs_mem;
1080    rctx->ssa_to_reg = ssa_to_reg;
1081 
1082    reserve_live_in(rctx);
1083 
1084    /* Force the nesting counter r0l live throughout shaders using control flow.
1085     * This could be optimized (sync with agx_calc_register_demand).
1086     */
1087    if (rctx->shader->any_cf)
1088       BITSET_SET(used_regs_gpr, 0);
1089 
1090    /* Force the zero r0h live throughout shaders using divergent shuffles. */
1091    if (rctx->shader->any_quad_divergent_shuffle) {
1092       assert(rctx->shader->any_cf);
1093       BITSET_SET(used_regs_gpr, 1);
1094    }
1095 
1096    /* Reserve bottom registers as temporaries for parallel copy lowering */
1097    if (rctx->shader->has_spill_pcopy_reserved) {
1098       BITSET_SET_RANGE(used_regs_gpr, 0, 7);
1099    }
1100 
1101    agx_foreach_instr_in_block(block, I) {
1102       rctx->instr = I;
1103 
1104       /* Optimization: if a split contains the last use of a vector, the split
1105        * can be removed by assigning the destinations overlapping the source.
1106        */
1107       if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
1108          assert(ra_class_for_index(I->src[0]) == RA_GPR);
1109          unsigned reg = ssa_to_reg[I->src[0].value];
1110          unsigned width = agx_size_align_16(agx_split_width(I));
1111 
1112          agx_foreach_dest(I, d) {
1113             assert(ra_class_for_index(I->dest[0]) == RA_GPR);
1114 
1115             /* Free up the source */
1116             unsigned offset_reg = reg + (d * width);
1117             BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
1118                                offset_reg + width - 1);
1119 
1120             /* Assign the destination where the source was */
1121             if (!agx_is_null(I->dest[d]))
1122                assign_regs(rctx, I->dest[d], offset_reg);
1123          }
1124 
1125          unsigned excess =
1126             rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
1127          if (excess) {
1128             BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
1129                                reg + rctx->ncomps[I->src[0].value] - 1);
1130          }
1131 
1132          agx_set_sources(rctx, I);
1133          agx_set_dests(rctx, I);
1134          continue;
1135       } else if (I->op == AGX_OPCODE_PRELOAD) {
1136          /* We must coalesce all preload moves */
1137          assert(I->dest[0].size == I->src[0].size);
1138          assert(I->src[0].type == AGX_INDEX_REGISTER);
1139 
1140          assign_regs(rctx, I->dest[0], I->src[0].value);
1141          agx_set_dests(rctx, I);
1142          continue;
1143       }
1144 
1145       /* First, free killed sources */
1146       agx_foreach_ssa_src(I, s) {
1147          if (I->src[s].kill) {
1148             assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");
1149 
1150             enum ra_class cls = ra_class_for_index(I->src[s]);
1151             unsigned reg = ssa_to_reg[I->src[s].value];
1152             unsigned count = ncomps[I->src[s].value];
1153 
1154             assert(count >= 1);
1155             BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
1156          }
1157       }
1158 
1159       /* Next, assign destinations one at a time. This is always legal
1160        * because of the SSA form.
1161        */
1162       agx_foreach_ssa_dest(I, d) {
1163          assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
1164       }
1165 
1166       /* Phi sources are special. Set in the corresponding predecessors */
1167       if (I->op != AGX_OPCODE_PHI)
1168          agx_set_sources(rctx, I);
1169 
1170       agx_set_dests(rctx, I);
1171    }
1172 
1173    for (unsigned i = 0; i < RA_CLASSES; ++i) {
1174       block->reg_to_ssa_out[i] =
1175          malloc(rctx->bound[i] * sizeof(*block->reg_to_ssa_out[i]));
1176 
1177       /* Initialize with sentinel so we don't have unused regs mapping to r0 */
1178       /* Initialize with a sentinel so unused registers don't map to SSA value 0 */
1179              rctx->bound[i] * sizeof(*block->reg_to_ssa_out[i]));
1180    }
1181 
1182    int i;
1183    BITSET_FOREACH_SET(i, block->live_out, rctx->shader->alloc) {
1184       block->reg_to_ssa_out[rctx->classes[i]][rctx->ssa_to_reg[i]] = i;
1185    }
1186 
1187    /* Also set the sources for the phis in our successors, since that logically
1188     * happens now (given the possibility of live range splits, etc)
1189     */
1190    agx_foreach_successor(block, succ) {
1191       unsigned pred_idx = agx_predecessor_index(succ, block);
1192 
1193       agx_foreach_phi_in_block(succ, phi) {
1194          if (phi->src[pred_idx].type == AGX_INDEX_NORMAL) {
1195             /* This source needs a fixup */
1196             unsigned value = phi->src[pred_idx].value;
1197 
1198             agx_replace_src(
1199                phi, pred_idx,
1200                agx_register_like(rctx->ssa_to_reg[value], phi->src[pred_idx]));
1201          }
1202       }
1203    }
1204 
1205    free(rctx->ssa_to_reg);
1206 }
1207 
1208 /*
1209  * Lower phis to parallel copies at the logical end of a given block. If a block
1210  * needs parallel copies inserted, a successor of the block has a phi node. To
1211  * have a (nontrivial) phi node, a block must have multiple predecessors. So the
1212  * edge from the block to the successor (with phi) is not the only edge entering
1213  * the successor. Because the control flow graph has no critical edges, this
1214  * edge must therefore be the only edge leaving the block, so the block must
1215  * have only a single successor.
1216  */
1217 static void
1218 agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
1219 {
1220    bool any_succ = false;
1221    unsigned nr_phi = 0;
1222 
1223    /* Phi nodes logically happen on the control flow edge, so parallel copies
1224     * are added at the end of the predecessor */
1225    agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
1226 
1227    agx_foreach_successor(block, succ) {
1228       assert(nr_phi == 0 && "control flow graph has a critical edge");
1229 
1230       agx_foreach_phi_in_block(succ, phi) {
1231          assert(!any_succ && "control flow graph has a critical edge");
1232          nr_phi += agx_channels(phi->dest[0]);
1233       }
1234 
1235       any_succ = true;
1236 
1237       /* Nothing to do if there are no phi nodes */
1238       if (nr_phi == 0)
1239          continue;
1240 
1241       unsigned pred_index = agx_predecessor_index(succ, block);
1242 
1243       /* Create a parallel copy lowering all the phi nodes */
1244       struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);
1245 
1246       unsigned i = 0;
1247 
1248       agx_foreach_phi_in_block(succ, phi) {
1249          agx_index dest = phi->dest[0];
1250          agx_index src = phi->src[pred_index];
1251 
1252          if (src.type == AGX_INDEX_IMMEDIATE)
1253             src.size = dest.size;
1254 
1255          assert(dest.type == AGX_INDEX_REGISTER);
1256          assert(dest.size == src.size);
1257 
1258          /* Scalarize the phi, since the parallel copy lowering doesn't handle
1259           * vector phis. While we scalarize phis in NIR, we can generate vector
1260           * phis from spilling so must take care.
1261           */
1262          for (unsigned c = 0; c < agx_channels(phi->dest[0]); ++c) {
1263             agx_index src_ = src;
1264             unsigned offs = c * agx_size_align_16(src.size);
1265 
1266             if (src.type != AGX_INDEX_IMMEDIATE) {
1267                assert(src.type == AGX_INDEX_UNIFORM ||
1268                       src.type == AGX_INDEX_REGISTER);
1269                src_.value += offs;
1270                src_.channels_m1 = 1 - 1;
1271             }
1272 
1273             assert(i < nr_phi);
1274             copies[i++] = (struct agx_copy){
1275                .dest = dest.value + offs,
1276                .dest_mem = dest.memory,
1277                .src = src_,
1278             };
1279          }
1280       }
1281 
1282       agx_emit_parallel_copies(&b, copies, nr_phi);
1283 
1284       free(copies);
1285    }
1286 }
1287 
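/* Lower export pseudo-instructions in the exit block to one parallel copy
 * into their fixed destination registers, growing max_reg accordingly.
 */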
1288 static void
1289 lower_exports(agx_context *ctx)
1290 {
1291    struct agx_copy copies[AGX_NUM_REGS];
1292    unsigned nr = 0;
1293    agx_block *block = agx_exit_block(ctx);
1294 
1295    agx_foreach_instr_in_block_safe(block, I) {
1296       if (I->op != AGX_OPCODE_EXPORT)
1297          continue;
1298 
1299       assert(agx_channels(I->src[0]) == 1 && "scalarized in frontend");
1300       assert(nr < ARRAY_SIZE(copies));
1301 
1302       copies[nr++] = (struct agx_copy){
1303          .dest = I->imm,
1304          .src = I->src[0],
1305       };
1306 
1307       /* We cannot use fewer registers than we export */
1308       ctx->max_reg =
1309          MAX2(ctx->max_reg, I->imm + agx_size_align_16(I->src[0].size));
1310    }
1311 
1312    agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
1313    agx_emit_parallel_copies(&b, copies, nr);
1314 }
1315 
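/*
 * Top-level register allocation: compute liveness and exact demand, spill if
 * demand exceeds the occupancy-derived bound, group phi-connected values into
 * webs, assign registers block by block in dominance order, then lower the RA
 * pseudo-instructions (collect/split/phi/preload/export) into parallel copies.
 */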
1316 void
1317 agx_ra(agx_context *ctx)
1318 {
1319    bool force_spilling =
1320       (agx_compiler_debug & AGX_DBG_SPILL) && ctx->key->has_scratch;
1321 
1322    /* Determine maximum possible registers. We won't exceed this! */
1323    unsigned max_possible_regs = AGX_NUM_REGS;
1324 
1325    /* Compute shaders need to have their entire workgroup together, so our
1326     * register usage is bounded by the workgroup size.
1327     */
1328    if (gl_shader_stage_is_compute(ctx->stage)) {
1329       unsigned threads_per_workgroup;
1330 
1331       /* If we don't know the workgroup size, worst case it. TODO: Optimize
1332        * this, since it'll decimate opencl perf.
1333        */
1334       if (ctx->nir->info.workgroup_size_variable) {
1335          threads_per_workgroup = 1024;
1336       } else {
1337          threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
1338                                  ctx->nir->info.workgroup_size[1] *
1339                                  ctx->nir->info.workgroup_size[2];
1340       }
1341 
1342       max_possible_regs =
1343          agx_max_registers_for_occupancy(threads_per_workgroup);
1344    }
1345 
1346    /* The helper program is unspillable and has a limited register file */
1347    if (force_spilling)
1348       max_possible_regs = 32;
1349    else if (ctx->key->is_helper)
1350       max_possible_regs = 32;
1351 
1352    /* Calculate the demand. We'll use it to determine if we need to spill and to
1353     * bound register assignment.
1354     */
1355    agx_compute_liveness(ctx);
1356    unsigned effective_demand = agx_calc_register_demand(ctx);
1357    bool spilling = (effective_demand > max_possible_regs);
1358 
1359    if (spilling) {
1360       assert(ctx->key->has_scratch && "internal shaders are unspillable");
1361       agx_spill(ctx, max_possible_regs);
1362 
1363       /* After spilling, recalculate liveness and demand */
1364       agx_compute_liveness(ctx);
1365       effective_demand = agx_calc_register_demand(ctx);
1366 
1367       /* The resulting program can now be assigned registers */
1368       assert(effective_demand <= max_possible_regs && "spiller post-condition");
1369    }
1370 
1371    /* Record all phi webs. First initialize the union-find data structure with
1372     * all SSA defs in their own singletons, then union together anything related
1373     * by a phi. The resulting union-find structure will be the webs.
1374     */
1375    struct phi_web_node *phi_web = calloc(ctx->alloc, sizeof(*phi_web));
1376    for (unsigned i = 0; i < ctx->alloc; ++i) {
1377       phi_web[i].parent = i;
1378    }
1379 
1380    agx_foreach_block(ctx, block) {
1381       agx_foreach_phi_in_block(block, phi) {
1382          agx_foreach_ssa_src(phi, s) {
1383             phi_web_union(phi_web, phi->dest[0].value, phi->src[s].value);
1384          }
1385       }
1386    }
1387 
1388    uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
1389    enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
1390    agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
1391    enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
1392    BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
1393    unsigned max_ncomps = 1;
1394 
1395    agx_foreach_instr_global(ctx, I) {
1396       /* Record collects/phis so we can coalesce when assigning */
1397       if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI ||
1398           I->op == AGX_OPCODE_EXPORT || I->op == AGX_OPCODE_SPLIT) {
1399          agx_foreach_ssa_src(I, s) {
1400             src_to_collect_phi[I->src[s].value] = I;
1401          }
1402       }
1403 
1404       agx_foreach_ssa_dest(I, d) {
1405          unsigned v = I->dest[d].value;
1406          assert(ncomps[v] == 0 && "broken SSA");
1407          /* Round up vectors for easier live range splitting */
1408          ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
1409          sizes[v] = I->dest[d].size;
1410          classes[v] = ra_class_for_index(I->dest[d]);
1411 
1412          max_ncomps = MAX2(max_ncomps, ncomps[v]);
1413       }
1414    }
1415 
1416    /* For live range splitting to work properly, ensure the register file is
1417     * aligned to the larger vector size. Most of the time, this is a no-op since
1418     * the largest vector size is usually 128-bit and the register file is
1419     * naturally 128-bit aligned. However, this is required for correctness with
1420     * 3D textureGrad, which can have a source vector of length 6x32-bit,
1421     * rounding up to 256-bit and requiring special accounting here.
1422     */
1423    unsigned reg_file_alignment = MAX2(max_ncomps, 8);
1424    assert(util_is_power_of_two_nonzero(reg_file_alignment));
1425 
1426    unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
1427    assert(demand <= max_possible_regs && "Invariant");
1428 
1429    /* Round up the demand to the maximum number of registers we can use without
1430     * affecting occupancy. This reduces live range splitting.
1431     */
1432    unsigned max_regs = agx_occupancy_for_register_count(demand).max_registers;
1433    if (ctx->key->is_helper || force_spilling)
1434       max_regs = max_possible_regs;
1435 
1436    max_regs = ROUND_DOWN_TO(max_regs, reg_file_alignment);
1437 
1438    /* Or, we can bound tightly for debugging */
1439    if (agx_compiler_debug & AGX_DBG_DEMAND)
1440       max_regs = ALIGN_POT(MAX2(demand, 12), reg_file_alignment);
1441 
1442    /* ...but not too tightly */
1443    assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
1444    assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
1445    assert(max_regs <= max_possible_regs);
1446 
1447    unsigned max_mem_slot = 0;
1448 
1449    /* Assign registers in dominance-order. This coincides with source-order due
1450     * to a NIR invariant, so we do not need special handling for this.
1451     */
1452    agx_foreach_block(ctx, block) {
1453       agx_ra_assign_local(&(struct ra_ctx){
1454          .shader = ctx,
1455          .block = block,
1456          .src_to_collect_phi = src_to_collect_phi,
1457          .phi_web = phi_web,
1458          .ncomps = ncomps,
1459          .sizes = sizes,
1460          .classes = classes,
1461          .visited = visited,
1462          .bound[RA_GPR] = max_regs,
1463          .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
1464          .max_reg[RA_GPR] = &ctx->max_reg,
1465          .max_reg[RA_MEM] = &max_mem_slot,
1466       });
1467    }
1468 
1469    if (spilling) {
1470       ctx->spill_base = ctx->scratch_size;
1471       ctx->scratch_size += (max_mem_slot + 1) * 2;
1472    }
1473 
1474    /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
1475     * doesn't use them. Account for that so the preload doesn't clobber GPRs.
1476     */
1477    if (ctx->nir->info.stage == MESA_SHADER_VERTEX)
1478       ctx->max_reg = MAX2(ctx->max_reg, 6 * 2);
1479 
1480    assert(ctx->max_reg <= max_regs);
1481 
1482    agx_foreach_instr_global_safe(ctx, ins) {
1483       /* Lower away RA pseudo-instructions */
1484       agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
1485 
1486       if (ins->op == AGX_OPCODE_COLLECT) {
1487          assert(ins->dest[0].type == AGX_INDEX_REGISTER);
1488          assert(!ins->dest[0].memory);
1489 
1490          unsigned base = ins->dest[0].value;
1491          unsigned width = agx_size_align_16(ins->src[0].size);
1492 
1493          struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
1494          unsigned n = 0;
1495 
1496          /* Move the sources */
1497          agx_foreach_src(ins, i) {
1498             if (agx_is_null(ins->src[i]) || ins->src[i].type == AGX_INDEX_UNDEF)
1499                continue;
1500             assert(ins->src[i].size == ins->src[0].size);
1501 
1502             assert(n < ins->nr_srcs);
1503             copies[n++] = (struct agx_copy){
1504                .dest = base + (i * width),
1505                .src = ins->src[i],
1506             };
1507          }
1508 
1509          agx_emit_parallel_copies(&b, copies, n);
1510          agx_remove_instruction(ins);
1511          continue;
1512       } else if (ins->op == AGX_OPCODE_SPLIT) {
1513          assert(ins->src[0].type == AGX_INDEX_REGISTER ||
1514                 ins->src[0].type == AGX_INDEX_UNIFORM);
1515 
1516          struct agx_copy copies[4];
1517          assert(ins->nr_dests <= ARRAY_SIZE(copies));
1518 
1519          unsigned n = 0;
1520          unsigned width = agx_size_align_16(agx_split_width(ins));
1521 
1522          /* Move the sources */
1523          agx_foreach_dest(ins, i) {
1524             if (ins->dest[i].type != AGX_INDEX_REGISTER)
1525                continue;
1526 
1527             assert(!ins->dest[i].memory);
1528 
1529             agx_index src = ins->src[0];
1530             src.size = ins->dest[i].size;
1531             src.channels_m1 = 0;
1532             src.value += (i * width);
1533 
1534             assert(n < ARRAY_SIZE(copies));
1535             copies[n++] = (struct agx_copy){
1536                .dest = ins->dest[i].value,
1537                .src = src,
1538             };
1539          }
1540 
1541          /* Lower away */
1542          agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
1543          agx_emit_parallel_copies(&b, copies, n);
1544          agx_remove_instruction(ins);
1545          continue;
1546       }
1547    }
1548 
1549    /* Insert parallel copies lowering phi nodes and exports */
1550    agx_foreach_block(ctx, block) {
1551       agx_insert_parallel_copies(ctx, block);
1552    }
1553 
1554    lower_exports(ctx);
1555 
1556    agx_foreach_instr_global_safe(ctx, I) {
1557       switch (I->op) {
1558       /* Pseudoinstructions for RA must be removed now */
1559       case AGX_OPCODE_PHI:
1560       case AGX_OPCODE_PRELOAD:
1561          agx_remove_instruction(I);
1562          break;
1563 
1564       /* Coalesced moves can be removed */
1565       case AGX_OPCODE_MOV:
1566          if (I->src[0].type == AGX_INDEX_REGISTER &&
1567              I->dest[0].size == I->src[0].size &&
1568              I->src[0].value == I->dest[0].value &&
1569              I->src[0].memory == I->dest[0].memory) {
1570 
1571             assert(I->dest[0].type == AGX_INDEX_REGISTER);
1572             agx_remove_instruction(I);
1573          }
1574          break;
1575 
1576       default:
1577          break;
1578       }
1579    }
1580 
1581    if (spilling)
1582       agx_lower_spill(ctx);
1583 
1584    agx_foreach_block(ctx, block) {
1585       for (unsigned i = 0; i < ARRAY_SIZE(block->reg_to_ssa_out); ++i) {
1586          free(block->reg_to_ssa_out[i]);
1587          block->reg_to_ssa_out[i] = NULL;
1588       }
1589    }
1590 
1591    free(phi_web);
1592    free(src_to_collect_phi);
1593    free(ncomps);
1594    free(sizes);
1595    free(classes);
1596    free(visited);
1597 }
1598