/*
 * Copyright 2023-2024 Alyssa Rosenzweig
 * Copyright 2023-2024 Valve Corporation
 * Copyright 2022 Collabora Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "util/bitset.h"
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/u_dynarray.h"
#include "util/u_qsort.h"
#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_opcodes.h"

/*
 * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form
 * Programs" by Braun and Hack.
 */

/*
 * Next-use distances are logically in ℤ ∪ {∞}. Modeled as saturating uint32 and
 * referred to as dist_t.
 *
 * next_uses represents a next-use map. This is a sparse data structure mapping
 * variable names to next-use dist_t's. Variables with no later use (infinite
 * next-use distance) are not stored explicitly, making the time/space
 * requirements O(live variables). This is important for performance and memory
 * usage on big shaders with many blocks.
 *
 * For now, next_uses is backed by a Mesa hash table, but it could be optimized
 * to something more specialized in the future.
 */
#define DIST_INFINITY (UINT32_MAX)
typedef uint32_t dist_t;

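/*
 * Saturating addition on dist_t: if either operand is DIST_INFINITY, or the
 * sum would overflow, the result sticks at DIST_INFINITY. For example, both
 * dist_sum(DIST_INFINITY, 5) and dist_sum(UINT32_MAX - 1, 10) yield
 * DIST_INFINITY.
 */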
static dist_t
dist_sum(dist_t A, dist_t B)
{
   return (A + B < A) ? DIST_INFINITY : (A + B);
}

struct next_uses {
   struct hash_table_u64 *ht;
};

static void
init_next_uses(struct next_uses *nu, void *memctx)
{
   nu->ht = _mesa_hash_table_u64_create(memctx);
}

static void
destroy_next_uses(struct next_uses *nu)
{
   _mesa_hash_table_u64_destroy(nu->ht);
}

static void
clear_next_uses(struct next_uses *nu)
{
   _mesa_hash_table_u64_clear(nu->ht);
}

static void
copy_next_uses(struct next_uses *nu, const struct next_uses *from)
{
   clear_next_uses(nu);

   hash_table_u64_foreach(from->ht, use) {
      _mesa_hash_table_u64_insert(nu->ht, use.key, use.data);
   }
}

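/*
 * Distances are stored biased by +1 so that a finite distance of 0 is still a
 * non-NULL hash table entry; infinite distances are represented by absence.
 */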
static void
set_next_use(struct next_uses *nu, unsigned node, dist_t dist)
{
   if (dist == DIST_INFINITY) {
      _mesa_hash_table_u64_remove(nu->ht, node);
   } else {
      uintptr_t as_ptr = (uintptr_t)(dist + 1);
      assert(as_ptr != 0 && "non-NULL");

      _mesa_hash_table_u64_insert(nu->ht, node, (void *)as_ptr);
   }
}

static dist_t
search_next_uses(const struct next_uses *nu, unsigned node)
{
   void *ent = _mesa_hash_table_u64_search(nu->ht, node);
   if (!ent)
      return DIST_INFINITY;

   uintptr_t raw = (uintptr_t)ent;
   return raw - 1;
}

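/* Iterate the finite (stored) entries of a next-use map, undoing the +1 bias */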
#define foreach_next_use(nu, node, dist)                                       \
   hash_table_u64_foreach((nu)->ht, use_)                                      \
      for (uint32_t _terminator = 1, node = use_.key,                          \
                    UNUSED dist = ((uintptr_t)use_.data) - 1;                  \
           _terminator != 0; _terminator = 0)

/*
 * Calculate the minimum of two next-use sets. Values absent from one of the
 * underlying sets are treated as infinity, so they never win the minimum;
 * the operation therefore acts like a set union.
 */
static bool
minimum_next_uses(struct next_uses *nu, const struct next_uses *from)
{
   bool progress = false;

   foreach_next_use(from, node, from_dist) {
      dist_t nu_dist = search_next_uses(nu, node);

      if (from_dist < nu_dist) {
         set_next_use(nu, node, from_dist);
         progress = true;
      }
   }

   return progress;
}

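/* Trivial cost model: every instruction is counted as a single cycle, so IPs
 * and next-use distances are effectively instruction counts.
 */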
static uint32_t
instr_cycles(const agx_instr *I)
{
   return 1;
}

struct spill_block {
   /* Set of values available in the register file at the end */
   unsigned W_exit[AGX_NUM_REGS];
   unsigned nW_exit;

   unsigned W_entry[AGX_NUM_REGS];
   unsigned nW_entry;

   /* Set of live-out spilled values at the end of the block */
   unsigned *S_exit;
   unsigned nS_exit;

   unsigned *S_entry;
   unsigned nS_entry;

   /* Estimated cycle count of the block */
   uint32_t cycles;

   /* Next-use maps at the start/end of the block */
   struct next_uses next_use_in;
   struct next_uses next_use_out;
};

struct spill_ctx {
   void *memctx;
   agx_context *shader;
   agx_block *block;

   /* Set of values currently available in the register file */
   BITSET_WORD *W;

   /* |W| = Current register pressure */
   unsigned nW;

   /* Local IPs of next-use */
   dist_t *next_uses;

   /* Current local IP relative to the start of the block */
   uint32_t ip;

   /* Set of live values that have been spilled. Contrary to the paper, this
    * is not a subset of W: the definition in the paper is bogus.
    */
   BITSET_WORD *S;

   /* Widths of vectors */
   uint8_t *channels;
   enum agx_size *size;

   /* Mapping of rematerializable values to their definitions, or NULL for
    * nodes that are not rematerializable.
    */
   agx_instr **remat;

   /* Maximum register pressure allowed */
   unsigned k;

   /* Number of variables */
   unsigned n;

   /* Information on blocks indexed in source order */
   struct spill_block *blocks;

   /* Base memory index reserved for spilled indices */
   unsigned spill_base;
};

static inline struct spill_block *
spill_block(struct spill_ctx *ctx, agx_block *block)
{
   return &ctx->blocks[block->index];
}

/* Calculate the register demand of a node. This is rounded up to a power-of-two
 * to match the equivalent calculations in RA.
 */
static unsigned
node_size(struct spill_ctx *ctx, unsigned node)
{
   return util_next_power_of_two(ctx->channels[node]) *
          agx_size_align_16(ctx->size[node]);
}

/*
 * Map a control flow edge to a block. Assumes no critical edges.
 */
static agx_block *
agx_edge_to_block(agx_block *pred, agx_block *succ)
{
   /* End of predecessor is unique if there's a single successor */
   if (agx_num_successors(pred) == 1)
      return pred;

   /* The predecessor has multiple successors, meaning this is not the only
    * edge leaving the predecessor. Therefore, it is the only edge entering
    * the successor (otherwise the edge would be critical), so the start of
    * the successor is unique.
    */
   assert(agx_num_predecessors(succ) == 1 && "critical edge detected");
   return succ;
}

/*
 * Get a cursor to insert along a control flow edge: either at the start of the
 * successor or the end of the predecessor. This relies on the control flow
 * graph having no critical edges.
 */
static agx_cursor
agx_along_edge(agx_block *pred, agx_block *succ)
{
   agx_block *to = agx_edge_to_block(pred, succ);

   if (to == pred)
      return agx_after_block_logical(pred);
   else
      return agx_before_block(succ);
}

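/* View a register-file variable as its reserved memory (spill slot)
 * counterpart, offset by mem_base.
 */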
static inline agx_index
agx_index_as_mem(agx_index idx, unsigned mem_base)
{
   assert(idx.type == AGX_INDEX_NORMAL);
   assert(!idx.memory);
   idx.memory = true;
   idx.value = mem_base + idx.value;
   return idx;
}

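/* Recover the register-file variable name from either a register index or a
 * memory index produced by agx_index_as_mem.
 */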
static unsigned
chase_mem_index(agx_index ref, unsigned mem_base)
{
   assert(ref.type == AGX_INDEX_NORMAL);
   return ref.memory ? (ref.value - mem_base) : ref.value;
}

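/* Rebuild a full vector index (size and channel count) for a variable name */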
static agx_index
reconstruct_index(struct spill_ctx *ctx, unsigned node)
{
   return agx_get_vec_index(node, ctx->size[node], ctx->channels[node]);
}

static bool
can_remat(agx_instr *I)
{
   switch (I->op) {
   case AGX_OPCODE_MOV_IMM:
   case AGX_OPCODE_GET_SR:
      return true;
   default:
      return false;
   }
}

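/* Re-emit the defining instruction of a rematerializable node, writing to dst */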
static agx_instr *
remat_to(agx_builder *b, agx_index dst, struct spill_ctx *ctx, unsigned node)
{
   agx_instr *I = ctx->remat[node];
   assert(can_remat(I));

   switch (I->op) {
   case AGX_OPCODE_MOV_IMM:
      return agx_mov_imm_to(b, dst, I->imm);
   case AGX_OPCODE_GET_SR:
      return agx_get_sr_to(b, dst, I->sr);
   default:
      unreachable("invalid remat");
   }
}

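/* Spill a value to its memory slot. Rematerializable values are never stored:
 * they are recreated at each reload instead.
 */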
static void
insert_spill(agx_builder *b, struct spill_ctx *ctx, unsigned node)
{
   if (!ctx->remat[node]) {
      agx_index idx = reconstruct_index(ctx, node);
      agx_mov_to(b, agx_index_as_mem(idx, ctx->spill_base), idx);
   }
}

static void
insert_reload(struct spill_ctx *ctx, agx_block *block, agx_cursor cursor,
              unsigned node)
{
   agx_builder b = agx_init_builder(ctx->shader, cursor);
   agx_index idx = reconstruct_index(ctx, node);

   /* Reloading breaks SSA, but agx_repair_ssa will repair */
   if (ctx->remat[node]) {
      remat_to(&b, idx, ctx, node);
   } else {
      agx_mov_to(&b, idx, agx_index_as_mem(idx, ctx->spill_base));
   }
}

/* Insert into the register file */
static void
insert_W(struct spill_ctx *ctx, unsigned v)
{
   assert(v < ctx->n);
   assert(!BITSET_TEST(ctx->W, v));

   BITSET_SET(ctx->W, v);
   ctx->nW += node_size(ctx, v);
}

/* Remove from the register file */
static void
remove_W(struct spill_ctx *ctx, unsigned v)
{
   assert(v < ctx->n);
   assert(BITSET_TEST(ctx->W, v));

   BITSET_CLEAR(ctx->W, v);
   ctx->nW -= node_size(ctx, v);
}

static void
remove_W_if_present(struct spill_ctx *ctx, unsigned v)
{
   assert(v < ctx->n);

   if (BITSET_TEST(ctx->W, v))
      remove_W(ctx, v);
}

struct candidate {
   unsigned node;
   dist_t dist;
};

static int
cmp_dist(const void *left_, const void *right_, void *ctx_)
{
   struct spill_ctx *ctx = ctx_;
   const struct candidate *left = left_;
   const struct candidate *right = right_;

   /* We assume that rematerializing - even before every instruction - is
    * cheaper than spilling. As long as one of the nodes is rematerializable
    * (with distance > 0), we choose it over spilling. Within a class of nodes
    * (rematerializable or not), compare by next-use-distance.
    */
   bool remat_left = ctx->remat[left->node] != NULL && left->dist > 0;
   bool remat_right = ctx->remat[right->node] != NULL && right->dist > 0;

   if (remat_left != remat_right)
      return remat_left ? 1 : -1;
   else
      return (left->dist > right->dist) - (left->dist < right->dist);
}
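
/*
 * For example, candidates {a: remat, dist 3}, {b: dist 1}, {c: dist 7} sort as
 * b, c, a: limit() below greedily keeps candidates in this order, so
 * rematerializable values and far-away next uses are evicted preferentially.
 */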

/*
 * Limit the register file W to maximum size m by evicting registers.
 */
static ATTRIBUTE_NOINLINE void
limit(struct spill_ctx *ctx, agx_instr *I, unsigned m)
{
   /* Nothing to do if we're already below the limit */
   if (ctx->nW <= m)
      return;

   /* Gather candidates for eviction. Note that next_uses gives IPs whereas
    * cmp_dist expects relative distances. This requires us to subtract ctx->ip
    * to ensure that cmp_dist works properly. Even though logically it shouldn't
    * affect the sorted order, practically this matters for correctness with
    * rematerialization. See the dist=0 test in cmp_dist.
    */
   struct candidate *candidates = alloca(ctx->nW * sizeof(struct candidate));
   unsigned j = 0;

   int i;
   BITSET_FOREACH_SET(i, ctx->W, ctx->n) {
      assert(j < ctx->nW);

      candidates[j++] = (struct candidate){
         .node = i,
         .dist = ctx->next_uses[i] - ctx->ip,
      };
   }

   /* Sort by next-use distance */
   util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);

   /* Evict what doesn't fit */
   unsigned new_weight = 0;

   for (i = 0; i < j; ++i) {
      unsigned v = candidates[i].node;
      unsigned comps = node_size(ctx, v);

      if ((new_weight + comps) <= m) {
         new_weight += comps;
      } else {
         /* Insert a spill if we haven't spilled before and there is
          * another use
          */
         if (!BITSET_TEST(ctx->S, v) && candidates[i].dist < DIST_INFINITY) {
            agx_builder b = agx_init_builder(ctx->shader, agx_before_instr(I));
            insert_spill(&b, ctx, v);
            BITSET_SET(ctx->S, v);
         }

         remove_W(ctx, v);

         /* We keep going in case we can pack in a scalar */
      }
   }
}

/*
 * Insert coupling code on block boundaries. This must ensure:
 *
 *    - anything live-in we expect to have spilled is spilled
 *    - anything live-in we expect to have filled is filled
 *    - phi sources are spilled if the destination is spilled
 *    - phi sources are filled if the destination is not spilled
 *
 * The latter two requirements ensure correct pressure calculations for phis.
 */
static ATTRIBUTE_NOINLINE void
insert_coupling_code(struct spill_ctx *ctx, agx_block *pred, agx_block *succ)
{
   struct spill_block *sp = spill_block(ctx, pred);
   struct spill_block *ss = spill_block(ctx, succ);

   agx_foreach_phi_in_block(succ, I) {
      if (!I->dest[0].memory)
         continue;

      agx_builder b =
         agx_init_builder(ctx->shader, agx_before_function(ctx->shader));

      unsigned s = agx_predecessor_index(succ, pred);

      /* Copy immediate/uniform phi sources to memory variables at the start of
       * the program, where pressure is zero and hence the copy is legal.
       */
      if (I->src[s].type != AGX_INDEX_NORMAL) {
         assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
                I->src[s].type == AGX_INDEX_UNIFORM);

         agx_index mem = agx_temp_like(ctx->shader, I->dest[0]);
         assert(mem.memory);

         agx_index gpr = agx_temp_like(ctx->shader, I->dest[0]);
         gpr.memory = false;

         if (I->src[s].type == AGX_INDEX_IMMEDIATE)
            agx_mov_imm_to(&b, gpr, I->src[s].value);
         else
            agx_mov_to(&b, gpr, I->src[s]);

         agx_mov_to(&b, mem, gpr);
         I->src[s] = mem;
         continue;
      }

      bool spilled = false;
      for (unsigned i = 0; i < sp->nS_exit; ++i) {
         if (sp->S_exit[i] == I->src[s].value) {
            spilled = true;
            break;
         }
      }

      if (!spilled) {
         /* Spill the phi source. TODO: avoid redundant spills here */
         agx_builder b =
            agx_init_builder(ctx->shader, agx_after_block_logical(pred));

         insert_spill(&b, ctx, I->src[s].value);
      }

      if (ctx->remat[I->src[s].value]) {
         unsigned node = I->src[s].value;
         agx_index idx = reconstruct_index(ctx, node);
         agx_index tmp = agx_temp_like(ctx->shader, idx);

         remat_to(&b, tmp, ctx, node);
         agx_mov_to(&b, agx_index_as_mem(idx, ctx->spill_base), tmp);
      }

      /* Use the spilled version */
      I->src[s] = agx_index_as_mem(I->src[s], ctx->spill_base);
   }

   /* Anything assumed to be spilled at the start of succ must be spilled along
    * all edges.
    */
   for (unsigned i = 0; i < ss->nS_entry; ++i) {
      unsigned v = ss->S_entry[i];

      bool spilled = false;
      for (unsigned j = 0; j < sp->nS_exit; ++j) {
         if (sp->S_exit[j] == v) {
            spilled = true;
            break;
         }
      }

      /* We handle spilling phi destinations separately */
      agx_foreach_phi_in_block(succ, phi) {
         if (chase_mem_index(phi->dest[0], ctx->spill_base) == v) {
            spilled = true;
            break;
         }
      }

      if (spilled)
         continue;

      agx_builder b = agx_init_builder(ctx->shader, agx_along_edge(pred, succ));
      insert_spill(&b, ctx, v);
   }

   /* Variables in W at the start of succ must be defined along the edge. */
   for (unsigned i = 0; i < ss->nW_entry; ++i) {
      unsigned node = ss->W_entry[i];
      bool defined = false;

      /* Variables live at the end of the predecessor are live along the edge */
      for (unsigned j = 0; j < sp->nW_exit; ++j) {
         if (sp->W_exit[j] == node) {
            defined = true;
            break;
         }
      }

      /* Phis are defined along the edge */
      agx_foreach_phi_in_block(succ, phi) {
         if (phi->dest[0].value == node) {
            defined = true;
            break;
         }
      }

      if (defined)
         continue;

      /* Otherwise, inserting a reload defines the variable along the edge */
      agx_block *reload_block = agx_edge_to_block(pred, succ);
      insert_reload(ctx, reload_block, agx_along_edge(pred, succ), node);
   }

   agx_foreach_phi_in_block(succ, I) {
      if (I->dest[0].memory)
         continue;

      unsigned s = agx_predecessor_index(succ, pred);

      /* Treat immediate/uniform phi sources as registers for pressure
       * accounting and phi lowering purposes. Parallel copy lowering can handle
       * a copy from an immediate/uniform to a register, but not from an
       * immediate/uniform directly to memory.
       */
      if (I->src[s].type != AGX_INDEX_NORMAL) {
         assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
                I->src[s].type == AGX_INDEX_UNIFORM);

         continue;
      }

      bool live = false;
      for (unsigned i = 0; i < sp->nW_exit; ++i) {
         if (sp->W_exit[i] == I->src[s].value) {
            live = true;
            break;
         }
      }

      /* Fill the phi source in the predecessor */
      if (!live) {
         agx_block *reload_block = agx_edge_to_block(pred, succ);
         insert_reload(ctx, reload_block, agx_along_edge(pred, succ),
                       I->src[s].value);
      }

      /* Leave as-is for the GPR version */
      assert(!I->src[s].memory);
   }
}

/*
 * Produce an array of next-use IPs relative to the start of the block. This is
 * an array of dist_t scalars, representing the next-use IP of each SSA dest
 * (right-to-left) and SSA source (left-to-right) of each non-phi instruction
 * in the block (bottom-to-top). Its size equals the total number of such SSA
 * destinations and sources in the block.
 */
static ATTRIBUTE_NOINLINE void
calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
{
   struct spill_block *sb = spill_block(ctx, ctx->block);
   unsigned ip = sb->cycles;

   util_dynarray_init(out, NULL);

   struct next_uses nu;
   init_next_uses(&nu, NULL);

   foreach_next_use(&sb->next_use_out, i, dist) {
      set_next_use(&nu, i, dist_sum(ip, dist));
   }

   agx_foreach_instr_in_block_rev(ctx->block, I) {
      ip -= instr_cycles(I);

      if (I->op != AGX_OPCODE_PHI) {
         agx_foreach_ssa_dest_rev(I, d) {
            unsigned v = I->dest[d].value;

            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
         }

         agx_foreach_ssa_src(I, s) {
            unsigned v = I->src[s].value;

            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
            set_next_use(&nu, v, ip);
         }
      }
   }

   assert(ip == 0 && "cycle counting is consistent");
   destroy_next_uses(&nu);
}

/*
 * Insert spills/fills for a single basic block, following Belady's algorithm.
 * Corresponds to minAlgorithm from the paper.
 */
static ATTRIBUTE_NOINLINE void
min_algorithm(struct spill_ctx *ctx)
{
   struct spill_block *sblock = spill_block(ctx, ctx->block);
   struct util_dynarray local_next_ip;
   calculate_local_next_use(ctx, &local_next_ip);

   /* next_uses gives the distance from the start of the block, so prepopulate
    * with next_use_in.
    */
   foreach_next_use(&sblock->next_use_in, key, dist) {
      assert(key < ctx->n);
      ctx->next_uses[key] = dist;
   }

   dist_t *next_ips = util_dynarray_element(&local_next_ip, dist_t, 0);
   unsigned next_use_cursor =
      util_dynarray_num_elements(&local_next_ip, dist_t);

   /* Iterate each instruction in forward order */
   agx_foreach_instr_in_block(ctx->block, I) {
      assert(ctx->nW <= ctx->k && "invariant");

      /* Phis are special since they happen along the edge. When we initialized
       * W and S, we implicitly chose which phis are spilled. So, here we just
       * need to rewrite the phis to write into memory.
       *
       * Phi sources are handled later.
       */
      if (I->op == AGX_OPCODE_PHI) {
         if (!BITSET_TEST(ctx->W, I->dest[0].value)) {
            I->dest[0] = agx_index_as_mem(I->dest[0], ctx->spill_base);
         }

         ctx->ip += instr_cycles(I);
         continue;
      }

      /* Any source that is not in W needs to be reloaded. Gather the set R of
       * such values.
       */
      unsigned R[AGX_MAX_NORMAL_SOURCES];
      unsigned nR = 0;

      agx_foreach_ssa_src(I, s) {
         unsigned node = I->src[s].value;
         if (BITSET_TEST(ctx->W, node))
            continue;

         /* Mark this variable as needing a reload. */
         assert(node < ctx->n);
         assert(BITSET_TEST(ctx->S, node) && "must have been spilled");
         assert(nR < ARRAY_SIZE(R) && "maximum source count");
         R[nR++] = node;

         /* The inserted reload will add the value to the register file. */
         insert_W(ctx, node);
      }

      /* Limit W to make space for the sources we just added */
      limit(ctx, I, ctx->k);

      /* Update next-use distances for this instruction. Unlike the paper, we
       * prune dead values from W as we go. This doesn't affect correctness, but
       * it speeds up limit() on average.
       */
      agx_foreach_ssa_src_rev(I, s) {
         assert(next_use_cursor >= 1);

         unsigned next_ip = next_ips[--next_use_cursor];
         assert((next_ip == DIST_INFINITY) == I->src[s].kill);

         if (next_ip == DIST_INFINITY)
            remove_W_if_present(ctx, I->src[s].value);
         else
            ctx->next_uses[I->src[s].value] = next_ip;
      }

      agx_foreach_ssa_dest(I, d) {
         assert(next_use_cursor >= 1);
         unsigned next_ip = next_ips[--next_use_cursor];

         if (next_ip == DIST_INFINITY)
            remove_W_if_present(ctx, I->dest[d].value);
         else
            ctx->next_uses[I->dest[d].value] = next_ip;
      }

      /* Count how many registers we need for destinations. Because of
       * SSA form, destinations are unique.
       */
      unsigned dest_size = 0;
      agx_foreach_ssa_dest(I, d) {
         dest_size += node_size(ctx, I->dest[d].value);
      }

      /* Limit W to make space for the destinations. */
      limit(ctx, I, ctx->k - dest_size);

      /* Destinations are now in the register file */
      agx_foreach_ssa_dest(I, d) {
         insert_W(ctx, I->dest[d].value);
      }

      /* Add reloads for the sources in front of the instruction */
      for (unsigned i = 0; i < nR; ++i) {
         insert_reload(ctx, ctx->block, agx_before_instr(I), R[i]);
      }

      ctx->ip += instr_cycles(I);
   }

   assert(next_use_cursor == 0 && "exactly sized");

   int i;
   BITSET_FOREACH_SET(i, ctx->W, ctx->n)
      sblock->W_exit[sblock->nW_exit++] = i;

   unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
   sblock->S_exit = ralloc_array(ctx->memctx, unsigned, nS);

   BITSET_FOREACH_SET(i, ctx->S, ctx->n)
      sblock->S_exit[sblock->nS_exit++] = i;

   assert(nS == sblock->nS_exit);
   util_dynarray_fini(&local_next_ip);
}

/*
 * TODO: Implement section 4.2 of the paper.
 *
 * For now, we implement the simpler heuristic in Hack's thesis: sort
 * the live-in set (+ destinations of phis) by next-use distance.
 */
static ATTRIBUTE_NOINLINE void
compute_w_entry_loop_header(struct spill_ctx *ctx)
{
   agx_block *block = ctx->block;
   struct spill_block *sb = spill_block(ctx, block);

   unsigned nP = __bitset_count(block->live_in, BITSET_WORDS(ctx->n));
   struct candidate *candidates = calloc(nP, sizeof(struct candidate));
   unsigned j = 0;

   foreach_next_use(&sb->next_use_in, i, dist) {
      assert(j < nP);
      candidates[j++] = (struct candidate){.node = i, .dist = dist};
   }

   assert(j == nP);

   /* Sort by next-use distance */
   util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);

   /* Take as much as we can */
   for (unsigned i = 0; i < j; ++i) {
      unsigned node = candidates[i].node;
      unsigned comps = node_size(ctx, node);

      if ((ctx->nW + comps) <= ctx->k) {
         insert_W(ctx, node);
         sb->W_entry[sb->nW_entry++] = node;
      }
   }

   assert(ctx->nW <= ctx->k);
   free(candidates);
}

/*
 * Compute W_entry for a block. Section 4.2 in the paper.
 */
static ATTRIBUTE_NOINLINE void
compute_w_entry(struct spill_ctx *ctx)
{
   agx_block *block = ctx->block;
   struct spill_block *sb = spill_block(ctx, block);

   /* Nothing to do for start blocks */
   if (agx_num_predecessors(block) == 0)
      return;

   /* Loop headers have a different heuristic */
   if (block->loop_header) {
      compute_w_entry_loop_header(ctx);
      return;
   }

   /* Usual blocks follow */
   unsigned *freq = calloc(ctx->n, sizeof(unsigned));

   /* Record what's written at the end of each predecessor */
   agx_foreach_predecessor(ctx->block, P) {
      struct spill_block *sp = spill_block(ctx, *P);

      for (unsigned i = 0; i < sp->nW_exit; ++i) {
         unsigned v = sp->W_exit[i];
         freq[v]++;
      }
   }

   struct candidate *candidates = calloc(ctx->n, sizeof(struct candidate));
   unsigned j = 0;

   /* Variables that are in all predecessors are assumed in W_entry. Phis and
    * variables in some predecessors are scored by next-use.
    */
   foreach_next_use(&sb->next_use_in, i, dist) {
      if (freq[i] == agx_num_predecessors(ctx->block)) {
         insert_W(ctx, i);
      } else if (freq[i]) {
         candidates[j++] = (struct candidate){.node = i, .dist = dist};
      }
   }

   agx_foreach_phi_in_block(ctx->block, I) {
      bool all_found = true;

      agx_foreach_predecessor(ctx->block, pred) {
         struct spill_block *sp = spill_block(ctx, *pred);
         bool found = false;

         agx_index src = I->src[agx_predecessor_index(ctx->block, *pred)];
         if (src.type != AGX_INDEX_NORMAL)
            continue;

         unsigned v = src.value;
         for (unsigned i = 0; i < sp->nW_exit; ++i) {
            if (sp->W_exit[i] == v) {
               found = true;
               break;
            }
         }

         all_found &= found;
      }

      /* Heuristic: if any phi source is spilled, spill the whole phi. This is
       * suboptimal, but it massively reduces pointless fill/spill chains with
       * massive phi webs.
       */
      if (!all_found)
         continue;

      candidates[j++] = (struct candidate){
         .node = I->dest[0].value,
         .dist = search_next_uses(&sb->next_use_in, I->dest[0].value),
      };
   }

   /* Sort by next-use distance */
   util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);

   /* Take as much as we can */
   for (unsigned i = 0; i < j; ++i) {
      unsigned node = candidates[i].node;
      unsigned comps = node_size(ctx, node);

      if ((ctx->nW + comps) <= ctx->k) {
         insert_W(ctx, node);
         sb->W_entry[sb->nW_entry++] = node;
      }
   }

   assert(ctx->nW <= ctx->k && "invariant");

   free(freq);
   free(candidates);
}

/*
 * We initialize S with the union of S at the exit of (forward edge)
 * predecessors and the complement of W, intersected with the live-in set. The
 * former propagates S forward. The latter ensures we spill along the edge when
 * a live value is not selected for the entry W.
 */
static ATTRIBUTE_NOINLINE void
compute_s_entry(struct spill_ctx *ctx)
{
   unsigned v;

   agx_foreach_predecessor(ctx->block, pred) {
      struct spill_block *sp = spill_block(ctx, *pred);

      for (unsigned i = 0; i < sp->nS_exit; ++i) {
         v = sp->S_exit[i];

         if (BITSET_TEST(ctx->block->live_in, v))
            BITSET_SET(ctx->S, v);
      }
   }

   BITSET_FOREACH_SET(v, ctx->block->live_in, ctx->n) {
      if (!BITSET_TEST(ctx->W, v))
         BITSET_SET(ctx->S, v);
   }

   /* Copy ctx->S to S_entry for later look-ups with coupling code */
   struct spill_block *sb = spill_block(ctx, ctx->block);
   unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
   sb->S_entry = ralloc_array(ctx->memctx, unsigned, nS);

   int i;
   BITSET_FOREACH_SET(i, ctx->S, ctx->n)
      sb->S_entry[sb->nS_entry++] = i;

   assert(sb->nS_entry == nS);
}

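/*
 * Compute the next-use distances at the entry and exit of every block with a
 * backwards data-flow fixed point over a worklist, joining across control flow
 * edges by taking the minimum distance.
 */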
static ATTRIBUTE_NOINLINE void
global_next_use_distances(agx_context *ctx, void *memctx,
                          struct spill_block *blocks)
{
   u_worklist worklist;
   u_worklist_init(&worklist, ctx->num_blocks, NULL);

   agx_foreach_block(ctx, block) {
      struct spill_block *sb = &blocks[block->index];

      init_next_uses(&sb->next_use_in, memctx);
      init_next_uses(&sb->next_use_out, memctx);

      agx_foreach_instr_in_block(block, I) {
         sb->cycles += instr_cycles(I);
      }

      agx_worklist_push_head(&worklist, block);
   }

   /* Definitions that have been seen */
   BITSET_WORD *defined =
      malloc(BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));

   struct next_uses dists;
   init_next_uses(&dists, NULL);

   /* Iterate the work list in reverse order since liveness is backwards */
   while (!u_worklist_is_empty(&worklist)) {
      agx_block *blk = agx_worklist_pop_head(&worklist);
      struct spill_block *sb = &blocks[blk->index];

      /* Definitions that have been seen */
      memset(defined, 0, BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));

      /* Initialize all distances to infinity */
      clear_next_uses(&dists);

      uint32_t cycle = 0;

      /* Calculate dists. Phis are handled separately. */
      agx_foreach_instr_in_block(blk, I) {
         if (I->op == AGX_OPCODE_PHI) {
            cycle++;
            continue;
         }

         /* Record first use before def. Phi sources are handled
          * separately below, because they logically happen in the
          * predecessor.
          */
         agx_foreach_ssa_src(I, s) {
            if (BITSET_TEST(defined, I->src[s].value))
               continue;
            if (search_next_uses(&dists, I->src[s].value) < DIST_INFINITY)
               continue;

            assert(I->src[s].value < ctx->alloc);
            set_next_use(&dists, I->src[s].value, cycle);
         }

         /* Record defs */
         agx_foreach_ssa_dest(I, d) {
            assert(I->dest[d].value < ctx->alloc);
            BITSET_SET(defined, I->dest[d].value);
         }

         cycle += instr_cycles(I);
      }

      /* Apply transfer function to get our entry state. */
      foreach_next_use(&sb->next_use_out, node, dist) {
         set_next_use(&sb->next_use_in, node, dist_sum(dist, sb->cycles));
      }

      foreach_next_use(&dists, node, dist) {
         set_next_use(&sb->next_use_in, node, dist);
      }

      int i;
      BITSET_FOREACH_SET(i, defined, ctx->alloc) {
         set_next_use(&sb->next_use_in, i, DIST_INFINITY);
      }

      /* Propagate the live in of the successor (blk) to the live out of
       * predecessors.
       *
       * Phi nodes are logically on the control flow edge and act in parallel.
       * To handle when propagating, we kill writes from phis and make live the
       * corresponding sources.
       */
      agx_foreach_predecessor(blk, pred) {
         struct spill_block *sp = &blocks[(*pred)->index];
         copy_next_uses(&dists, &sb->next_use_in);

         /* Kill write */
         agx_foreach_phi_in_block(blk, I) {
            assert(I->dest[0].type == AGX_INDEX_NORMAL);
            set_next_use(&dists, I->dest[0].value, DIST_INFINITY);
         }

         /* Make live the corresponding source */
         agx_foreach_phi_in_block(blk, I) {
            agx_index operand = I->src[agx_predecessor_index(blk, *pred)];
            if (operand.type == AGX_INDEX_NORMAL)
               set_next_use(&dists, operand.value, 0);
         }

         /* Join by taking minimum */
         if (minimum_next_uses(&sp->next_use_out, &dists))
            agx_worklist_push_tail(&worklist, *pred);
      }
   }

   free(defined);
   u_worklist_fini(&worklist);
   destroy_next_uses(&dists);
}

static ATTRIBUTE_NOINLINE void
validate_next_use_info(UNUSED agx_context *ctx,
                       UNUSED struct spill_block *blocks)
{
#ifndef NDEBUG
   int i;

   agx_foreach_block(ctx, blk) {
      struct spill_block *sb = &blocks[blk->index];

      /* Invariant: next-use distance is finite iff the node is live */
      BITSET_FOREACH_SET(i, blk->live_in, ctx->alloc)
         assert(search_next_uses(&sb->next_use_in, i) < DIST_INFINITY);

      BITSET_FOREACH_SET(i, blk->live_out, ctx->alloc)
         assert(search_next_uses(&sb->next_use_out, i) < DIST_INFINITY);

      foreach_next_use(&sb->next_use_in, i, _)
         assert(BITSET_TEST(blk->live_in, i));

      foreach_next_use(&sb->next_use_out, i, _)
         assert(BITSET_TEST(blk->live_out, i));
   }
#endif
}

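/*
 * Top-level spiller: limit register demand to k, after reserving the bottom
 * registers for memory-memory copies, by inserting spills and reloads. SSA is
 * repaired afterwards since the inserted code breaks it.
 */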
void
agx_spill(agx_context *ctx, unsigned k)
{
   void *memctx = ralloc_context(NULL);

   /* Reserve the bottom registers as temporaries for memory-memory swaps */
   ctx->has_spill_pcopy_reserved = true;
   k -= 8;

   uint8_t *channels = rzalloc_array(memctx, uint8_t, ctx->alloc);
   dist_t *next_uses = rzalloc_array(memctx, dist_t, ctx->alloc);
   enum agx_size *sizes = rzalloc_array(memctx, enum agx_size, ctx->alloc);
   agx_instr **remat = rzalloc_array(memctx, agx_instr *, ctx->alloc);

   agx_foreach_instr_global(ctx, I) {
      if (can_remat(I))
         remat[I->dest[0].value] = I;

      /* Measure vectors */
      agx_foreach_ssa_dest(I, d) {
         assert(sizes[I->dest[d].value] == 0 && "broken SSA");
         assert(channels[I->dest[d].value] == 0 && "broken SSA");

         sizes[I->dest[d].value] = I->dest[d].size;
         channels[I->dest[d].value] = agx_channels(I->dest[d]);
      }
   }

   struct spill_block *blocks =
      rzalloc_array(memctx, struct spill_block, ctx->num_blocks);

   /* Step 1. Compute global next-use distances */
   global_next_use_distances(ctx, memctx, blocks);
   validate_next_use_info(ctx, blocks);

   /* Reserve a memory variable for every regular variable */
   unsigned n = ctx->alloc;
   ctx->alloc *= 2;

   BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
   BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));

   agx_foreach_block(ctx, block) {
      memset(W, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
      memset(S, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));

      struct spill_ctx sctx = {
         .memctx = memctx,
         .shader = ctx,
         .n = n,
         .channels = channels,
         .size = sizes,
         .remat = remat,
         .next_uses = next_uses,
         .block = block,
         .blocks = blocks,
         .k = k,
         .W = W,
         .S = S,
         .spill_base = n,
      };

      compute_w_entry(&sctx);
      compute_s_entry(&sctx);
      min_algorithm(&sctx);
   }

   /* Now that all blocks are processed separately, stitch it together */
   agx_foreach_block(ctx, block) {
      struct spill_ctx sctx = {
         .memctx = memctx,
         .shader = ctx,
         .n = n,
         .channels = channels,
         .size = sizes,
         .remat = remat,
         .block = block,
         .blocks = blocks,
         .k = k,
         .W = W,
         .S = S,
         .spill_base = n,
      };

      agx_foreach_predecessor(block, pred) {
         /* After spilling phi sources, insert coupling code */
         insert_coupling_code(&sctx, *pred, block);
      }
   }

   ralloc_free(memctx);

   /* Spilling breaks SSA, so we need to repair before validating */
   agx_repair_ssa(ctx);
   agx_validate(ctx, "Spilling");

   /* Remat can introduce dead code */
   agx_dce(ctx, false);
}