/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/register_allocate.h"
#include "elk_vec4.h"
#include "elk_cfg.h"

using namespace elk;

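/* One register class per possible VGRF size in GRFs (1..REG_CLASS_COUNT);
 * elk_vec4_alloc_reg_set() asserts that this matches MAX_VGRF_SIZE().
 */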
#define REG_CLASS_COUNT 20

namespace elk {

static void
assign(unsigned int *reg_hw_locations, elk_backend_reg *reg)
{
   if (reg->file == VGRF) {
      reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
      reg->offset %= REG_SIZE;
   }
}

bool
vec4_visitor::reg_allocate_trivial()
{
   unsigned int hw_reg_mapping[this->alloc.count];
   bool virtual_grf_used[this->alloc.count];
   int next;

   /* Calculate which virtual GRFs are actually in use after whatever
    * optimization passes have occurred.
    */
   for (unsigned i = 0; i < this->alloc.count; i++) {
      virtual_grf_used[i] = false;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF)
         virtual_grf_used[inst->dst.nr] = true;

      for (unsigned i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF)
            virtual_grf_used[inst->src[i].nr] = true;
      }
   }

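   /* Pack the used VGRFs densely right after the payload.  Note that VGRF 0
    * is always given a location (and its size counted) even if it is unused.
    */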
   hw_reg_mapping[0] = this->first_non_payload_grf;
   next = hw_reg_mapping[0] + this->alloc.sizes[0];
   for (unsigned i = 1; i < this->alloc.count; i++) {
      if (virtual_grf_used[i]) {
         hw_reg_mapping[i] = next;
         next += this->alloc.sizes[i];
      }
   }
   prog_data->total_grf = next;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      assign(hw_reg_mapping, &inst->dst);
      assign(hw_reg_mapping, &inst->src[0]);
      assign(hw_reg_mapping, &inst->src[1]);
      assign(hw_reg_mapping, &inst->src[2]);
   }

   if (prog_data->total_grf > max_grf) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
           prog_data->total_grf, max_grf);
      return false;
   }

   return true;
}

extern "C" void
elk_vec4_alloc_reg_set(struct elk_compiler *compiler)
{
   int base_reg_count =
      compiler->devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF;

   assert(compiler->devinfo->ver < 8);

   /* After running split_virtual_grfs(), almost all VGRFs will be of size 1.
    * SEND-from-GRF sources cannot be split, so we also need classes for each
    * potential message length.
    */
   assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(compiler->devinfo));
   int class_sizes[REG_CLASS_COUNT];

   for (int i = 0; i < REG_CLASS_COUNT; i++)
      class_sizes[i] = i + 1;


   ralloc_free(compiler->vec4_reg_set.regs);
   compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, base_reg_count, false);
   if (compiler->devinfo->ver >= 6)
      ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
   ralloc_free(compiler->vec4_reg_set.classes);
   compiler->vec4_reg_set.classes = ralloc_array(compiler, struct ra_class *, REG_CLASS_COUNT);

   /* Now, add the registers to their classes, and add the conflicts
    * between them and the base GRF registers (and also each other).
    */
   for (int i = 0; i < REG_CLASS_COUNT; i++) {
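      /* A contiguous allocation of class_sizes[i] GRFs must fit entirely
       * below base_reg_count, so only the first
       * base_reg_count - (class_sizes[i] - 1) base registers can start one.
       */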
      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
      compiler->vec4_reg_set.classes[i] =
         ra_alloc_contig_reg_class(compiler->vec4_reg_set.regs, class_sizes[i]);

      for (int j = 0; j < class_reg_count; j++)
         ra_class_add_reg(compiler->vec4_reg_set.classes[i], j);
   }

   ra_set_finalize(compiler->vec4_reg_set.regs, NULL);
}

void
vec4_visitor::setup_payload_interference(struct ra_graph *g,
                                         int first_payload_node,
                                         int reg_node_count)
{
   int payload_node_count = this->first_non_payload_grf;

   for (int i = 0; i < payload_node_count; i++) {
      /* Mark each payload reg node as being allocated to its physical register.
       *
       * The alternative would be to have per-physical register classes, which
       * would just be silly.
       */
      ra_set_node_reg(g, first_payload_node + i, i);

      /* For now, just mark each payload node as interfering with every other
       * node to be allocated.
       */
      for (int j = 0; j < reg_node_count; j++) {
         ra_add_node_interference(g, first_payload_node + i, j);
      }
   }
}

bool
vec4_visitor::reg_allocate()
{
   unsigned int hw_reg_mapping[alloc.count];
   int payload_reg_count = this->first_non_payload_grf;

   /* Using the trivial allocator can be useful in debugging undefined
    * register access as a result of broken optimization passes.
    */
   if (0)
      return reg_allocate_trivial();

   assert(devinfo->ver < 8);

   const vec4_live_variables &live = live_analysis.require();
   int node_count = alloc.count;
   int first_payload_node = node_count;
   node_count += payload_reg_count;
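   /* Nodes [0, alloc.count) represent the VGRFs; nodes
    * [first_payload_node, node_count) represent the fixed payload GRFs.
    */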
   struct ra_graph *g =
      ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count);

   for (unsigned i = 0; i < alloc.count; i++) {
      int size = this->alloc.sizes[i];
      assert(size >= 1 && size <= MAX_VGRF_SIZE(devinfo));
      ra_set_node_class(g, i, compiler->vec4_reg_set.classes[size - 1]);

      for (unsigned j = 0; j < i; j++) {
         if (live.vgrfs_interfere(i, j)) {
            ra_add_node_interference(g, i, j);
         }
      }
   }

   /* Certain instructions can't safely use the same register for their
    * sources and destination.  Add interference.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
         for (unsigned i = 0; i < 3; i++) {
            if (inst->src[i].file == VGRF) {
               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
            }
         }
      }
   }

   setup_payload_interference(g, first_payload_node, node_count);

   if (!ra_allocate(g)) {
      /* Failed to allocate registers.  Spill a reg, and the caller will
       * loop back into here to try again.
       */
      int reg = choose_spill_reg(g);
      if (this->no_spills) {
         fail("Failure to register allocate.  Reduce number of live "
              "values to avoid this.");
      } else if (reg == -1) {
         fail("no register to spill\n");
      } else {
         spill_reg(reg);
      }
      ralloc_free(g);
      return false;
   }

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
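   /* The payload always occupies the bottom of the GRF file, so start the
    * GRF count there and grow it as higher allocations are seen.
    */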
   prog_data->total_grf = payload_reg_count;
   for (unsigned i = 0; i < alloc.count; i++) {
      hw_reg_mapping[i] = ra_get_node_reg(g, i);
      prog_data->total_grf = MAX2(prog_data->total_grf,
                                  hw_reg_mapping[i] + alloc.sizes[i]);
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      assign(hw_reg_mapping, &inst->dst);
      assign(hw_reg_mapping, &inst->src[0]);
      assign(hw_reg_mapping, &inst->src[1]);
      assign(hw_reg_mapping, &inst->src[2]);
   }

   ralloc_free(g);

   return true;
}

/**
 * When we decide to spill a register, instead of blindly spilling every use,
 * save unspills when the spill register is used (read) in consecutive
 * instructions. This can potentially save a bunch of unspills that would
 * have very little impact in register allocation anyway.
 *
 * Notice that we need to account for this behavior when spilling a register
 * and when evaluating spilling costs. This function is designed so it can
 * be called from both places and avoid repeating the logic.
 *
 *  - When we call this function from spill_reg(), we pass, in scratch_reg,
 *    the actual unspill/spill register that we want to reuse in the current
 *    instruction.
 *
 *  - When we call this from evaluate_spill_costs(), we pass the register for
 *    which we are evaluating spilling costs.
 *
 * In either case, we check if the previous instructions read scratch_reg until
 * we find one that writes to it with a compatible mask or does not read/write
 * scratch_reg at all.
 */
static bool
can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
                           unsigned scratch_reg)
{
   assert(inst->src[i].file == VGRF);
   bool prev_inst_read_scratch_reg = false;

   /* See if any previous source in the same instruction reads scratch_reg */
   for (unsigned n = 0; n < i; n++) {
      if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg)
         prev_inst_read_scratch_reg = true;
   }

   /* Now check if previous instructions read/write scratch_reg */
   for (vec4_instruction *prev_inst = (vec4_instruction *) inst->prev;
        !prev_inst->is_head_sentinel();
        prev_inst = (vec4_instruction *) prev_inst->prev) {

      /* If the previous instruction writes to scratch_reg then we can reuse
       * it if the write is not conditional and the channels we write are
       * compatible with our read mask
       */
      if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) {
         return (!prev_inst->predicate || prev_inst->opcode == ELK_OPCODE_SEL) &&
                (elk_mask_for_swizzle(inst->src[i].swizzle) &
                 ~prev_inst->dst.writemask) == 0;
      }

      /* Skip scratch read/writes so that instructions generated by spilling
       * other registers (that won't read/write scratch_reg) do not stop us from
       * reusing scratch_reg for this instruction.
       */
      if (prev_inst->opcode == ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE ||
          prev_inst->opcode == ELK_SHADER_OPCODE_GFX4_SCRATCH_READ)
         continue;

      /* If the previous instruction does not write to scratch_reg, then check
       * if it reads it
       */
      int n;
      for (n = 0; n < 3; n++) {
         if (prev_inst->src[n].file == VGRF &&
             prev_inst->src[n].nr == scratch_reg) {
            prev_inst_read_scratch_reg = true;
            break;
         }
      }
      if (n == 3) {
         /* The previous instruction does not read scratch_reg. At this point,
          * if no previous instruction has read scratch_reg it means that we
          * will need to unspill it here and we can't reuse it (so we return
          * false). Otherwise, if we found at least one consecutive instruction
          * that read scratch_reg, then we know that we got here from
          * evaluate_spill_costs (since for the spill_reg path any block of
          * consecutive instructions using scratch_reg must start with a write
          * to that register, so we would've exited the loop in the check for
          * the write that we have at the start of this loop), and in that case
          * it means that we found the point at which the scratch_reg would be
          * unspilled. Since we always unspill a full vec4, it means that we
          * have all the channels available and we can just return true to
          * signal that we can reuse the register in the current instruction
          * too.
          */
         return prev_inst_read_scratch_reg;
      }
   }

   return prev_inst_read_scratch_reg;
}

static inline float
spill_cost_for_type(enum elk_reg_type type)
{
   /* Spilling of a 64-bit register involves emitting 2 32-bit scratch
    * messages plus the 64b/32b shuffling code.
    */
   return type_sz(type) == 8 ? 2.25f : 1.0f;
}

void
vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
{
   float loop_scale = 1.0;

   unsigned *reg_type_size = (unsigned *)
      ralloc_size(NULL, this->alloc.count * sizeof(unsigned));

   for (unsigned i = 0; i < this->alloc.count; i++) {
      spill_costs[i] = 0.0;
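      /* Only size-1 and size-2 VGRFs can be spilled; spill_reg() asserts
       * the same constraint.
       */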
      no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2;
      reg_type_size[i] = 0;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) {
            /* We will only unspill src[i] if it wasn't unspilled for the
             * previous instruction, in which case we'll just reuse the scratch
             * reg for this instruction.
             */
            if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
               spill_costs[inst->src[i].nr] +=
                  loop_scale * spill_cost_for_type(inst->src[i].type);
               if (inst->src[i].reladdr ||
                   inst->src[i].offset >= REG_SIZE)
                  no_spill[inst->src[i].nr] = true;

               /* We don't support unspills of partial DF reads.
                *
                * Our 64-bit unspills are implemented with two 32-bit scratch
                * messages, each one reading data for both SIMD4x2 threads,
                * which we then need to shuffle into correct 64-bit data.
                * Ensure that we are reading data for both threads.
                */
               if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
                  no_spill[inst->src[i].nr] = true;
            }

            /* We can't spill registers that mix 32-bit and 64-bit access (that
             * contain 64-bit data that is operated on via 32-bit instructions)
             */
            unsigned type_size = type_sz(inst->src[i].type);
            if (reg_type_size[inst->src[i].nr] == 0)
               reg_type_size[inst->src[i].nr] = type_size;
            else if (reg_type_size[inst->src[i].nr] != type_size)
               no_spill[inst->src[i].nr] = true;
         }
      }

      if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
         spill_costs[inst->dst.nr] +=
            loop_scale * spill_cost_for_type(inst->dst.type);
         if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
            no_spill[inst->dst.nr] = true;

         /* We don't support spills of partial DF writes.
          *
          * Our 64-bit spills are implemented with two 32-bit scratch messages,
          * each one writing data for both SIMD4x2 threads. Ensure that we
          * are writing data for both threads.
          */
         if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
            no_spill[inst->dst.nr] = true;

         /* We can't spill registers that mix 32-bit and 64-bit access (that
          * contain 64-bit data that is operated on via 32-bit instructions)
          */
         unsigned type_size = type_sz(inst->dst.type);
         if (reg_type_size[inst->dst.nr] == 0)
            reg_type_size[inst->dst.nr] = type_size;
         else if (reg_type_size[inst->dst.nr] != type_size)
            no_spill[inst->dst.nr] = true;
      }

      switch (inst->opcode) {

      case ELK_OPCODE_DO:
         loop_scale *= 10;
         break;

      case ELK_OPCODE_WHILE:
         loop_scale /= 10;
         break;

      case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
      case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
      case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == VGRF)
               no_spill[inst->src[i].nr] = true;
         }
         if (inst->dst.file == VGRF)
            no_spill[inst->dst.nr] = true;
         break;

      default:
         break;
      }
   }

   ralloc_free(reg_type_size);
}

int
vec4_visitor::choose_spill_reg(struct ra_graph *g)
{
   float spill_costs[this->alloc.count];
   bool no_spill[this->alloc.count];

   evaluate_spill_costs(spill_costs, no_spill);

   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (!no_spill[i])
         ra_set_node_spill_cost(g, i, spill_costs[i]);
   }

   return ra_get_best_spill_node(g);
}

void
vec4_visitor::spill_reg(unsigned spill_reg_nr)
{
   assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2);
   unsigned spill_offset = last_scratch;
   last_scratch += alloc.sizes[spill_reg_nr];

   /* Generate spill/unspill instructions for the objects being spilled. */
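   /* scratch_reg caches the VGRF holding the most recently unspilled/spilled
    * copy of spill_reg_nr; ~0u means no such copy is available yet.
    */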
   unsigned scratch_reg = ~0u;
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (unsigned i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
            if (scratch_reg == ~0u ||
                !can_use_scratch_for_source(inst, i, scratch_reg)) {
               /* We need to unspill anyway so make sure we read the full vec4
                * in any case. This way, the cached register can be reused
                * for consecutive instructions that read different channels of
                * the same vec4.
                */
               scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]);
               src_reg temp = inst->src[i];
               temp.nr = scratch_reg;
               temp.offset = 0;
               temp.swizzle = ELK_SWIZZLE_XYZW;
               emit_scratch_read(block, inst,
                                 dst_reg(temp), inst->src[i], spill_offset);
               temp.offset = inst->src[i].offset;
            }
            assert(scratch_reg != ~0u);
            inst->src[i].nr = scratch_reg;
         }
      }

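      /* After emit_scratch_write(), inst->dst names the temporary that holds
       * the spilled value, so record it for reuse by later reads.
       */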
      if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
         emit_scratch_write(block, inst, spill_offset);
         scratch_reg = inst->dst.nr;
      }
   }

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
}

} /* namespace elk */