xref: /aosp_15_r20/external/mesa3d/src/compiler/nir/nir_opt_loop_unroll.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "nir.h"
25 #include "nir_builder.h"
26 #include "nir_control_flow.h"
27 #include "nir_loop_analyze.h"
28 
29 /* This limit is chosen fairly arbitrarily.  GLSL IR max iteration is 32
30  * instructions. (Multiply counting nodes and magic number 5.)  But there is
31  * no 1:1 mapping between GLSL IR and NIR so 25 was picked because it seemed
32  * to give about the same results. Around 5 instructions per node.  But some
33  * loops that would unroll with GLSL IR fail to unroll if we set this to 25 so
34  * we set it to 26.
35  */
36 #define LOOP_UNROLL_LIMIT 26
37 
38 /* Prepare this loop for unrolling by first converting to lcssa and then
39  * converting the phis from the top level of the loop body to regs.
40  * Partially converting out of SSA allows us to unroll the loop without having
41  * to keep track of and update phis along the way which gets tricky and
42  * doesn't add much value over converting to regs.
43  *
44  * The loop may have a jump instruction at the end of the loop which does
45  * nothing.  Once we're out of SSA, we can safely delete it so we don't have
46  * to deal with it later.
47  */
48 static void
loop_prepare_for_unroll(nir_loop * loop)49 loop_prepare_for_unroll(nir_loop *loop)
50 {
51    nir_rematerialize_derefs_in_use_blocks_impl(
52       nir_cf_node_get_function(&loop->cf_node));
53 
54    nir_convert_loop_to_lcssa(loop);
55 
56    /* Lower phis at the top level of the loop body */
57    foreach_list_typed_safe(nir_cf_node, node, node, &loop->body) {
58       if (nir_cf_node_block == node->type) {
59          nir_lower_phis_to_regs_block(nir_cf_node_as_block(node));
60       }
61    }
62 
63    /* Lower phis after the loop */
64    nir_block *block_after_loop =
65       nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node));
66 
67    nir_lower_phis_to_regs_block(block_after_loop);
68 
69    /* Remove jump if it's the last instruction in the loop */
70    nir_instr *last_instr = nir_block_last_instr(nir_loop_last_block(loop));
71    if (last_instr && last_instr->type == nir_instr_type_jump) {
72       nir_instr_remove(last_instr);
73    }
74 }
75 
76 static void
get_first_blocks_in_terminator(nir_loop_terminator * term,nir_block ** first_break_block,nir_block ** first_continue_block)77 get_first_blocks_in_terminator(nir_loop_terminator *term,
78                                nir_block **first_break_block,
79                                nir_block **first_continue_block)
80 {
81    if (term->continue_from_then) {
82       *first_continue_block = nir_if_first_then_block(term->nif);
83       *first_break_block = nir_if_first_else_block(term->nif);
84    } else {
85       *first_continue_block = nir_if_first_else_block(term->nif);
86       *first_break_block = nir_if_first_then_block(term->nif);
87    }
88 }
89 
90 /**
91  * Unroll a loop where we know exactly how many iterations there are and there
92  * is only a single exit point.  Note here we can unroll loops with multiple
93  * theoretical exits that only have a single terminating exit that we always
94  * know is the "real" exit.
95  *
96  *     loop {
97  *         ...instrs...
98  *     }
99  *
100  * And the iteration count is 3, the output will be:
101  *
102  *     ...instrs... ...instrs... ...instrs...
103  */
104 static void
simple_unroll(nir_loop * loop)105 simple_unroll(nir_loop *loop)
106 {
107    nir_loop_terminator *limiting_term = loop->info->limiting_terminator;
108    assert(nir_is_trivial_loop_if(limiting_term->nif,
109                                  limiting_term->break_block));
110 
111    loop_prepare_for_unroll(loop);
112 
113    /* Skip over loop terminator and get the loop body. */
114    list_for_each_entry(nir_loop_terminator, terminator,
115                        &loop->info->loop_terminator_list,
116                        loop_terminator_link) {
117 
118       /* Remove all but the limiting terminator as we know the other exit
119        * conditions can never be met. Note we need to extract any instructions
120        * in the continue from branch and insert then into the loop body before
121        * removing it.
122        */
123       if (terminator->nif != limiting_term->nif) {
124          nir_block *first_break_block;
125          nir_block *first_continue_block;
126          get_first_blocks_in_terminator(terminator, &first_break_block,
127                                         &first_continue_block);
128 
129          assert(nir_is_trivial_loop_if(terminator->nif,
130                                        terminator->break_block));
131 
132          nir_cf_list continue_from_lst;
133          nir_cf_extract(&continue_from_lst,
134                         nir_before_block(first_continue_block),
135                         nir_after_block(terminator->continue_from_block));
136          nir_cf_reinsert(&continue_from_lst,
137                          nir_after_cf_node(&terminator->nif->cf_node));
138 
139          nir_cf_node_remove(&terminator->nif->cf_node);
140       }
141    }
142 
143    nir_block *first_break_block;
144    nir_block *first_continue_block;
145    get_first_blocks_in_terminator(limiting_term, &first_break_block,
146                                   &first_continue_block);
147 
148    /* Pluck out the loop header */
149    nir_block *header_blk = nir_loop_first_block(loop);
150    nir_cf_list lp_header;
151    nir_cf_extract(&lp_header, nir_before_block(header_blk),
152                   nir_before_cf_node(&limiting_term->nif->cf_node));
153 
154    /* Add the continue from block of the limiting terminator to the loop body
155     */
156    nir_cf_list continue_from_lst;
157    nir_cf_extract(&continue_from_lst, nir_before_block(first_continue_block),
158                   nir_after_block(limiting_term->continue_from_block));
159    nir_cf_reinsert(&continue_from_lst,
160                    nir_after_cf_node(&limiting_term->nif->cf_node));
161 
162    /* Pluck out the loop body */
163    nir_cf_list loop_body;
164    nir_cf_extract(&loop_body, nir_after_cf_node(&limiting_term->nif->cf_node),
165                   nir_after_block(nir_loop_last_block(loop)));
166 
167    struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL);
168 
169    /* Clone the loop header and insert before the loop */
170    nir_cf_list_clone_and_reinsert(&lp_header, loop->cf_node.parent,
171                                   nir_before_cf_node(&loop->cf_node),
172                                   remap_table);
173 
174    for (unsigned i = 0; i < loop->info->max_trip_count; i++) {
175       /* Clone loop body and insert before the loop */
176       nir_cf_list_clone_and_reinsert(&loop_body, loop->cf_node.parent,
177                                      nir_before_cf_node(&loop->cf_node),
178                                      remap_table);
179 
180       /* Clone loop header and insert after loop body */
181       nir_cf_list_clone_and_reinsert(&lp_header, loop->cf_node.parent,
182                                      nir_before_cf_node(&loop->cf_node),
183                                      remap_table);
184    }
185 
186    /* Remove the break from the loop terminator and add instructions from
187     * the break block after the unrolled loop.
188     */
189    nir_instr *break_instr = nir_block_last_instr(limiting_term->break_block);
190    nir_instr_remove(break_instr);
191    nir_cf_list break_list;
192    nir_cf_extract(&break_list, nir_before_block(first_break_block),
193                   nir_after_block(limiting_term->break_block));
194 
195    /* Clone so things get properly remapped */
196    nir_cf_list_clone_and_reinsert(&break_list, loop->cf_node.parent,
197                                   nir_before_cf_node(&loop->cf_node),
198                                   remap_table);
199 
200    /* Remove the loop */
201    nir_cf_node_remove(&loop->cf_node);
202 
203    /* Delete the original loop body, break block & header */
204    nir_cf_delete(&lp_header);
205    nir_cf_delete(&loop_body);
206    nir_cf_delete(&break_list);
207 
208    _mesa_hash_table_destroy(remap_table, NULL);
209 }
210 
211 static void
move_cf_list_into_loop_term(nir_cf_list * lst,nir_loop_terminator * term)212 move_cf_list_into_loop_term(nir_cf_list *lst, nir_loop_terminator *term)
213 {
214    /* Move the rest of the loop inside the continue-from-block */
215    nir_cf_reinsert(lst, nir_after_block(term->continue_from_block));
216 
217    /* Remove the break */
218    nir_instr_remove(nir_block_last_instr(term->break_block));
219 }
220 
221 static nir_cursor
get_complex_unroll_insert_location(nir_cf_node * node,bool continue_from_then)222 get_complex_unroll_insert_location(nir_cf_node *node, bool continue_from_then)
223 {
224    if (node->type == nir_cf_node_loop) {
225       return nir_before_cf_node(node);
226    } else {
227       nir_if *if_stmt = nir_cf_node_as_if(node);
228       if (continue_from_then) {
229          return nir_after_block(nir_if_last_then_block(if_stmt));
230       } else {
231          return nir_after_block(nir_if_last_else_block(if_stmt));
232       }
233    }
234 }
235 
236 static nir_cf_node *
complex_unroll_loop_body(nir_loop * loop,nir_loop_terminator * unlimit_term,nir_cf_list * lp_header,nir_cf_list * lp_body,struct hash_table * remap_table,unsigned num_times_to_clone)237 complex_unroll_loop_body(nir_loop *loop, nir_loop_terminator *unlimit_term,
238                          nir_cf_list *lp_header, nir_cf_list *lp_body,
239                          struct hash_table *remap_table,
240                          unsigned num_times_to_clone)
241 {
242    /* In the terminator that we have no trip count for move everything after
243     * the terminator into the continue from branch.
244     */
245    nir_cf_list loop_end;
246    nir_cf_extract(&loop_end, nir_after_cf_node(&unlimit_term->nif->cf_node),
247                   nir_after_block(nir_loop_last_block(loop)));
248    move_cf_list_into_loop_term(&loop_end, unlimit_term);
249 
250    /* Pluck out the loop body. */
251    nir_cf_extract(lp_body, nir_before_block(nir_loop_first_block(loop)),
252                   nir_after_block(nir_loop_last_block(loop)));
253 
254    /* Set unroll_loc to the loop as we will insert the unrolled loop before it
255     */
256    nir_cf_node *unroll_loc = &loop->cf_node;
257 
258    /* Temp list to store the cloned loop as we unroll */
259    nir_cf_list unrolled_lp_body;
260 
261    for (unsigned i = 0; i < num_times_to_clone; i++) {
262 
263       nir_cursor cursor =
264          get_complex_unroll_insert_location(unroll_loc,
265                                             unlimit_term->continue_from_then);
266 
267       /* Clone loop header and insert in if branch */
268       nir_cf_list_clone_and_reinsert(lp_header, loop->cf_node.parent,
269                                      cursor, remap_table);
270 
271       cursor =
272          get_complex_unroll_insert_location(unroll_loc,
273                                             unlimit_term->continue_from_then);
274 
275       /* Clone loop body */
276       nir_cf_list_clone(&unrolled_lp_body, lp_body, loop->cf_node.parent,
277                         remap_table);
278 
279       unroll_loc = exec_node_data(nir_cf_node,
280                                   exec_list_get_tail(&unrolled_lp_body.list),
281                                   node);
282       assert(unroll_loc->type == nir_cf_node_block &&
283              exec_list_is_empty(&nir_cf_node_as_block(unroll_loc)->instr_list));
284 
285       /* Get the unrolled if node */
286       unroll_loc = nir_cf_node_prev(unroll_loc);
287 
288       /* Insert unrolled loop body */
289       nir_cf_reinsert(&unrolled_lp_body, cursor);
290    }
291 
292    return unroll_loc;
293 }
294 
295 /**
296  * Unroll a loop with two exists when the trip count of one of the exits is
297  * unknown.  If continue_from_then is true, the loop is repeated only when the
298  * "then" branch of the if is taken; otherwise it is repeated only
299  * when the "else" branch of the if is taken.
300  *
301  * For example, if the input is:
302  *
303  *      loop {
304  *         ...phis/condition...
305  *         if condition {
306  *            ...then instructions...
307  *         } else {
308  *            ...continue instructions...
309  *            break
310  *         }
311  *         ...body...
312  *      }
313  *
314  * And the iteration count is 3, and unlimit_term->continue_from_then is true,
315  * then the output will be:
316  *
317  *      ...condition...
318  *      if condition {
319  *         ...then instructions...
320  *         ...body...
321  *         if condition {
322  *            ...then instructions...
323  *            ...body...
324  *            if condition {
325  *               ...then instructions...
326  *               ...body...
327  *            } else {
328  *               ...continue instructions...
329  *            }
330  *         } else {
331  *            ...continue instructions...
332  *         }
333  *      } else {
334  *         ...continue instructions...
335  *      }
336  */
337 static void
complex_unroll(nir_loop * loop,nir_loop_terminator * unlimit_term,bool limiting_term_second)338 complex_unroll(nir_loop *loop, nir_loop_terminator *unlimit_term,
339                bool limiting_term_second)
340 {
341    assert(nir_is_trivial_loop_if(unlimit_term->nif,
342                                  unlimit_term->break_block));
343 
344    nir_loop_terminator *limiting_term = loop->info->limiting_terminator;
345    assert(nir_is_trivial_loop_if(limiting_term->nif,
346                                  limiting_term->break_block));
347 
348    loop_prepare_for_unroll(loop);
349 
350    nir_block *header_blk = nir_loop_first_block(loop);
351 
352    nir_cf_list lp_header;
353    nir_cf_list limit_break_list;
354    unsigned num_times_to_clone;
355    if (limiting_term_second) {
356       /* Pluck out the loop header */
357       nir_cf_extract(&lp_header, nir_before_block(header_blk),
358                      nir_before_cf_node(&unlimit_term->nif->cf_node));
359 
360       /* We need some special handling when its the second terminator causing
361        * us to exit the loop for example:
362        *
363        *   for (int i = 0; i < uniform_lp_count; i++) {
364        *      colour = vec4(0.0, 1.0, 0.0, 1.0);
365        *
366        *      if (i == 1) {
367        *         break;
368        *      }
369        *      ... any further code is unreachable after i == 1 ...
370        *   }
371        */
372       nir_cf_list after_lt;
373       nir_if *limit_if = limiting_term->nif;
374       nir_cf_extract(&after_lt, nir_after_cf_node(&limit_if->cf_node),
375                      nir_after_block(nir_loop_last_block(loop)));
376       move_cf_list_into_loop_term(&after_lt, limiting_term);
377 
378       /* Because the trip count is the number of times we pass over the entire
379        * loop before hitting a break when the second terminator is the
380        * limiting terminator we can actually execute code inside the loop when
381        * trip count == 0 e.g. the code above the break.  So we need to bump
382        * the trip_count in order for the code below to clone anything.  When
383        * trip count == 1 we execute the code above the break twice and the
384        * code below it once so we need clone things twice and so on.
385        */
386       num_times_to_clone = loop->info->max_trip_count + 1;
387    } else {
388       /* Pluck out the loop header */
389       nir_cf_extract(&lp_header, nir_before_block(header_blk),
390                      nir_before_cf_node(&limiting_term->nif->cf_node));
391 
392       nir_block *first_break_block;
393       nir_block *first_continue_block;
394       get_first_blocks_in_terminator(limiting_term, &first_break_block,
395                                      &first_continue_block);
396 
397       /* Remove the break then extract instructions from the break block so we
398        * can insert them in the innermost else of the unrolled loop.
399        */
400       nir_instr *break_instr = nir_block_last_instr(limiting_term->break_block);
401       nir_instr_remove(break_instr);
402       nir_cf_extract(&limit_break_list, nir_before_block(first_break_block),
403                      nir_after_block(limiting_term->break_block));
404 
405       nir_cf_list continue_list;
406       nir_cf_extract(&continue_list, nir_before_block(first_continue_block),
407                      nir_after_block(limiting_term->continue_from_block));
408 
409       nir_cf_reinsert(&continue_list,
410                       nir_after_cf_node(&limiting_term->nif->cf_node));
411 
412       nir_cf_node_remove(&limiting_term->nif->cf_node);
413 
414       num_times_to_clone = loop->info->max_trip_count;
415    }
416 
417    struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL);
418 
419    nir_cf_list lp_body;
420    nir_cf_node *unroll_loc =
421       complex_unroll_loop_body(loop, unlimit_term, &lp_header, &lp_body,
422                                remap_table, num_times_to_clone);
423 
424    if (!limiting_term_second) {
425       assert(unroll_loc->type == nir_cf_node_if);
426 
427       nir_cursor cursor =
428          get_complex_unroll_insert_location(unroll_loc,
429                                             unlimit_term->continue_from_then);
430 
431       /* Clone loop header and insert in if branch */
432       nir_cf_list_clone_and_reinsert(&lp_header, loop->cf_node.parent,
433                                      cursor, remap_table);
434 
435       cursor =
436          get_complex_unroll_insert_location(unroll_loc,
437                                             unlimit_term->continue_from_then);
438 
439       /* Clone so things get properly remapped, and insert break block from
440        * the limiting terminator.
441        */
442       nir_cf_list_clone_and_reinsert(&limit_break_list, loop->cf_node.parent,
443                                      cursor, remap_table);
444 
445       nir_cf_delete(&limit_break_list);
446    }
447 
448    /* The loop has been unrolled so remove it. */
449    nir_cf_node_remove(&loop->cf_node);
450 
451    /* Delete the original loop header and body */
452    nir_cf_delete(&lp_header);
453    nir_cf_delete(&lp_body);
454 
455    _mesa_hash_table_destroy(remap_table, NULL);
456 }
457 
458 /**
459  * Unroll loops where we only have a single terminator but the exact trip
460  * count is unknown. For example:
461  *
462  *    for (int i = 0; i < imin(x, 4); i++)
463  *       ...
464  */
465 static void
complex_unroll_single_terminator(nir_loop * loop)466 complex_unroll_single_terminator(nir_loop *loop)
467 {
468    assert(list_is_singular(&loop->info->loop_terminator_list));
469    assert(loop->info->limiting_terminator);
470    assert(nir_is_trivial_loop_if(loop->info->limiting_terminator->nif,
471                                  loop->info->limiting_terminator->break_block));
472 
473    nir_loop_terminator *terminator = loop->info->limiting_terminator;
474 
475    loop_prepare_for_unroll(loop);
476 
477    /* Pluck out the loop header */
478    nir_cf_list lp_header;
479    nir_cf_extract(&lp_header, nir_before_block(nir_loop_first_block(loop)),
480                   nir_before_cf_node(&terminator->nif->cf_node));
481 
482    struct hash_table *remap_table =
483       _mesa_hash_table_create(NULL, _mesa_hash_pointer,
484                               _mesa_key_pointer_equal);
485 
486    /* We need to clone the loop one extra time in order to clone the lcssa
487     * vars for the last iteration (they are inside the following ifs break
488     * branch). We leave other passes to clean up this redundant if.
489     */
490    unsigned num_times_to_clone = loop->info->max_trip_count + 1;
491 
492    nir_cf_list lp_body;
493    UNUSED nir_cf_node *unroll_loc =
494       complex_unroll_loop_body(loop, terminator, &lp_header, &lp_body,
495                                remap_table, num_times_to_clone);
496 
497    assert(unroll_loc->type == nir_cf_node_if);
498 
499    /* We need to clone the lcssa vars in order to insert them on both sides
500     * of the if in the last iteration/if-statement. Otherwise the optimisation
501     * passes will have trouble optimising the unrolled if ladder.
502     */
503    nir_cursor cursor =
504       get_complex_unroll_insert_location(unroll_loc,
505                                          terminator->continue_from_then);
506 
507    nir_if *if_stmt = nir_cf_node_as_if(unroll_loc);
508    nir_cursor start_cursor;
509    nir_cursor end_cursor;
510    if (terminator->continue_from_then) {
511       start_cursor = nir_before_block(nir_if_first_else_block(if_stmt));
512       end_cursor = nir_after_block(nir_if_last_else_block(if_stmt));
513    } else {
514       start_cursor = nir_before_block(nir_if_first_then_block(if_stmt));
515       end_cursor = nir_after_block(nir_if_last_then_block(if_stmt));
516    }
517 
518    nir_cf_list lcssa_list;
519    nir_cf_extract(&lcssa_list, start_cursor, end_cursor);
520 
521    /* Insert the cloned vars in the last continue branch */
522    nir_cf_list_clone_and_reinsert(&lcssa_list, loop->cf_node.parent,
523                                   cursor, remap_table);
524 
525    start_cursor = terminator->continue_from_then ? nir_before_block(nir_if_first_else_block(if_stmt)) : nir_before_block(nir_if_first_then_block(if_stmt));
526 
527    /* Reinsert the cloned vars back where they came from */
528    nir_cf_reinsert(&lcssa_list, start_cursor);
529 
530    /* Delete the original loop header and body */
531    nir_cf_delete(&lp_header);
532    nir_cf_delete(&lp_body);
533 
534    /* The original loop has been replaced so remove it. */
535    nir_cf_node_remove(&loop->cf_node);
536 
537    _mesa_hash_table_destroy(remap_table, NULL);
538 }
539 
540 /* Unrolls the classic wrapper loops e.g
541  *
542  *    do {
543  *        // ...
544  *    } while (false)
545  */
546 static bool
wrapper_unroll(nir_loop * loop)547 wrapper_unroll(nir_loop *loop)
548 {
549    if (!list_is_empty(&loop->info->loop_terminator_list)) {
550 
551       /* Unrolling a loop with a large number of exits can result in a
552        * large inrease in register pressure. For now we just skip
553        * unrolling if we have more than 3 exits (not including the break
554        * at the end of the loop).
555        *
556        * TODO: Most loops that fit this pattern are simply switch
557        * statements that are converted to a loop to take advantage of
558        * exiting jump instruction handling. In this case we could make
559        * use of a binary seach pattern like we do in
560        * nir_lower_indirect_derefs(), this should allow us to unroll the
561        * loops in an optimal way and should also avoid some of the
562        * register pressure that comes from simply nesting the
563        * terminators one after the other.
564        */
565       if (list_length(&loop->info->loop_terminator_list) > 3)
566          return false;
567 
568       loop_prepare_for_unroll(loop);
569 
570       nir_cursor loop_end = nir_after_block(nir_loop_last_block(loop));
571       list_for_each_entry(nir_loop_terminator, terminator,
572                           &loop->info->loop_terminator_list,
573                           loop_terminator_link) {
574 
575          /* Remove break from the terminator */
576          nir_instr *break_instr =
577             nir_block_last_instr(terminator->break_block);
578          nir_instr_remove(break_instr);
579 
580          /* Pluck out the loop body. */
581          nir_cf_list loop_body;
582          nir_cf_extract(&loop_body,
583                         nir_after_cf_node(&terminator->nif->cf_node),
584                         loop_end);
585 
586          /* Reinsert loop body into continue from block */
587          nir_cf_reinsert(&loop_body,
588                          nir_after_block(terminator->continue_from_block));
589 
590          loop_end = terminator->continue_from_then ? nir_after_block(nir_if_last_then_block(terminator->nif)) : nir_after_block(nir_if_last_else_block(terminator->nif));
591       }
592    } else {
593       loop_prepare_for_unroll(loop);
594    }
595 
596    /* Pluck out the loop body. */
597    nir_cf_list loop_body;
598    nir_cf_extract(&loop_body, nir_before_block(nir_loop_first_block(loop)),
599                   nir_after_block(nir_loop_last_block(loop)));
600 
601    /* Reinsert loop body after the loop */
602    nir_cf_reinsert(&loop_body, nir_after_cf_node(&loop->cf_node));
603 
604    /* The loop has been unrolled so remove it. */
605    nir_cf_node_remove(&loop->cf_node);
606 
607    return true;
608 }
609 
610 static bool
is_access_out_of_bounds(nir_loop_terminator * term,nir_deref_instr * deref,unsigned trip_count)611 is_access_out_of_bounds(nir_loop_terminator *term, nir_deref_instr *deref,
612                         unsigned trip_count)
613 {
614    for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) {
615       if (d->deref_type != nir_deref_type_array)
616          continue;
617 
618       nir_alu_instr *alu = nir_instr_as_alu(term->conditional_instr);
619       nir_src src = term->induction_rhs ? alu->src[1].src : alu->src[0].src;
620       if (!nir_srcs_equal(d->arr.index, src))
621          continue;
622 
623       nir_deref_instr *parent = nir_deref_instr_parent(d);
624       assert(glsl_type_is_array(parent->type) ||
625              glsl_type_is_matrix(parent->type) ||
626              glsl_type_is_vector(parent->type));
627 
628       /* We have already unrolled the loop and the new one will be imbedded in
629        * the innermost continue branch. So unless the array is greater than
630        * the trip count any iteration over the loop will be an out of bounds
631        * access of the array.
632        */
633       unsigned length = glsl_type_is_vector(parent->type) ? glsl_get_vector_elements(parent->type) : glsl_get_length(parent->type);
634       return length <= trip_count;
635    }
636 
637    return false;
638 }
639 
640 static bool
comparison_contains_instr(nir_scalar cond_scalar,nir_instr * instr)641 comparison_contains_instr(nir_scalar cond_scalar, nir_instr *instr)
642 {
643    if (nir_is_terminator_condition_with_two_inputs(cond_scalar)) {
644       nir_alu_instr *comparison =
645          nir_instr_as_alu(cond_scalar.def->parent_instr);
646       return comparison->src[0].src.ssa->parent_instr == instr ||
647              comparison->src[1].src.ssa->parent_instr == instr;
648    }
649 
650    return false;
651 }
652 
653 /* If we know an array access is going to be out of bounds remove or replace
654  * the access with an undef. This can later result in the entire loop being
655  * removed by nir_opt_dead_cf().
656  */
657 static void
remove_out_of_bounds_induction_use(nir_shader * shader,nir_loop * loop,nir_loop_terminator * term,nir_cf_list * lp_header,nir_cf_list * lp_body,unsigned trip_count)658 remove_out_of_bounds_induction_use(nir_shader *shader, nir_loop *loop,
659                                    nir_loop_terminator *term,
660                                    nir_cf_list *lp_header,
661                                    nir_cf_list *lp_body,
662                                    unsigned trip_count)
663 {
664    if (!loop->info->guessed_trip_count)
665       return;
666 
667    /* Temporarily recreate the original loop so we can alter it */
668    nir_cf_reinsert(lp_header, nir_after_block(nir_loop_last_block(loop)));
669    nir_cf_reinsert(lp_body, nir_after_block(nir_loop_last_block(loop)));
670 
671    nir_builder b = nir_builder_create(nir_cf_node_get_function(&loop->cf_node));
672 
673    nir_foreach_block_in_cf_node(block, &loop->cf_node) {
674       nir_foreach_instr_safe(instr, block) {
675          if (instr->type != nir_instr_type_intrinsic)
676             continue;
677 
678          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
679 
680          /* Check for arrays variably-indexed by a loop induction variable.
681           * If this access is out of bounds remove the instruction or replace
682           * its use with an undefined instruction.
683           * If the loop is no longer useful we leave it for the appropriate
684           * pass to clean it up for us.
685           */
686          if (intrin->intrinsic == nir_intrinsic_load_deref ||
687              intrin->intrinsic == nir_intrinsic_store_deref ||
688              intrin->intrinsic == nir_intrinsic_copy_deref) {
689 
690             if (is_access_out_of_bounds(term, nir_src_as_deref(intrin->src[0]),
691                                         trip_count)) {
692                if (intrin->intrinsic == nir_intrinsic_load_deref) {
693                   nir_alu_instr *term_alu =
694                      nir_instr_as_alu(term->nif->condition.ssa->parent_instr);
695                   b.cursor = nir_before_instr(term->nif->condition.ssa->parent_instr);
696 
697                   /* If the out of bounds load is used in the comparison of the
698                    * loop terminator replace the condition with true so that the
699                    * loop can be removed.
700                    */
701                   bool exit_on_true = !term->continue_from_then;
702                   if (term_alu->op == nir_op_ior && exit_on_true) {
703                      nir_scalar src0_cond_scalar = { term_alu->src[0].src.ssa, 0 };
704                      nir_scalar src1_cond_scalar = { term_alu->src[1].src.ssa, 0 };
705 
706                      bool replaced_comparison = false;
707                      if (comparison_contains_instr(src0_cond_scalar, instr)) {
708                         nir_def *t = nir_imm_true(&b);
709                         nir_def_rewrite_uses(term_alu->src[0].src.ssa, t);
710                         replaced_comparison = true;
711                      }
712 
713                      if (comparison_contains_instr(src1_cond_scalar, instr)) {
714                         nir_def *t = nir_imm_true(&b);
715                         nir_def_rewrite_uses(term_alu->src[1].src.ssa, t);
716                         replaced_comparison = true;
717                      }
718 
719                      if (replaced_comparison)
720                         continue;
721                   }
722 
723                   nir_def *undef =
724                      nir_undef(&b, intrin->def.num_components,
725                                intrin->def.bit_size);
726                   nir_def_rewrite_uses(&intrin->def,
727                                        undef);
728                } else {
729                   nir_instr_remove(instr);
730                   continue;
731                }
732             }
733 
734             if (intrin->intrinsic == nir_intrinsic_copy_deref &&
735                 is_access_out_of_bounds(term, nir_src_as_deref(intrin->src[1]),
736                                         trip_count)) {
737                nir_instr_remove(instr);
738             }
739          }
740       }
741    }
742 
743    /* Now that we are done extract the loop header and body again */
744    nir_cf_extract(lp_header, nir_before_block(nir_loop_first_block(loop)),
745                   nir_before_cf_node(&term->nif->cf_node));
746    nir_cf_extract(lp_body, nir_before_block(nir_loop_first_block(loop)),
747                   nir_after_block(nir_loop_last_block(loop)));
748 }
749 
750 /* Partially unrolls loops that don't have a known trip count.
751  */
752 static void
partial_unroll(nir_shader * shader,nir_loop * loop,unsigned trip_count)753 partial_unroll(nir_shader *shader, nir_loop *loop, unsigned trip_count)
754 {
755    assert(list_is_singular(&loop->info->loop_terminator_list));
756 
757    nir_loop_terminator *terminator =
758       list_first_entry(&loop->info->loop_terminator_list,
759                        nir_loop_terminator, loop_terminator_link);
760 
761    assert(nir_is_trivial_loop_if(terminator->nif, terminator->break_block));
762 
763    loop_prepare_for_unroll(loop);
764 
765    /* Pluck out the loop header */
766    nir_cf_list lp_header;
767    nir_cf_extract(&lp_header, nir_before_block(nir_loop_first_block(loop)),
768                   nir_before_cf_node(&terminator->nif->cf_node));
769 
770    struct hash_table *remap_table =
771       _mesa_hash_table_create(NULL, _mesa_hash_pointer,
772                               _mesa_key_pointer_equal);
773 
774    nir_cf_list lp_body;
775    nir_cf_node *unroll_loc =
776       complex_unroll_loop_body(loop, terminator, &lp_header, &lp_body,
777                                remap_table, trip_count);
778 
779    /* Attempt to remove out of bounds array access */
780    remove_out_of_bounds_induction_use(shader, loop, terminator, &lp_header,
781                                       &lp_body, trip_count);
782 
783    nir_cursor cursor =
784       get_complex_unroll_insert_location(unroll_loc,
785                                          terminator->continue_from_then);
786 
787    /* Reinsert the loop in the innermost nested continue branch of the unrolled
788     * loop.
789     */
790    nir_loop *new_loop = nir_loop_create(shader);
791    nir_cf_node_insert(cursor, &new_loop->cf_node);
792    new_loop->partially_unrolled = true;
793 
794    /* Clone loop header and insert into new loop */
795    nir_cf_list_clone_and_reinsert(&lp_header, loop->cf_node.parent,
796                                   nir_after_cf_list(&new_loop->body),
797                                   remap_table);
798 
799    /* Clone loop body and insert into new loop */
800    nir_cf_list_clone_and_reinsert(&lp_body, loop->cf_node.parent,
801                                   nir_after_cf_list(&new_loop->body),
802                                   remap_table);
803 
804    /* Insert break back into terminator */
805    nir_jump_instr *brk = nir_jump_instr_create(shader, nir_jump_break);
806    nir_block *break_block = _mesa_hash_table_search(remap_table, terminator->break_block)->data;
807    nir_instr_insert_after_block(break_block, &brk->instr);
808 
809    /* Delete the original loop header and body */
810    nir_cf_delete(&lp_header);
811    nir_cf_delete(&lp_body);
812 
813    /* The original loop has been replaced so remove it. */
814    nir_cf_node_remove(&loop->cf_node);
815 
816    _mesa_hash_table_destroy(remap_table, NULL);
817 }
818 
819 static bool
is_indirect_load(nir_instr * instr)820 is_indirect_load(nir_instr *instr)
821 {
822    if (instr->type == nir_instr_type_intrinsic) {
823       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
824 
825       if ((intrin->intrinsic == nir_intrinsic_load_ubo ||
826            intrin->intrinsic == nir_intrinsic_load_ssbo) &&
827           !nir_src_is_const(intrin->src[1])) {
828          return true;
829       }
830 
831       if (intrin->intrinsic == nir_intrinsic_load_global)
832          return true;
833 
834       if (intrin->intrinsic == nir_intrinsic_load_deref ||
835           intrin->intrinsic == nir_intrinsic_store_deref) {
836          nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
837          nir_variable_mode mem_modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_global;
838          if (!nir_deref_mode_may_be(deref, mem_modes))
839             return false;
840          while (deref) {
841             if ((deref->deref_type == nir_deref_type_array ||
842                  deref->deref_type == nir_deref_type_ptr_as_array) &&
843                 !nir_src_is_const(deref->arr.index)) {
844                return true;
845             }
846             deref = nir_deref_instr_parent(deref);
847          }
848       }
849    } else if (instr->type == nir_instr_type_tex) {
850       nir_tex_instr *tex = nir_instr_as_tex(instr);
851 
852       for (unsigned i = 0; i < tex->num_srcs; i++) {
853          if (!nir_src_is_const(tex->src[i].src))
854             return true;
855       }
856    }
857 
858    return false;
859 }
860 
861 static bool
can_pipeline_loads(nir_loop * loop)862 can_pipeline_loads(nir_loop *loop)
863 {
864    if (!loop->info->exact_trip_count_known)
865       return false;
866 
867    bool interesting_loads = false;
868 
869    foreach_list_typed(nir_cf_node, cf_node, node, &loop->body) {
870       if (cf_node == &loop->info->limiting_terminator->nif->cf_node)
871          continue;
872 
873       /* Control flow usually prevents useful scheduling */
874       if (cf_node->type != nir_cf_node_block)
875          return false;
876 
877       if (interesting_loads)
878          continue;
879 
880       nir_block *block = nir_cf_node_as_block(cf_node);
881       nir_foreach_instr(instr, block) {
882          if (is_indirect_load(instr)) {
883             interesting_loads = true;
884             break;
885          }
886       }
887    }
888 
889    return interesting_loads;
890 }
891 
892 /*
893  * Returns true if we should unroll the loop, otherwise false.
894  */
895 static bool
check_unrolling_restrictions(nir_shader * shader,nir_loop * loop)896 check_unrolling_restrictions(nir_shader *shader, nir_loop *loop)
897 {
898    if (loop->control == nir_loop_control_unroll)
899       return true;
900 
901    if (loop->control == nir_loop_control_dont_unroll)
902       return false;
903 
904    nir_loop_info *li = loop->info;
905    unsigned max_iter = shader->options->max_unroll_iterations;
906    /* Unroll much more aggressively if it can hide load latency. */
907    if (shader->options->max_unroll_iterations_aggressive && can_pipeline_loads(loop))
908       max_iter = shader->options->max_unroll_iterations_aggressive;
909    /* Tune differently if the loop has double ops and soft fp64 is in use */
910    else if (shader->options->max_unroll_iterations_fp64 && loop->info->has_soft_fp64)
911       max_iter = shader->options->max_unroll_iterations_fp64;
912    unsigned trip_count =
913       li->max_trip_count ? li->max_trip_count : li->guessed_trip_count;
914 
915    if (li->force_unroll && !li->guessed_trip_count && trip_count <= max_iter)
916       return true;
917 
918    unsigned cost_limit = max_iter * LOOP_UNROLL_LIMIT;
919    unsigned cost = li->instr_cost * trip_count;
920 
921    if (cost <= cost_limit && trip_count <= max_iter)
922       return true;
923 
924    return false;
925 }
926 
927 static bool
928 process_loops(nir_shader *sh, nir_cf_node *cf_node, bool *has_nested_loop_out,
929               bool *unrolled_this_block);
930 
931 static bool
process_loops_in_block(nir_shader * sh,struct exec_list * block,bool * has_nested_loop_out)932 process_loops_in_block(nir_shader *sh, struct exec_list *block,
933                        bool *has_nested_loop_out)
934 {
935    /* We try to unroll as many loops in one pass as possible.
936     * E.g. we can safely unroll both loops in this block:
937     *
938     *    if (...) {
939     *       loop {...}
940     *    }
941     *
942     *    if (...) {
943     *       loop {...}
944     *    }
945     *
946     * Unrolling one loop doesn't affect the other one.
947     *
948     * On the other hand for block with:
949     *
950     *    loop {...}
951     *    ...
952     *    loop {...}
953     *
954     * It is unsafe to unroll both loops in one pass without taking
955     * complicating precautions, since the structure of the block would
956     * change after unrolling the first loop. So in such a case we leave
957     * the second loop for the next iteration of unrolling to handle.
958     */
959 
960    bool progress = false;
961    bool unrolled_this_block = false;
962 
963    foreach_list_typed(nir_cf_node, nested_node, node, block) {
964       if (process_loops(sh, nested_node,
965                         has_nested_loop_out, &unrolled_this_block)) {
966          progress = true;
967 
968          /* If current node is unrolled we could not safely continue
969           * our iteration since we don't know the next node
970           * and it's hard to guarantee that we won't end up unrolling
971           * inner loop of the currently unrolled one, if such exists.
972           */
973          if (unrolled_this_block) {
974             break;
975          }
976       }
977    }
978 
979    return progress;
980 }
981 
982 static bool
process_loops(nir_shader * sh,nir_cf_node * cf_node,bool * has_nested_loop_out,bool * unrolled_this_block)983 process_loops(nir_shader *sh, nir_cf_node *cf_node, bool *has_nested_loop_out,
984               bool *unrolled_this_block)
985 {
986    bool progress = false;
987    bool has_nested_loop = false;
988    nir_loop *loop;
989 
990    switch (cf_node->type) {
991    case nir_cf_node_block:
992       return progress;
993    case nir_cf_node_if: {
994       nir_if *if_stmt = nir_cf_node_as_if(cf_node);
995       progress |= process_loops_in_block(sh, &if_stmt->then_list,
996                                          has_nested_loop_out);
997       progress |= process_loops_in_block(sh, &if_stmt->else_list,
998                                          has_nested_loop_out);
999       return progress;
1000    }
1001    case nir_cf_node_loop: {
1002       loop = nir_cf_node_as_loop(cf_node);
1003       assert(!nir_loop_has_continue_construct(loop));
1004       progress |= process_loops_in_block(sh, &loop->body, &has_nested_loop);
1005 
1006       break;
1007    }
1008    default:
1009       unreachable("unknown cf node type");
1010    }
1011 
1012    const bool unrolled_child_block = progress;
1013 
1014    /* Don't attempt to unroll a second inner loop in this pass, wait until the
1015     * next pass as we have altered the cf.
1016     */
1017    if (!progress && loop->control != nir_loop_control_dont_unroll) {
1018 
1019       /* Remove the conditional break statements associated with all terminators
1020        * that are associated with a fixed iteration count, except for the one
1021        * associated with the limiting terminator--that one needs to stay, since
1022        * it terminates the loop.
1023        */
1024       if (loop->info->limiting_terminator) {
1025          list_for_each_entry_safe(nir_loop_terminator, t,
1026                                   &loop->info->loop_terminator_list,
1027                                   loop_terminator_link) {
1028             if (t->exact_trip_count_unknown)
1029                continue;
1030 
1031             if (t != loop->info->limiting_terminator) {
1032 
1033                /* Only delete the if-statement if the continue block is empty.
1034                 * We trust that nir_opt_if() does its job well enough to
1035                 * remove all instructions from the continue block when possible.
1036                 */
1037                nir_block *first_continue_from_blk = t->continue_from_then ? nir_if_first_then_block(t->nif) : nir_if_first_else_block(t->nif);
1038 
1039                if (!(nir_cf_node_is_last(&first_continue_from_blk->cf_node) &&
1040                      exec_list_is_empty(&first_continue_from_blk->instr_list)))
1041                   continue;
1042 
1043                /* Now delete the if */
1044                nir_cf_node_remove(&t->nif->cf_node);
1045 
1046                /* Also remove it from the terminator list */
1047                list_del(&t->loop_terminator_link);
1048 
1049                progress = true;
1050             }
1051          }
1052       }
1053 
1054       /* Check for the classic
1055        *
1056        *    do {
1057        *        // ...
1058        *    } while (false)
1059        *
1060        * that is used to wrap multi-line macros. GLSL IR also wraps switch
1061        * statements in a loop like this.
1062        */
1063       if (loop->info->limiting_terminator == NULL &&
1064           !loop->info->complex_loop) {
1065 
1066          nir_block *last_loop_blk = nir_loop_last_block(loop);
1067          if (nir_block_ends_in_break(last_loop_blk)) {
1068             progress = wrapper_unroll(loop);
1069             goto exit;
1070          }
1071 
1072          /* If we were able to guess the loop iteration based on array access
1073           * then do a partial unroll.
1074           */
1075          bool one_lt = list_is_singular(&loop->info->loop_terminator_list);
1076          if (!has_nested_loop && one_lt && !loop->partially_unrolled &&
1077              loop->info->guessed_trip_count &&
1078              check_unrolling_restrictions(sh, loop)) {
1079             partial_unroll(sh, loop, loop->info->guessed_trip_count);
1080             progress = true;
1081          }
1082       }
1083 
1084       /* Intentionally don't consider exact_trip_count_known here.  When
1085        * max_trip_count is non-zero, it is the upper bound on the number of
1086        * times the loop will iterate, but the loop may iterate less.  For
1087        * example, the following loop will iterate 0 or 1 time:
1088        *
1089        *    for (i = 0; i < min(x, 1); i++) { ... }
1090        *
1091        * Trivial single-interation loops (e.g., do { ... } while (false)) and
1092        * trivial zero-iteration loops (e.g., while (false) { ... }) will have
1093        * already been handled.
1094        *
1095        * If the loop is known to execute at most once and meets the other
1096        * unrolling criteria, unroll it even if it has nested loops.
1097        *
1098        * It is unlikely that such loops exist in real shaders. GraphicsFuzz is
1099        * known to generate spurious loops that iterate exactly once.  It is
1100        * plausible that it could eventually start generating loops like the
1101        * example above, so it seems logical to defend against it now.
1102        */
1103       if (!loop->info->limiting_terminator ||
1104           (loop->info->max_trip_count != 1 && has_nested_loop))
1105          goto exit;
1106 
1107       if (!check_unrolling_restrictions(sh, loop))
1108          goto exit;
1109 
1110       if (loop->info->exact_trip_count_known) {
1111          simple_unroll(loop);
1112          progress = true;
1113       } else {
1114          /* Attempt to unroll loops with two terminators. */
1115          unsigned num_lt = list_length(&loop->info->loop_terminator_list);
1116          if (num_lt == 2 &&
1117              !loop->info->limiting_terminator->exact_trip_count_unknown) {
1118             bool limiting_term_second = true;
1119             nir_loop_terminator *terminator =
1120                list_first_entry(&loop->info->loop_terminator_list,
1121                                 nir_loop_terminator, loop_terminator_link);
1122 
1123             if (terminator->nif == loop->info->limiting_terminator->nif) {
1124                limiting_term_second = false;
1125                terminator =
1126                   list_last_entry(&loop->info->loop_terminator_list,
1127                                   nir_loop_terminator, loop_terminator_link);
1128             }
1129 
1130             /* If the first terminator has a trip count of zero and is the
1131              * limiting terminator just do a simple unroll as the second
1132              * terminator can never be reached.
1133              */
1134             if (loop->info->max_trip_count == 0 && !limiting_term_second) {
1135                simple_unroll(loop);
1136             } else {
1137                complex_unroll(loop, terminator, limiting_term_second);
1138             }
1139             progress = true;
1140          }
1141 
1142          if (num_lt == 1) {
1143             assert(loop->info->limiting_terminator->exact_trip_count_unknown);
1144             complex_unroll_single_terminator(loop);
1145             progress = true;
1146          }
1147       }
1148    }
1149 
1150 exit:
1151    *has_nested_loop_out = true;
1152    if (progress && !unrolled_child_block)
1153       *unrolled_this_block = true;
1154 
1155    return progress;
1156 }
1157 
1158 static bool
nir_opt_loop_unroll_impl(nir_function_impl * impl,nir_variable_mode indirect_mask,bool force_unroll_sampler_indirect)1159 nir_opt_loop_unroll_impl(nir_function_impl *impl,
1160                          nir_variable_mode indirect_mask,
1161                          bool force_unroll_sampler_indirect)
1162 {
1163    bool progress = false;
1164    nir_metadata_require(impl, nir_metadata_loop_analysis, indirect_mask,
1165                         (int)force_unroll_sampler_indirect);
1166    nir_metadata_require(impl, nir_metadata_block_index);
1167 
1168    bool has_nested_loop = false;
1169    progress |= process_loops_in_block(impl->function->shader, &impl->body,
1170                                       &has_nested_loop);
1171 
1172    if (progress) {
1173       nir_metadata_preserve(impl, nir_metadata_none);
1174       nir_lower_reg_intrinsics_to_ssa_impl(impl);
1175    } else {
1176       nir_metadata_preserve(impl, nir_metadata_all);
1177    }
1178 
1179    return progress;
1180 }
1181 
1182 /**
1183  * indirect_mask specifies which type of indirectly accessed variables
1184  * should force loop unrolling.
1185  */
1186 bool
nir_opt_loop_unroll(nir_shader * shader)1187 nir_opt_loop_unroll(nir_shader *shader)
1188 {
1189    bool progress = false;
1190 
1191    bool force_unroll_sampler_indirect = shader->options->force_indirect_unrolling_sampler;
1192    nir_variable_mode indirect_mask = shader->options->force_indirect_unrolling;
1193    nir_foreach_function_impl(impl, shader) {
1194       progress |= nir_opt_loop_unroll_impl(impl, indirect_mask,
1195                                            force_unroll_sampler_indirect);
1196    }
1197    return progress;
1198 }
1199