/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/set.h"
#include "nir.h"
#include "nir_builder.h"

/* This pass provides a way to move computations that are always the same for
 * an entire draw/compute dispatch into a "preamble" that runs before the main
 * entrypoint.
 *
 * We also expose a separate API to get or construct the preamble of a shader
 * in case backends want to insert their own code.
 */

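/* As a purely illustrative sketch (not any specific backend), a shader such
 * as
 *
 *    x = load_ubo(0, 0)
 *    y = fmul x, 2.0
 *    ... varying work that uses y ...
 *
 * is rewritten by this pass into
 *
 *    preamble:
 *       x = load_ubo(0, 0)
 *       y = fmul x, 2.0
 *       store_preamble y, base=0
 *
 *    main entrypoint:
 *       y = load_preamble base=0
 *       ... varying work that uses y ...
 *
 * and the backend decides how store_preamble/load_preamble map onto its
 * uniform registers or other storage, using the offsets handed out below.
 */
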
nir_function_impl *
nir_shader_get_preamble(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   if (entrypoint->preamble) {
      return entrypoint->preamble->impl;
   } else {
      nir_function *preamble = nir_function_create(shader, "@preamble");
      preamble->is_preamble = true;
      nir_function_impl *impl = nir_function_impl_create(preamble);
      entrypoint->preamble = preamble;
      return impl;
   }
}

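/* For example, a backend that wants to precompute its own value could append
 * to the preamble with the builder API (a minimal sketch; "val" and the base
 * offset 0 are hypothetical and backend-chosen):
 *
 *    nir_function_impl *preamble = nir_shader_get_preamble(shader);
 *    nir_builder b = nir_builder_at(nir_after_impl(preamble));
 *    nir_def *val = ...;
 *    nir_store_preamble(&b, val, .base = 0);
 *
 * and later read it back in the main shader with load_preamble at the same
 * base offset.
 */
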
typedef struct {
   /* Whether this instruction can be moved into the preamble at all. */
   bool can_move;

   /* Whether this def is a candidate for being replaced by a load_preamble,
    * i.e. it is can_move, not avoided, and has at least one use that cannot
    * move along with it.
    */
   bool candidate;

   /* Set for avoided (non-candidate) defs that have a user remaining in the
    * main shader, so they can never be removed from it.
    */
   bool must_stay;

   /* Whether we decided to store this def in the preamble and rewrite its
    * remaining uses to load_preamble.
    */
   bool replace;

   /* Number of uses whose user is itself can_move (and not must_stay), used
    * to split the value between them.
    */
   unsigned can_move_users;

   /* Backend-reported footprint of this def in preamble storage. */
   unsigned size, align;

   /* Offset assigned in preamble storage once replace is set. */
   unsigned offset;

   /* Average the cost of a value among its users, to try to account for
    * values that have multiple can_move uses.
    */
   float value;

   /* Overall benefit, i.e. the value minus any cost to inserting
    * load_preamble.
    */
   float benefit;
} def_state;

typedef struct {
   /* Per-definition array of states */
   def_state *states;

   /* Number of levels of non-uniform control flow we're in. We don't
    * reconstruct loops, so loops count as non-uniform conservatively. If-else
    * is counted if the condition is not marked can_move.
    */
   unsigned nonuniform_cf_nesting;

   /* Set of nir_if's that must be reconstructed in the preamble. Note an if may
    * need reconstruction even when not entirely moved. This does not account
    * for nesting: the parent CF nodes of ifs in this set must be reconstructed
    * but may not be in this set, even if the parent is another if.
    */
   struct set *reconstructed_ifs;

   /* Set of definitions that must be reconstructed in the preamble. This is a
    * subset of can_move instructions, determined after replacement.
    */
   BITSET_WORD *reconstructed_defs;

   /* The def whose sources are currently being visited in update_src_value. */
   nir_def *def;

   const nir_opt_preamble_options *options;
} opt_preamble_ctx;

static bool
instr_can_speculate(nir_instr *instr)
{
   /* Intrinsics with an ACCESS index can only be speculated if they are
    * explicitly CAN_SPECULATE.
    */
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (nir_intrinsic_has_access(intr))
         return nir_intrinsic_access(intr) & ACCESS_CAN_SPECULATE;
   }

   /* For now, everything else can be speculated. TODO: Bindless textures. */
   return true;
}

static float
get_instr_cost(nir_instr *instr, const nir_opt_preamble_options *options)
{
   /* No backend will want to hoist load_const or undef by itself, so handle
    * this for them.
    */
   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_undef)
      return 0;

   return options->instr_cost_cb(instr, options->cb_data);
}

static bool
can_move_src(nir_src *src, void *state)
{
   opt_preamble_ctx *ctx = state;

   return ctx->states[src->ssa->index].can_move;
}

static bool
can_move_srcs(nir_instr *instr, opt_preamble_ctx *ctx)
{
   return nir_foreach_src(instr, can_move_src, ctx);
}

static bool
can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->intrinsic) {
   /* Intrinsics which can always be moved */
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_fep_w_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
   case nir_intrinsic_load_clamp_vertex_color_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_prim_precision_amd:
   case nir_intrinsic_load_vbo_base_agx:
      return true;

   /* Intrinsics which can be moved depending on hardware */
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
      return ctx->options->drawid_uniform;

   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_num_subgroups:
      return ctx->options->subgroup_size_uniform;

   case nir_intrinsic_load_workgroup_size:
      return ctx->options->load_workgroup_size_allowed;

   /* Intrinsics which can be moved if the sources can */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_load_deref:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_preamble:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_image_levels:
   case nir_intrinsic_image_deref_levels:
   case nir_intrinsic_bindless_image_levels:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_bindless_resource_ir3:
   case nir_intrinsic_load_const_ir3:
   case nir_intrinsic_load_constant_agx:
      return can_move_srcs(&instr->instr, ctx);

   /* Image/SSBO loads can be moved if they are CAN_REORDER and their
    * sources can be moved.
    */
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      return (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) &&
             can_move_srcs(&instr->instr, ctx);

   default:
      return false;
   }
}

static bool
can_move_instr(nir_instr *instr, opt_preamble_ctx *ctx)
{
   /* If we are only contained within uniform control flow, no speculation is
    * needed since the control flow will be reconstructed in the preamble. But
    * if we are not, we must be able to speculate instructions to move them.
    */
   if (ctx->nonuniform_cf_nesting > 0 && !instr_can_speculate(instr))
      return false;

   switch (instr->type) {
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* See note below about derivatives. We have special code to convert tex
       * to txd, though, because it's a common case.
       */
      if (nir_tex_instr_has_implicit_derivative(tex) &&
          tex->op != nir_texop_tex) {
         return false;
      }
      return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_alu: {
      /* The preamble is presumably run with only one thread, so we can't run
       * derivatives in it.
       * TODO: Replace derivatives with 0 instead, if real apps hit this.
       */
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (nir_op_is_derivative(alu->op))
         return false;
      else
         return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_intrinsic:
      return can_move_intrinsic(nir_instr_as_intrinsic(instr), ctx);

   case nir_instr_type_load_const:
   case nir_instr_type_undef:
      return true;

   case nir_instr_type_deref: {
      nir_deref_instr *deref = nir_instr_as_deref(instr);
      if (deref->deref_type == nir_deref_type_var) {
         switch (deref->modes) {
         case nir_var_uniform:
         case nir_var_mem_ubo:
            return true;
         default:
            return false;
         }
      } else {
         return can_move_srcs(instr, ctx);
      }
   }

   /* We can only move phis if all of their sources are movable, and it is a phi
    * for an if-else that is itself movable.
    */
   case nir_instr_type_phi: {
      nir_cf_node *prev_node = nir_cf_node_prev(&instr->block->cf_node);
      if (!prev_node)
         return false;

      if (prev_node->type != nir_cf_node_if) {
         assert(prev_node->type == nir_cf_node_loop);
         return false;
      }

      nir_if *nif = nir_cf_node_as_if(prev_node);
      if (!can_move_src(&nif->condition, ctx))
         return false;

      return can_move_srcs(instr, ctx);
   }

   default:
      return false;
   }
}

/* True if we should avoid making this a candidate. This is only called on
 * instructions we have already determined we can move; it just makes it so
 * that uses of this instruction cannot be rewritten. Typically this happens
 * because of static constraints on the IR, for example some deref chains
 * cannot be broken.
 */
static bool
avoid_instr(nir_instr *instr, const nir_opt_preamble_options *options)
{
   if (instr->type == nir_instr_type_deref)
      return true;

   return options->avoid_instr_cb(instr, options->cb_data);
}

static bool
update_src_value(nir_src *src, void *data)
{
   opt_preamble_ctx *ctx = data;

   def_state *state = &ctx->states[ctx->def->index];
   def_state *src_state = &ctx->states[src->ssa->index];

   assert(src_state->can_move);

   /* If an instruction has can_move and non-can_move users, it becomes a
    * candidate and its value shouldn't propagate downwards. For example,
    * imagine a chain like this:
    *
    *         -- F (cannot move)
    *        /
    *  A <-- B <-- C <-- D <-- E (cannot move)
    *
    * B and D are marked candidates. Picking B removes A and B, picking D
    * removes C and D, and picking both removes all 4. Therefore B and D are
    * independent and B's value shouldn't flow into D.
    *
    * A similar argument holds for must_stay values.
    */
   if (!src_state->must_stay && !src_state->candidate)
      state->value += src_state->value;
   return true;
}

static int
candidate_sort(const void *data1, const void *data2)
{
   const def_state *state1 = *(def_state **)data1;
   const def_state *state2 = *(def_state **)data2;

   float value1 = state1->value / state1->size;
   float value2 = state2->value / state2->size;
   if (value1 < value2)
      return 1;
   else if (value1 > value2)
      return -1;
   else
      return 0;
}

static bool
calculate_can_move_for_block(opt_preamble_ctx *ctx, nir_block *block)
{
   bool all_can_move = true;

   nir_foreach_instr(instr, block) {
      nir_def *def = nir_instr_def(instr);
      if (!def)
         continue;

      def_state *state = &ctx->states[def->index];
      state->can_move = can_move_instr(instr, ctx);
      all_can_move &= state->can_move;
   }

   return all_can_move;
}

static bool
calculate_can_move_for_cf_list(opt_preamble_ctx *ctx, struct exec_list *list)
{
   bool all_can_move = true;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         all_can_move &=
            calculate_can_move_for_block(ctx, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);
         bool uniform = can_move_src(&nif->condition, ctx);

         if (!uniform)
            ctx->nonuniform_cf_nesting++;

         bool if_can_move = uniform;
         if_can_move &= calculate_can_move_for_cf_list(ctx, &nif->then_list);
         if_can_move &= calculate_can_move_for_cf_list(ctx, &nif->else_list);

         if (!uniform)
            ctx->nonuniform_cf_nesting--;

         all_can_move &= if_can_move;
         break;
      }

      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);

         /* Conservatively treat loops like conditional control flow, since an
          * instruction might be conditionally unreachable due to an earlier
          * break in a loop that executes only one iteration.
          */
         ctx->nonuniform_cf_nesting++;
         calculate_can_move_for_cf_list(ctx, &loop->body);
         ctx->nonuniform_cf_nesting--;
         all_can_move = false;
         break;
      }

      default:
         unreachable("Unexpected CF node type");
      }
   }

   return all_can_move;
}

static void
replace_for_block(nir_builder *b, opt_preamble_ctx *ctx,
                  struct hash_table *remap_table, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_def *def = nir_instr_def(instr);
      if (!def)
         continue;

      /* Only replace what we actually need. This is a micro-optimization for
       * compile-time performance of regular instructions, but it's required for
       * correctness with phi nodes, since we might not reconstruct the
       * corresponding if.
       */
      if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
         continue;

      def_state *state = &ctx->states[def->index];
      assert(state->can_move && "reconstructed => can_move");

      nir_instr *clone;

      if (instr->type == nir_instr_type_phi) {
         /* Phis are special since they can't be cloned with nir_instr_clone */
         nir_phi_instr *phi = nir_instr_as_phi(instr);

         nir_cf_node *nif_cf = nir_cf_node_prev(&block->cf_node);
         assert(nif_cf->type == nir_cf_node_if && "only if's are moveable");
         nir_if *nif = nir_cf_node_as_if(nif_cf);

         nir_block *then_block = nir_if_last_then_block(nif);
         nir_block *else_block = nir_if_last_else_block(nif);

         nir_def *then_def = NULL, *else_def = NULL;

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->pred == then_block) {
               assert(then_def == NULL);
               then_def = phi_src->src.ssa;
            } else if (phi_src->pred == else_block) {
               assert(else_def == NULL);
               else_def = phi_src->src.ssa;
            } else {
               unreachable("Invalid predecessor for phi of if");
            }
         }

         assert(exec_list_length(&phi->srcs) == 2 && "only if's are movable");
         assert(then_def && else_def && "all sources seen");

         /* Remap */
         then_def = _mesa_hash_table_search(remap_table, then_def)->data;
         else_def = _mesa_hash_table_search(remap_table, else_def)->data;

         b->cursor =
            nir_before_block_after_phis(nir_cursor_current_block(b->cursor));

         nir_def *repl = nir_if_phi(b, then_def, else_def);
         clone = repl->parent_instr;

         _mesa_hash_table_insert(remap_table, &phi->def, repl);
      } else {
         clone = nir_instr_clone_deep(b->shader, instr, remap_table);
         nir_builder_instr_insert(b, clone);
      }

      if (clone->type == nir_instr_type_tex) {
         nir_tex_instr *tex = nir_instr_as_tex(clone);
         if (tex->op == nir_texop_tex) {
            /* For maximum compatibility, replace normal textures with
             * textureGrad with a gradient of 0.
             * TODO: Handle txb somehow.
             */
            b->cursor = nir_before_instr(clone);

            nir_def *zero =
               nir_imm_zero(b, tex->coord_components - tex->is_array, 32);
            nir_tex_instr_add_src(tex, nir_tex_src_ddx, zero);
            nir_tex_instr_add_src(tex, nir_tex_src_ddy, zero);
            tex->op = nir_texop_txd;

            b->cursor = nir_after_instr(clone);
         }
      }

      if (state->replace) {
         nir_def *clone_def = nir_instr_def(clone);
         nir_store_preamble(b, clone_def, .base = state->offset);
      }
   }
}

static void
replace_for_cf_list(nir_builder *b, opt_preamble_ctx *ctx,
                    struct hash_table *remap_table, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         replace_for_block(b, ctx, remap_table, nir_cf_node_as_block(node));
         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         /* If we moved something that requires reconstructing the if, do so */
         if (_mesa_set_search(ctx->reconstructed_ifs, nif)) {
            assert(can_move_src(&nif->condition, ctx));

            struct hash_entry *entry =
               _mesa_hash_table_search(remap_table, nif->condition.ssa);
            assert(entry != NULL && "can_move condition, def dominates use");
            nir_def *remap_cond = entry->data;

            nir_if *reconstructed_nif = NULL;
            reconstructed_nif = nir_push_if(b, remap_cond);

            b->cursor = nir_before_cf_list(&reconstructed_nif->then_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->then_list);

            b->cursor = nir_before_cf_list(&reconstructed_nif->else_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->else_list);

            nir_pop_if(b, reconstructed_nif);
            b->cursor = nir_after_cf_node(&reconstructed_nif->cf_node);
         } else {
            replace_for_cf_list(b, ctx, remap_table, &nif->then_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->else_list);
         }

         break;
      }

      case nir_cf_node_loop: {
         /* We don't try to reconstruct loops */
         nir_loop *loop = nir_cf_node_as_loop(node);
         replace_for_cf_list(b, ctx, remap_table, &loop->body);
         break;
      }

      default:
         unreachable("Unexpected CF node type");
      }
   }
}

/*
 * If an if-statement contains an instruction that cannot be speculated, the
 * if-statement must be reconstructed so we avoid the speculation. This applies
 * even for nested if-statements. Determine which if-statements must be
 * reconstructed for this reason by walking the program forward and looking
 * inside uniform if's.
 *
 * Returns whether the CF list contains a reconstructed instruction that would
 * otherwise be speculated, updating the reconstructed_ifs set. This depends on
 * reconstructed_defs being correctly set by analyze_reconstructed.
 */
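/* For example (an illustrative sketch, not taken from a real shader): given
 *
 *    if (movable condition) {
 *       x = load_ssbo ...   (CAN_REORDER but not CAN_SPECULATE)
 *    }
 *
 * if x ends up replaced, it may only be evaluated in the preamble with the if
 * rebuilt around it, so the load still executes under its original condition
 * instead of being speculated.
 */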
static bool
analyze_speculation_for_cf_list(opt_preamble_ctx *ctx, struct exec_list *list)
{
   bool reconstruct_cf_list = false;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_foreach_instr(instr, nir_cf_node_as_block(node)) {
            nir_def *def = nir_instr_def(instr);
            if (!def)
               continue;

            if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
               continue;

            if (!instr_can_speculate(instr)) {
               reconstruct_cf_list = true;
               break;
            }
         }

         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         /* If we can move the if, we might need to reconstruct */
         if (can_move_src(&nif->condition, ctx)) {
            bool any = false;
            any |= analyze_speculation_for_cf_list(ctx, &nif->then_list);
            any |= analyze_speculation_for_cf_list(ctx, &nif->else_list);

            if (any)
               _mesa_set_add(ctx->reconstructed_ifs, nif);

            reconstruct_cf_list |= any;
         }

         break;
      }

      /* We don't reconstruct loops */
      default:
         break;
      }
   }

   return reconstruct_cf_list;
}

static bool
mark_reconstructed(nir_src *src, void *state)
{
   BITSET_WORD *reconstructed_defs = state;
   BITSET_SET(reconstructed_defs, src->ssa->index);
   return true;
}

/*
 * If a phi is moved into the preamble, then the if it depends on must also be
 * moved. However, it is not necessary to consider any nested control flow. As
 * an example, if we have a shader:
 *
 *    if (not moveable condition) {
 *       if (moveable condition) {
 *          x = moveable
 *       }
 *       y = phi x, moveable
 *       z = floor y
 *    }
 *
 * Then if 'z' is in the replace set, we need to reconstruct the inner if, but
 * not the outer if, unless there's also speculation to worry about.
 *
 * We do this by marking defs that need to be reconstructed, with a backwards
 * sweep of the program (compatible with reverse dominance), and marking the
 * if's preceding reconstructed phis.
 */
static void
analyze_reconstructed(opt_preamble_ctx *ctx, nir_function_impl *impl)
{
   nir_foreach_block_reverse(block, impl) {
      /* If an if-statement is reconstructed, its condition must be as well */
      nir_if *nif = nir_block_get_following_if(block);
      if (nif && _mesa_set_search(ctx->reconstructed_ifs, nif))
         BITSET_SET(ctx->reconstructed_defs, nif->condition.ssa->index);

      nir_foreach_instr_reverse(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx->states[def->index];

         /* Anything that's replaced must be reconstructed */
         if (state->replace)
            BITSET_SET(ctx->reconstructed_defs, def->index);
         else if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
            continue;

         /* If it must be reconstructed, it better be moveable */
         assert(state->can_move);

         /* Anything that depends on something reconstructed is reconstructed */
         nir_foreach_src(instr, mark_reconstructed, ctx->reconstructed_defs);

         /* Reconstructed phis need their ifs reconstructed */
         if (instr->type == nir_instr_type_phi) {
            nir_cf_node *prev_node = nir_cf_node_prev(&instr->block->cf_node);

            /* Invariants guaranteed by can_move_instr */
            assert(prev_node != NULL);
            assert(prev_node->type == nir_cf_node_if);

            nir_if *nif = nir_cf_node_as_if(prev_node);
            assert(can_move_src(&nif->condition, ctx));

            /* Mark the if for reconstruction */
            _mesa_set_add(ctx->reconstructed_ifs, nif);
         }
      }
   }
}

bool
nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
                 unsigned *size)
{
   opt_preamble_ctx ctx = {
      .options = options,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   ctx.states = calloc(impl->ssa_alloc, sizeof(*ctx.states));

   /* Step 1: Calculate can_move */
   calculate_can_move_for_cf_list(&ctx, &impl->body);

   /* Step 2: Calculate is_candidate. This is complicated by the presence of
    * non-candidate instructions like derefs whose users cannot be rewritten.
    * If a deref chain is used at all by a non-can_move thing, then any offset
    * sources anywhere along the chain should be considered candidates because
    * the entire deref chain will never be deleted, but if it's only used by
    * can_move things then it becomes subsumed by its users and none of the
    * offset sources should be considered candidates as they will be removed
    * when the users of the deref chain are moved. We need to replace "are
    * there any non-can_move users" with "are there any non-can_move users,
    * *recursing through non-candidate users*". We do this by walking backward
    * and marking when a non-candidate instruction must stay in the final
    * program because it has a non-can_move user, including recursively.
    */
   unsigned num_candidates = 0;
   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_reverse(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         state->value = get_instr_cost(instr, options);
         bool is_candidate = !avoid_instr(instr, options);
         state->candidate = false;
         state->must_stay = false;
         nir_foreach_use_including_if(use, def) {
            bool is_can_move_user;

            if (nir_src_is_if(use)) {
               is_can_move_user = false;
            } else {
               nir_def *use_def = nir_instr_def(nir_src_parent_instr(use));
               is_can_move_user = use_def != NULL &&
                                  ctx.states[use_def->index].can_move &&
                                  !ctx.states[use_def->index].must_stay;
            }

            if (is_can_move_user) {
               state->can_move_users++;
            } else {
               if (is_candidate)
                  state->candidate = true;
               else
                  state->must_stay = true;
            }
         }

         if (state->candidate)
            num_candidates++;
      }
   }

   if (num_candidates == 0) {
      free(ctx.states);
      return false;
   }

   def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
   unsigned candidate_idx = 0;
   unsigned total_size = 0;

   /* Step 3: Calculate value of candidates by propagating downwards. We try
    * to share the value amongst can_move uses, in case there are multiple.
    * This won't always find the most optimal solution, but is hopefully a
    * good heuristic.
    *
    * Note that we use the can_move adjusted in the last pass, because if a
    * can_move instruction cannot be moved because it's not a candidate and it
    * has a non-can_move source then we don't want to count it as a use.
    *
    * While we're here, also collect an array of candidates.
    */
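   /* As a made-up numeric example: if a can_move multiply with cost 2 feeds
    * two can_move candidates, it has can_move_users == 2 and its value is
    * divided to 1 before flowing into each candidate, so uploading both
    * candidates is not credited with saving the multiply twice.
    */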
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move || state->must_stay)
            continue;

         ctx.def = def;
         nir_foreach_src(instr, update_src_value, &ctx);

         /* If this instruction is a candidate, its value shouldn't be
          * propagated so we skip dividing it.
          *
          * Note: if it's can_move but not a candidate, then all its users
          * must be can_move, so if there are no users then it must be dead.
          */
         if (!state->candidate && !state->must_stay) {
            if (state->can_move_users > 0)
               state->value /= state->can_move_users;
            else
               state->value = 0;
         }

         if (state->candidate) {
            state->benefit = state->value -
                             options->rewrite_cost_cb(def, options->cb_data);

            if (state->benefit > 0) {
               options->def_size(def, &state->size, &state->align);
               total_size = ALIGN_POT(total_size, state->align);
               total_size += state->size;
               candidates[candidate_idx++] = state;
            }
         }
      }
   }

   assert(candidate_idx <= num_candidates);
   num_candidates = candidate_idx;

   if (num_candidates == 0) {
      free(ctx.states);
      free(candidates);
      return false;
   }

   /* Step 4: Figure out which candidates we're going to replace and assign an
    * offset. Assuming there is no expression sharing, this is similar to the
    * 0-1 knapsack problem, except when there is a gap introduced by
    * alignment. We use a well-known greedy approximation, sorting by value
    * divided by size.
    */

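   /* A made-up illustration of the greedy heuristic: with 8 units of storage
    * left and candidates A (value 6, size 4), B (value 4, size 2) and
    * C (value 5, size 4), we sort by value/size (B: 2.0, A: 1.5, C: 1.25) and
    * pick B then A, at which point C no longer fits. An exact knapsack solver
    * might choose A and C instead, but the greedy pick is cheap and usually
    * close enough.
    */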
   if (((*size) + total_size) > options->preamble_storage_size) {
      qsort(candidates, num_candidates, sizeof(*candidates), candidate_sort);
   }

   unsigned offset = *size;
   for (unsigned i = 0; i < num_candidates; i++) {
      def_state *state = candidates[i];
      offset = ALIGN_POT(offset, state->align);

      if (offset + state->size > options->preamble_storage_size)
         break;

      state->replace = true;
      state->offset = offset;

      offset += state->size;
   }

   *size = offset;

   free(candidates);

   /* Determine which if's need to be reconstructed, based on the replacements
    * we did.
    */
   ctx.reconstructed_ifs = _mesa_pointer_set_create(NULL);
   ctx.reconstructed_defs = calloc(BITSET_WORDS(impl->ssa_alloc),
                                   sizeof(BITSET_WORD));
   analyze_reconstructed(&ctx, impl);

   /* If we make progress analyzing speculation, we need to re-analyze
    * reconstructed defs to get the if-conditions in there.
    */
   if (analyze_speculation_for_cf_list(&ctx, &impl->body))
      analyze_reconstructed(&ctx, impl);

   /* Step 5: Actually do the replacement. */
   struct hash_table *remap_table =
      _mesa_pointer_hash_table_create(NULL);
   nir_function_impl *preamble =
      nir_shader_get_preamble(impl->function->shader);
   nir_builder preamble_builder = nir_builder_at(nir_before_impl(preamble));
   nir_builder *b = &preamble_builder;

   replace_for_cf_list(b, &ctx, remap_table, &impl->body);

   nir_builder builder = nir_builder_create(impl);
   b = &builder;

   unsigned max_index = impl->ssa_alloc;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         /* Ignore new load_preamble instructions */
         if (def->index >= max_index)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->replace)
            continue;

         b->cursor = nir_after_instr_and_phis(instr);

         nir_def *new_def =
            nir_load_preamble(b, def->num_components, def->bit_size,
                              .base = state->offset);

         nir_def_rewrite_uses(def, new_def);
         nir_instr_free_and_dce(instr);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_control_flow);

   ralloc_free(remap_table);
   free(ctx.states);
   free(ctx.reconstructed_defs);
   _mesa_set_destroy(ctx.reconstructed_ifs, NULL);
   return true;
}