/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/set.h"
#include "nir.h"
#include "nir_builder.h"

/* This pass provides a way to move computations that are always the same for
 * an entire draw/compute dispatch into a "preamble" that runs before the main
 * entrypoint.
 *
 * We also expose a separate API to get or construct the preamble of a shader
 * in case backends want to insert their own code.
 */
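
/* As a rough illustration (not tied to any particular backend), a fragment
 * shader that computes a uniform expression per invocation, e.g.
 *
 *    vec4 c = texture(s, vec2(0.5)) * ubo.scale;   // same for every pixel
 *    out_color = in_color * c;
 *
 * can be split so that the preamble computes `c` once and stores it with
 * store_preamble, while the main shader reloads it with load_preamble:
 *
 *    preamble:  c = txd(s, vec2(0.5), ddx=0, ddy=0) * ubo.scale
 *               store_preamble(c, base=0)
 *    main:      out_color = in_color * load_preamble(base=0)
 *
 * How the stored words are materialized (registers, scratch, a driver UBO,
 * etc.) is entirely up to the backend that lowers load/store_preamble.
 */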

nir_function_impl *
nir_shader_get_preamble(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   if (entrypoint->preamble) {
      return entrypoint->preamble->impl;
   } else {
      nir_function *preamble = nir_function_create(shader, "@preamble");
      preamble->is_preamble = true;
      nir_function_impl *impl = nir_function_impl_create(preamble);
      entrypoint->preamble = preamble;
      return impl;
   }
}
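
/* A minimal sketch of how a backend might use the helper above to emit its
 * own setup code into the preamble (the value computed and the .base offset
 * are hypothetical; real drivers coordinate offsets with their own lowering
 * of load/store_preamble):
 *
 *    nir_function_impl *pre = nir_shader_get_preamble(shader);
 *    nir_builder b = nir_builder_at(nir_before_impl(pre));
 *    nir_def *val = ...;   // backend-specific computation built with &b
 *    nir_store_preamble(&b, val, .base = 0);
 *
 * The main entrypoint can then read it back with
 * nir_load_preamble(&b, num_components, bit_size, .base = 0).
 */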

typedef struct {
   bool can_move;
   bool candidate;
   bool must_stay;
   bool replace;

   unsigned can_move_users;

   unsigned size, align;

   unsigned offset;

   /* Average the cost of a value among its users, to try to account for
    * values that have multiple can_move uses.
    */
   float value;

   /* Overall benefit, i.e. the value minus any cost of inserting
    * load_preamble.
    */
   float benefit;
} def_state;

typedef struct {
   /* Per-definition array of states */
   def_state *states;

   /* Number of levels of non-uniform control flow we're in. We don't
    * reconstruct loops, so loops count as non-uniform conservatively. If-else
    * is counted if the condition is not marked can_move.
    */
   unsigned nonuniform_cf_nesting;

   /* Set of nir_if's that must be reconstructed in the preamble. Note an if
    * may need reconstruction even when not entirely moved. This does not
    * account for nesting: the parent CF nodes of ifs in this set must be
    * reconstructed but may not be in this set, even if the parent is another
    * if.
    */
   struct set *reconstructed_ifs;

   /* Set of definitions that must be reconstructed in the preamble. This is a
    * subset of can_move instructions, determined after replacement.
    */
   BITSET_WORD *reconstructed_defs;

   /* The definition currently being processed while walking sources; used by
    * update_src_value().
    */
   nir_def *def;

   const nir_opt_preamble_options *options;
} opt_preamble_ctx;

static bool
instr_can_speculate(nir_instr *instr)
{
   /* Intrinsics with an ACCESS index can only be speculated if they are
    * explicitly CAN_SPECULATE.
    */
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (nir_intrinsic_has_access(intr))
         return nir_intrinsic_access(intr) & ACCESS_CAN_SPECULATE;
   }

   /* For now, everything else can be speculated. TODO: Bindless textures. */
   return true;
}

static float
get_instr_cost(nir_instr *instr, const nir_opt_preamble_options *options)
{
   /* No backend will want to hoist load_const or undef by itself, so handle
    * this for them.
    */
   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_undef)
      return 0;

   return options->instr_cost_cb(instr, options->cb_data);
}
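
/* The per-instruction cost is entirely backend-defined via instr_cost_cb. A
 * hypothetical callback (illustrative only; see the nir_opt_preamble_options
 * declaration in nir.h for the exact prototype) might look like:
 *
 *    static float
 *    example_instr_cost(nir_instr *instr, const void *data)
 *    {
 *       switch (instr->type) {
 *       case nir_instr_type_tex:
 *          return 5.0f;    // texturing is expensive to re-run per thread
 *       case nir_instr_type_intrinsic:
 *          return 2.0f;    // e.g. UBO/SSBO loads
 *       default:
 *          return 1.0f;    // simple ALU
 *       }
 *    }
 *
 * Larger costs make an instruction (and the chains feeding it) more likely to
 * be hoisted, since cost accumulates into a candidate's value in
 * nir_opt_preamble() below.
 */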

static bool
can_move_src(nir_src *src, void *state)
{
   opt_preamble_ctx *ctx = state;

   return ctx->states[src->ssa->index].can_move;
}

static bool
can_move_srcs(nir_instr *instr, opt_preamble_ctx *ctx)
{
   return nir_foreach_src(instr, can_move_src, ctx);
}

static bool
can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->intrinsic) {
   /* Intrinsics which can always be moved */
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_fep_w_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
   case nir_intrinsic_load_clamp_vertex_color_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_prim_precision_amd:
   case nir_intrinsic_load_vbo_base_agx:
      return true;

   /* Intrinsics which can be moved depending on hardware */
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
      return ctx->options->drawid_uniform;

   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_num_subgroups:
      return ctx->options->subgroup_size_uniform;

   case nir_intrinsic_load_workgroup_size:
      return ctx->options->load_workgroup_size_allowed;

   /* Intrinsics which can be moved if the sources can */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_load_deref:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_preamble:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_image_levels:
   case nir_intrinsic_image_deref_levels:
   case nir_intrinsic_bindless_image_levels:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_bindless_resource_ir3:
   case nir_intrinsic_load_const_ir3:
   case nir_intrinsic_load_constant_agx:
      return can_move_srcs(&instr->instr, ctx);

   /* Image/SSBO loads can be moved if they are CAN_REORDER and their
    * sources can be moved.
    */
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      return (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) &&
             can_move_srcs(&instr->instr, ctx);

   default:
      return false;
   }
}

static bool
can_move_instr(nir_instr *instr, opt_preamble_ctx *ctx)
{
   /* If we are only contained within uniform control flow, no speculation is
    * needed since the control flow will be reconstructed in the preamble. But
    * if we are not, we must be able to speculate instructions to move them.
    */
   if (ctx->nonuniform_cf_nesting > 0 && !instr_can_speculate(instr))
      return false;

   switch (instr->type) {
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* See note below about derivatives. We have special code to convert tex
       * to txd, though, because it's a common case.
       */
      if (nir_tex_instr_has_implicit_derivative(tex) &&
          tex->op != nir_texop_tex) {
         return false;
      }
      return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_alu: {
      /* The preamble is presumably run with only one thread, so we can't run
       * derivatives in it.
       * TODO: Replace derivatives with 0 instead, if real apps hit this.
       */
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (nir_op_is_derivative(alu->op))
         return false;
      else
         return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_intrinsic:
      return can_move_intrinsic(nir_instr_as_intrinsic(instr), ctx);

   case nir_instr_type_load_const:
   case nir_instr_type_undef:
      return true;

   case nir_instr_type_deref: {
      nir_deref_instr *deref = nir_instr_as_deref(instr);
      if (deref->deref_type == nir_deref_type_var) {
         switch (deref->modes) {
         case nir_var_uniform:
         case nir_var_mem_ubo:
            return true;
         default:
            return false;
         }
      } else {
         return can_move_srcs(instr, ctx);
      }
   }

   /* We can only move phis if all of their sources are movable, and it is a
    * phi for an if-else that is itself movable.
    */
   case nir_instr_type_phi: {
      nir_cf_node *prev_node = nir_cf_node_prev(&instr->block->cf_node);
      if (!prev_node)
         return false;

      if (prev_node->type != nir_cf_node_if) {
         assert(prev_node->type == nir_cf_node_loop);
         return false;
      }

      nir_if *nif = nir_cf_node_as_if(prev_node);
      if (!can_move_src(&nif->condition, ctx))
         return false;

      return can_move_srcs(instr, ctx);
   }

   default:
      return false;
   }
}

/* True if we should avoid making this a candidate. This is only called on
 * instructions we already determined we can move; it just makes it so that
 * uses of this instruction cannot be rewritten. Typically this happens
 * because of static constraints on the IR, for example some deref chains
 * cannot be broken.
 */
static bool
avoid_instr(nir_instr *instr, const nir_opt_preamble_options *options)
{
   if (instr->type == nir_instr_type_deref)
      return true;

   return options->avoid_instr_cb(instr, options->cb_data);
}

static bool
update_src_value(nir_src *src, void *data)
{
   opt_preamble_ctx *ctx = data;

   def_state *state = &ctx->states[ctx->def->index];
   def_state *src_state = &ctx->states[src->ssa->index];

   assert(src_state->can_move);

   /* If an instruction has can_move and non-can_move users, it becomes a
    * candidate and its value shouldn't propagate downwards. For example,
    * imagine a chain like this:
    *
    *         -- F (cannot move)
    *        /
    *  A <-- B <-- C <-- D <-- E (cannot move)
    *
    * B and D are marked candidates. Picking B removes A and B, picking D
    * removes C and D, and picking both removes all 4. Therefore B and D are
    * independent and B's value shouldn't flow into D.
    *
    * A similar argument holds for must_stay values.
    */
   if (!src_state->must_stay && !src_state->candidate)
      state->value += src_state->value;
   return true;
}

static int
candidate_sort(const void *data1, const void *data2)
{
   const def_state *state1 = *(def_state **)data1;
   const def_state *state2 = *(def_state **)data2;

   float value1 = state1->value / state1->size;
   float value2 = state2->value / state2->size;
   if (value1 < value2)
      return 1;
   else if (value1 > value2)
      return -1;
   else
      return 0;
}
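
/* Candidates are ordered by value density (value per unit of backend-reported
 * size). For example, with illustrative numbers, a candidate worth 6.0 taking
 * 2 size units (density 3.0) sorts after one worth 4.0 taking 1 unit
 * (density 4.0), so the greedy selection in Step 4 below fills the limited
 * preamble storage with the densest values first.
 */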

static bool
calculate_can_move_for_block(opt_preamble_ctx *ctx, nir_block *block)
{
   bool all_can_move = true;

   nir_foreach_instr(instr, block) {
      nir_def *def = nir_instr_def(instr);
      if (!def)
         continue;

      def_state *state = &ctx->states[def->index];
      state->can_move = can_move_instr(instr, ctx);
      all_can_move &= state->can_move;
   }

   return all_can_move;
}

static bool
calculate_can_move_for_cf_list(opt_preamble_ctx *ctx, struct exec_list *list)
{
   bool all_can_move = true;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         all_can_move &=
            calculate_can_move_for_block(ctx, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);
         bool uniform = can_move_src(&nif->condition, ctx);

         if (!uniform)
            ctx->nonuniform_cf_nesting++;

         bool if_can_move = uniform;
         if_can_move &= calculate_can_move_for_cf_list(ctx, &nif->then_list);
         if_can_move &= calculate_can_move_for_cf_list(ctx, &nif->else_list);

         if (!uniform)
            ctx->nonuniform_cf_nesting--;

         all_can_move &= if_can_move;
         break;
      }

      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);

         /* Conservatively treat loops like conditional control flow, since an
          * instruction might be conditionally unreachable due to an earlier
          * break, even in a loop that executes only one iteration.
          */
         ctx->nonuniform_cf_nesting++;
         calculate_can_move_for_cf_list(ctx, &loop->body);
         ctx->nonuniform_cf_nesting--;
         all_can_move = false;
         break;
      }

      default:
         unreachable("Unexpected CF node type");
      }
   }

   return all_can_move;
}

static void
replace_for_block(nir_builder *b, opt_preamble_ctx *ctx,
                  struct hash_table *remap_table, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_def *def = nir_instr_def(instr);
      if (!def)
         continue;

      /* Only replace what we actually need. This is a micro-optimization for
       * compile-time performance of regular instructions, but it's required
       * for correctness with phi nodes, since we might not reconstruct the
       * corresponding if.
       */
      if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
         continue;

      def_state *state = &ctx->states[def->index];
      assert(state->can_move && "reconstructed => can_move");

      nir_instr *clone;

      if (instr->type == nir_instr_type_phi) {
         /* Phis are special since they can't be cloned with nir_instr_clone */
         nir_phi_instr *phi = nir_instr_as_phi(instr);

         nir_cf_node *nif_cf = nir_cf_node_prev(&block->cf_node);
         assert(nif_cf->type == nir_cf_node_if && "only if's are movable");
         nir_if *nif = nir_cf_node_as_if(nif_cf);

         nir_block *then_block = nir_if_last_then_block(nif);
         nir_block *else_block = nir_if_last_else_block(nif);

         nir_def *then_def = NULL, *else_def = NULL;

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->pred == then_block) {
               assert(then_def == NULL);
               then_def = phi_src->src.ssa;
            } else if (phi_src->pred == else_block) {
               assert(else_def == NULL);
               else_def = phi_src->src.ssa;
            } else {
               unreachable("Invalid predecessor for phi of if");
            }
         }

         assert(exec_list_length(&phi->srcs) == 2 && "only if's are movable");
         assert(then_def && else_def && "all sources seen");

         /* Remap */
         then_def = _mesa_hash_table_search(remap_table, then_def)->data;
         else_def = _mesa_hash_table_search(remap_table, else_def)->data;

         b->cursor =
            nir_before_block_after_phis(nir_cursor_current_block(b->cursor));

         nir_def *repl = nir_if_phi(b, then_def, else_def);
         clone = repl->parent_instr;

         _mesa_hash_table_insert(remap_table, &phi->def, repl);
      } else {
         clone = nir_instr_clone_deep(b->shader, instr, remap_table);
         nir_builder_instr_insert(b, clone);
      }

      if (clone->type == nir_instr_type_tex) {
         nir_tex_instr *tex = nir_instr_as_tex(clone);
         if (tex->op == nir_texop_tex) {
            /* For maximum compatibility, replace plain tex ops with
             * textureGrad (txd) using a gradient of 0.
             * TODO: Handle txb somehow.
             */
            b->cursor = nir_before_instr(clone);

            nir_def *zero =
               nir_imm_zero(b, tex->coord_components - tex->is_array, 32);
            nir_tex_instr_add_src(tex, nir_tex_src_ddx, zero);
            nir_tex_instr_add_src(tex, nir_tex_src_ddy, zero);
            tex->op = nir_texop_txd;

            b->cursor = nir_after_instr(clone);
         }
      }

      if (state->replace) {
         nir_def *clone_def = nir_instr_def(clone);
         nir_store_preamble(b, clone_def, .base = state->offset);
      }
   }
}

static void
replace_for_cf_list(nir_builder *b, opt_preamble_ctx *ctx,
                    struct hash_table *remap_table, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         replace_for_block(b, ctx, remap_table, nir_cf_node_as_block(node));
         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         /* If we moved something that requires reconstructing the if, do so */
         if (_mesa_set_search(ctx->reconstructed_ifs, nif)) {
            assert(can_move_src(&nif->condition, ctx));

            struct hash_entry *entry =
               _mesa_hash_table_search(remap_table, nif->condition.ssa);
            assert(entry != NULL && "can_move condition, def dominates use");
            nir_def *remap_cond = entry->data;

            nir_if *reconstructed_nif = nir_push_if(b, remap_cond);

            b->cursor = nir_before_cf_list(&reconstructed_nif->then_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->then_list);

            b->cursor = nir_before_cf_list(&reconstructed_nif->else_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->else_list);

            nir_pop_if(b, reconstructed_nif);
            b->cursor = nir_after_cf_node(&reconstructed_nif->cf_node);
         } else {
            replace_for_cf_list(b, ctx, remap_table, &nif->then_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->else_list);
         }

         break;
      }

      case nir_cf_node_loop: {
         /* We don't try to reconstruct loops */
         nir_loop *loop = nir_cf_node_as_loop(node);
         replace_for_cf_list(b, ctx, remap_table, &loop->body);
         break;
      }

      default:
         unreachable("Unexpected CF node type");
      }
   }
}

/*
 * If an if-statement contains an instruction that cannot be speculated, the
 * if-statement must be reconstructed so we avoid the speculation. This applies
 * even for nested if-statements. Determine which if-statements must be
 * reconstructed for this reason by walking the program forward and looking
 * inside uniform if's.
 *
 * Returns whether the CF list contains a reconstructed instruction that would
 * otherwise be speculated, updating the reconstructed_ifs set. This depends on
 * reconstructed_defs being correctly set by analyze_reconstructed.
 */
static bool
analyze_speculation_for_cf_list(opt_preamble_ctx *ctx, struct exec_list *list)
{
   bool reconstruct_cf_list = false;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_foreach_instr(instr, nir_cf_node_as_block(node)) {
            nir_def *def = nir_instr_def(instr);
            if (!def)
               continue;

            if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
               continue;

            if (!instr_can_speculate(instr)) {
               reconstruct_cf_list = true;
               break;
            }
         }

         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         /* If we can move the if, we might need to reconstruct */
         if (can_move_src(&nif->condition, ctx)) {
            bool any = false;
            any |= analyze_speculation_for_cf_list(ctx, &nif->then_list);
            any |= analyze_speculation_for_cf_list(ctx, &nif->else_list);

            if (any)
               _mesa_set_add(ctx->reconstructed_ifs, nif);

            reconstruct_cf_list |= any;
         }

         break;
      }

      /* We don't reconstruct loops */
      default:
         break;
      }
   }

   return reconstruct_cf_list;
}

static bool
mark_reconstructed(nir_src *src, void *state)
{
   BITSET_WORD *reconstructed_defs = state;
   BITSET_SET(reconstructed_defs, src->ssa->index);
   return true;
}

/*
 * If a phi is moved into the preamble, then the if it depends on must also be
 * moved. However, it is not necessary to consider any nested control flow. As
 * an example, if we have a shader:
 *
 *    if (not movable condition) {
 *       if (movable condition) {
 *          x = movable
 *       }
 *       y = phi x, movable
 *       z = floor y
 *    }
 *
 * Then if 'z' is in the replace set, we need to reconstruct the inner if, but
 * not the outer if, unless there's also speculation to worry about.
 *
 * We do this by marking defs that need to be reconstructed, with a backwards
 * sweep of the program (compatible with reverse dominance), and marking the
 * if's preceding reconstructed phis.
 */
static void
analyze_reconstructed(opt_preamble_ctx *ctx, nir_function_impl *impl)
{
   nir_foreach_block_reverse(block, impl) {
      /* If an if-statement is reconstructed, its condition must be as well */
      nir_if *nif = nir_block_get_following_if(block);
      if (nif && _mesa_set_search(ctx->reconstructed_ifs, nif))
         BITSET_SET(ctx->reconstructed_defs, nif->condition.ssa->index);

      nir_foreach_instr_reverse(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx->states[def->index];

         /* Anything that's replaced must be reconstructed */
         if (state->replace)
            BITSET_SET(ctx->reconstructed_defs, def->index);
         else if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
            continue;

         /* If it must be reconstructed, it better be movable */
         assert(state->can_move);

         /* Anything that depends on something reconstructed is reconstructed */
         nir_foreach_src(instr, mark_reconstructed, ctx->reconstructed_defs);

         /* Reconstructed phis need their ifs reconstructed */
         if (instr->type == nir_instr_type_phi) {
            nir_cf_node *prev_node = nir_cf_node_prev(&instr->block->cf_node);

            /* Invariants guaranteed by can_move_instr */
            assert(prev_node != NULL);
            assert(prev_node->type == nir_cf_node_if);

            nir_if *nif = nir_cf_node_as_if(prev_node);
            assert(can_move_src(&nif->condition, ctx));

            /* Mark the if for reconstruction */
            _mesa_set_add(ctx->reconstructed_ifs, nif);
         }
      }
   }
}

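/* A rough sketch of how a driver might invoke this pass; the option values
 * and callback names are hypothetical, and the exact option struct layout is
 * declared in nir.h:
 *
 *    nir_opt_preamble_options opts = {
 *       .drawid_uniform = true,
 *       .subgroup_size_uniform = true,
 *       .preamble_storage_size = 64,
 *       .def_size = my_def_size,
 *       .instr_cost_cb = my_instr_cost,
 *       .rewrite_cost_cb = my_rewrite_cost,
 *       .avoid_instr_cb = my_avoid_instr,
 *       .cb_data = my_compiler_ctx,
 *    };
 *    unsigned preamble_size = 0;
 *    NIR_PASS(progress, shader, nir_opt_preamble, &opts, &preamble_size);
 *
 * Afterwards the backend lowers load_preamble/store_preamble intrinsics to
 * whatever storage it reserved; preamble_size reports how much was used.
 */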
bool
nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
                 unsigned *size)
{
   opt_preamble_ctx ctx = {
      .options = options,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   ctx.states = calloc(impl->ssa_alloc, sizeof(*ctx.states));

   /* Step 1: Calculate can_move */
   calculate_can_move_for_cf_list(&ctx, &impl->body);

   /* Step 2: Calculate is_candidate. This is complicated by the presence of
    * non-candidate instructions like derefs whose users cannot be rewritten.
    * If a deref chain is used at all by a non-can_move thing, then any offset
    * sources anywhere along the chain should be considered candidates because
    * the entire deref chain will never be deleted, but if it's only used by
    * can_move things then it becomes subsumed by its users and none of the
    * offset sources should be considered candidates as they will be removed
    * when the users of the deref chain are moved. We need to replace "are
    * there any non-can_move users" with "are there any non-can_move users,
    * *recursing through non-candidate users*". We do this by walking backward
    * and marking when a non-candidate instruction must stay in the final
    * program because it has a non-can_move user, including recursively.
    */
   unsigned num_candidates = 0;
   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_reverse(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         state->value = get_instr_cost(instr, options);
         bool is_candidate = !avoid_instr(instr, options);
         state->candidate = false;
         state->must_stay = false;
         nir_foreach_use_including_if(use, def) {
            bool is_can_move_user;

            if (nir_src_is_if(use)) {
               is_can_move_user = false;
            } else {
               nir_def *use_def = nir_instr_def(nir_src_parent_instr(use));
               is_can_move_user = use_def != NULL &&
                                  ctx.states[use_def->index].can_move &&
                                  !ctx.states[use_def->index].must_stay;
            }

            if (is_can_move_user) {
               state->can_move_users++;
            } else {
               if (is_candidate)
                  state->candidate = true;
               else
                  state->must_stay = true;
            }
         }

         if (state->candidate)
            num_candidates++;
      }
   }

   if (num_candidates == 0) {
      free(ctx.states);
      return false;
   }

   def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
   unsigned candidate_idx = 0;
   unsigned total_size = 0;

   /* Step 3: Calculate value of candidates by propagating downwards. We try
    * to share the value amongst can_move uses, in case there are multiple.
    * This won't always find the most optimal solution, but is hopefully a
    * good heuristic.
    *
    * Note that we use the can_move adjusted in the last pass, because if a
    * can_move instruction cannot be moved because it's not a candidate and it
    * has a non-can_move source then we don't want to count it as a use.
    *
    * While we're here, also collect an array of candidates.
    */
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move || state->must_stay)
            continue;

         ctx.def = def;
         nir_foreach_src(instr, update_src_value, &ctx);

         /* If this instruction is a candidate, its value shouldn't be
          * propagated so we skip dividing it.
          *
          * Note: if it's can_move but not a candidate, then all its users
          * must be can_move, so if there are no users then it must be dead.
          */
         if (!state->candidate && !state->must_stay) {
            if (state->can_move_users > 0)
               state->value /= state->can_move_users;
            else
               state->value = 0;
         }

         if (state->candidate) {
            state->benefit = state->value -
                             options->rewrite_cost_cb(def, options->cb_data);

            if (state->benefit > 0) {
               options->def_size(def, &state->size, &state->align);
               total_size = ALIGN_POT(total_size, state->align);
               total_size += state->size;
               candidates[candidate_idx++] = state;
            }
         }
      }
   }

   assert(candidate_idx <= num_candidates);
   num_candidates = candidate_idx;

   if (num_candidates == 0) {
      free(ctx.states);
      free(candidates);
      return false;
   }

   /* Step 4: Figure out which candidates we're going to replace and assign an
    * offset. Assuming there is no expression sharing, this is similar to the
    * 0-1 knapsack problem, except when there is a gap introduced by
    * alignment. We use a well-known greedy approximation, sorting by value
    * divided by size.
    */

   if (((*size) + total_size) > options->preamble_storage_size) {
      qsort(candidates, num_candidates, sizeof(*candidates), candidate_sort);
   }

   unsigned offset = *size;
   for (unsigned i = 0; i < num_candidates; i++) {
      def_state *state = candidates[i];
      offset = ALIGN_POT(offset, state->align);

      if (offset + state->size > options->preamble_storage_size)
         break;

      state->replace = true;
      state->offset = offset;

      offset += state->size;
   }

   *size = offset;

   free(candidates);

   /* Determine which if's need to be reconstructed, based on the replacements
    * we did.
    */
   ctx.reconstructed_ifs = _mesa_pointer_set_create(NULL);
   ctx.reconstructed_defs = calloc(BITSET_WORDS(impl->ssa_alloc),
                                   sizeof(BITSET_WORD));
   analyze_reconstructed(&ctx, impl);

   /* If we make progress analyzing speculation, we need to re-analyze
    * reconstructed defs to get the if-conditions in there.
    */
   if (analyze_speculation_for_cf_list(&ctx, &impl->body))
      analyze_reconstructed(&ctx, impl);

   /* Step 5: Actually do the replacement. */
   struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL);
   nir_function_impl *preamble =
      nir_shader_get_preamble(impl->function->shader);
   nir_builder preamble_builder = nir_builder_at(nir_before_impl(preamble));
   nir_builder *b = &preamble_builder;

   replace_for_cf_list(b, &ctx, remap_table, &impl->body);

   nir_builder builder = nir_builder_create(impl);
   b = &builder;

   unsigned max_index = impl->ssa_alloc;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         /* Ignore new load_preamble instructions */
         if (def->index >= max_index)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->replace)
            continue;

         b->cursor = nir_after_instr_and_phis(instr);

         nir_def *new_def =
            nir_load_preamble(b, def->num_components, def->bit_size,
                              .base = state->offset);

         nir_def_rewrite_uses(def, new_def);
         nir_instr_free_and_dce(instr);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_control_flow);

   ralloc_free(remap_table);
   free(ctx.states);
   free(ctx.reconstructed_defs);
   _mesa_set_destroy(ctx.reconstructed_ifs, NULL);
   return true;
}