1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "nir.h"
26
27 /* This pass computes, for each ssa definition, whether it is uniform,
28 * that is, whether the value is the same for all invocations
29 * of the group.
30 *
31 * This divergence analysis pass expects the shader to be in LCSSA-form.
32 *
33 * This algorithm implements "The Simple Divergence Analysis" from
34 * Diogo Sampaio, Rafael De Souza, Sylvain Collange, Fernando Magno Quintão Pereira.
35 * Divergence Analysis. ACM Transactions on Programming Languages and Systems (TOPLAS),
36 * ACM, 2013, 35 (4), pp.13:1-13:36. <10.1145/2523815>. <hal-00909072v2>
37 */
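/* A minimal illustration of what the pass computes (NIR-like pseudocode;
 * the value names are made up):
 *
 *    %a = load_push_constant ...    ; always uniform
 *    %b = load_subgroup_invocation  ; always divergent
 *    %c = iadd %a, %a               ; uniform: no divergent source
 *    %d = iadd %a, %b               ; divergent: %b is divergent
 *
 * Values only ever go from uniform to divergent, so loops below are
 * re-visited until a fixed point is reached.
 */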
38
39 struct divergence_state {
40 const gl_shader_stage stage;
41 nir_shader *shader;
42
43 /* Whether the caller requested vertex divergence (meaning between vertices
44 * of the same primitive) instead of subgroup invocation divergence
45 * (between invocations of the same subgroup). For example, patch input
46 * loads are always convergent, while subgroup intrinsics are divergent
47 * because vertices of the same primitive can be processed by different
48 * subgroups.
49 */
50 bool vertex_divergence;
51
52 /** current control flow state */
53 /* True if some loop-active invocations might take a different control-flow path.
54 * A divergent break does not cause subsequent control-flow to be considered
55 * divergent because those invocations are no longer active in the loop.
56 * For a divergent if, both sides are considered divergent flow because
57 * the other side is still loop-active. */
58 bool divergent_loop_cf;
59 /* True if a divergent continue happened since the loop header */
60 bool divergent_loop_continue;
61 /* True if a divergent break happened since the loop header */
62 bool divergent_loop_break;
63
64 /* True if we visit the block for the first time */
65 bool first_visit;
66 };
67
68 static bool
69 visit_cf_list(struct exec_list *list, struct divergence_state *state);
70
71 static bool
72 visit_alu(nir_alu_instr *instr, struct divergence_state *state)
73 {
74 if (instr->def.divergent)
75 return false;
76
77 unsigned num_src = nir_op_infos[instr->op].num_inputs;
78
79 for (unsigned i = 0; i < num_src; i++) {
80 if (instr->src[i].src.ssa->divergent) {
81 instr->def.divergent = true;
82 return true;
83 }
84 }
85
86 return false;
87 }
88
89
90 /* On some HW, uniform loads where there is a pending store/atomic from another
91 * wave can "tear", so that different invocations see the pre-store value and
92 * the post-store value even though they are loading from the same location.
93 * This means we have to assume the load is not uniform unless it is readonly.
94 *
95 * TODO The Vulkan memory model is much more strict here and requires an
96 * atomic or volatile load for the data race to be valid, which could allow us
97 * to do better if it's in use, however we currently don't have that
98 * information plumbed through.
99 */
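/* Illustrative sketch (NIR-like pseudocode) of the distinction made below
 * when nir_divergence_uniform_load_tears is set:
 *
 *    %a = load_ssbo %buf, %off (access=non_writeable)  ; may stay uniform
 *    %b = load_ssbo %buf, %off                         ; may tear -> divergent
 *
 * even when %buf and %off are uniform, because a concurrent store from
 * another wave may be observed by only some invocations.
 */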
100 static bool
101 load_may_tear(nir_shader *shader, nir_intrinsic_instr *instr)
102 {
103 return (shader->options->divergence_analysis_options &
104 nir_divergence_uniform_load_tears) &&
105 !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE);
106 }
107
108 static bool
109 visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
110 {
111 if (!nir_intrinsic_infos[instr->intrinsic].has_dest)
112 return false;
113
114 if (instr->def.divergent)
115 return false;
116
117 nir_divergence_options options =
118 state->shader->options->divergence_analysis_options;
119 gl_shader_stage stage = state->stage;
120 bool is_divergent = false;
121 switch (instr->intrinsic) {
122 case nir_intrinsic_shader_clock:
123 case nir_intrinsic_ballot:
124 case nir_intrinsic_ballot_relaxed:
125 case nir_intrinsic_as_uniform:
126 case nir_intrinsic_read_invocation:
127 case nir_intrinsic_read_first_invocation:
128 case nir_intrinsic_read_invocation_cond_ir3:
129 case nir_intrinsic_vote_any:
130 case nir_intrinsic_vote_all:
131 case nir_intrinsic_vote_feq:
132 case nir_intrinsic_vote_ieq:
133 case nir_intrinsic_first_invocation:
134 case nir_intrinsic_last_invocation:
135 case nir_intrinsic_load_subgroup_id:
136 case nir_intrinsic_shared_append_amd:
137 case nir_intrinsic_shared_consume_amd:
138 /* VS/TES/GS invocations of the same primitive can be in different
139 * subgroups, so subgroup ops are always divergent between vertices of
140 * the same primitive.
141 */
142 is_divergent = state->vertex_divergence;
143 break;
144
145 /* Intrinsics which are always uniform */
146 case nir_intrinsic_load_preamble:
147 case nir_intrinsic_load_push_constant:
148 case nir_intrinsic_load_push_constant_zink:
149 case nir_intrinsic_load_work_dim:
150 case nir_intrinsic_load_num_workgroups:
151 case nir_intrinsic_load_workgroup_size:
152 case nir_intrinsic_load_num_subgroups:
153 case nir_intrinsic_load_ray_launch_size:
154 case nir_intrinsic_load_sbt_base_amd:
155 case nir_intrinsic_load_subgroup_size:
156 case nir_intrinsic_load_subgroup_id_shift_ir3:
157 case nir_intrinsic_load_base_instance:
158 case nir_intrinsic_load_base_vertex:
159 case nir_intrinsic_load_first_vertex:
160 case nir_intrinsic_load_draw_id:
161 case nir_intrinsic_load_is_indexed_draw:
162 case nir_intrinsic_load_viewport_scale:
163 case nir_intrinsic_load_user_clip_plane:
164 case nir_intrinsic_load_viewport_x_scale:
165 case nir_intrinsic_load_viewport_y_scale:
166 case nir_intrinsic_load_viewport_z_scale:
167 case nir_intrinsic_load_viewport_offset:
168 case nir_intrinsic_load_viewport_x_offset:
169 case nir_intrinsic_load_viewport_y_offset:
170 case nir_intrinsic_load_viewport_z_offset:
171 case nir_intrinsic_load_viewport_xy_scale_and_offset:
172 case nir_intrinsic_load_blend_const_color_a_float:
173 case nir_intrinsic_load_blend_const_color_b_float:
174 case nir_intrinsic_load_blend_const_color_g_float:
175 case nir_intrinsic_load_blend_const_color_r_float:
176 case nir_intrinsic_load_blend_const_color_rgba:
177 case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
178 case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
179 case nir_intrinsic_load_line_width:
180 case nir_intrinsic_load_aa_line_width:
181 case nir_intrinsic_load_xfb_address:
182 case nir_intrinsic_load_num_vertices:
183 case nir_intrinsic_load_fb_layers_v3d:
184 case nir_intrinsic_load_fep_w_v3d:
185 case nir_intrinsic_load_tcs_num_patches_amd:
186 case nir_intrinsic_load_tcs_tess_levels_to_tes_amd:
187 case nir_intrinsic_load_tcs_primitive_mode_amd:
188 case nir_intrinsic_load_patch_vertices_in:
189 case nir_intrinsic_load_ring_tess_factors_amd:
190 case nir_intrinsic_load_ring_tess_offchip_amd:
191 case nir_intrinsic_load_ring_tess_factors_offset_amd:
192 case nir_intrinsic_load_ring_tess_offchip_offset_amd:
193 case nir_intrinsic_load_ring_mesh_scratch_amd:
194 case nir_intrinsic_load_ring_mesh_scratch_offset_amd:
195 case nir_intrinsic_load_ring_esgs_amd:
196 case nir_intrinsic_load_ring_es2gs_offset_amd:
197 case nir_intrinsic_load_ring_task_draw_amd:
198 case nir_intrinsic_load_ring_task_payload_amd:
199 case nir_intrinsic_load_sample_positions_amd:
200 case nir_intrinsic_load_rasterization_samples_amd:
201 case nir_intrinsic_load_ring_gsvs_amd:
202 case nir_intrinsic_load_ring_gs2vs_offset_amd:
203 case nir_intrinsic_load_streamout_config_amd:
204 case nir_intrinsic_load_streamout_write_index_amd:
205 case nir_intrinsic_load_streamout_offset_amd:
206 case nir_intrinsic_load_task_ring_entry_amd:
207 case nir_intrinsic_load_ring_attr_amd:
208 case nir_intrinsic_load_ring_attr_offset_amd:
209 case nir_intrinsic_load_provoking_vtx_amd:
210 case nir_intrinsic_load_sample_positions_pan:
211 case nir_intrinsic_load_workgroup_num_input_vertices_amd:
212 case nir_intrinsic_load_workgroup_num_input_primitives_amd:
213 case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
214 case nir_intrinsic_load_prim_gen_query_enabled_amd:
215 case nir_intrinsic_load_prim_xfb_query_enabled_amd:
216 case nir_intrinsic_load_merged_wave_info_amd:
217 case nir_intrinsic_load_clamp_vertex_color_amd:
218 case nir_intrinsic_load_cull_front_face_enabled_amd:
219 case nir_intrinsic_load_cull_back_face_enabled_amd:
220 case nir_intrinsic_load_cull_ccw_amd:
221 case nir_intrinsic_load_cull_small_primitives_enabled_amd:
222 case nir_intrinsic_load_cull_any_enabled_amd:
223 case nir_intrinsic_load_cull_small_prim_precision_amd:
224 case nir_intrinsic_load_user_data_amd:
225 case nir_intrinsic_load_force_vrs_rates_amd:
226 case nir_intrinsic_load_tess_level_inner_default:
227 case nir_intrinsic_load_tess_level_outer_default:
228 case nir_intrinsic_load_scalar_arg_amd:
229 case nir_intrinsic_load_smem_amd:
230 case nir_intrinsic_load_resume_shader_address_amd:
231 case nir_intrinsic_load_reloc_const_intel:
232 case nir_intrinsic_load_btd_global_arg_addr_intel:
233 case nir_intrinsic_load_btd_local_arg_addr_intel:
234 case nir_intrinsic_load_mesh_inline_data_intel:
235 case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
236 case nir_intrinsic_load_lshs_vertex_stride_amd:
237 case nir_intrinsic_load_esgs_vertex_stride_amd:
238 case nir_intrinsic_load_hs_out_patch_data_offset_amd:
239 case nir_intrinsic_load_clip_half_line_width_amd:
240 case nir_intrinsic_load_num_vertices_per_primitive_amd:
241 case nir_intrinsic_load_streamout_buffer_amd:
242 case nir_intrinsic_load_ordered_id_amd:
243 case nir_intrinsic_load_gs_wave_id_amd:
244 case nir_intrinsic_load_provoking_vtx_in_prim_amd:
245 case nir_intrinsic_load_lds_ngg_scratch_base_amd:
246 case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
247 case nir_intrinsic_load_btd_shader_type_intel:
248 case nir_intrinsic_load_base_global_invocation_id:
249 case nir_intrinsic_load_base_workgroup_id:
250 case nir_intrinsic_load_alpha_reference_amd:
251 case nir_intrinsic_load_ubo_uniform_block_intel:
252 case nir_intrinsic_load_ssbo_uniform_block_intel:
253 case nir_intrinsic_load_shared_uniform_block_intel:
254 case nir_intrinsic_load_barycentric_optimize_amd:
255 case nir_intrinsic_load_poly_line_smooth_enabled:
256 case nir_intrinsic_load_rasterization_primitive_amd:
257 case nir_intrinsic_unit_test_uniform_amd:
258 case nir_intrinsic_load_global_constant_uniform_block_intel:
259 case nir_intrinsic_load_debug_log_desc_amd:
260 case nir_intrinsic_load_xfb_state_address_gfx12_amd:
261 case nir_intrinsic_cmat_length:
262 case nir_intrinsic_load_vs_primitive_stride_ir3:
263 case nir_intrinsic_load_vs_vertex_stride_ir3:
264 case nir_intrinsic_load_hs_patch_stride_ir3:
265 case nir_intrinsic_load_tess_factor_base_ir3:
266 case nir_intrinsic_load_tess_param_base_ir3:
267 case nir_intrinsic_load_primitive_location_ir3:
268 case nir_intrinsic_preamble_start_ir3:
269 case nir_intrinsic_optimization_barrier_sgpr_amd:
270 case nir_intrinsic_load_printf_buffer_address:
271 case nir_intrinsic_load_printf_base_identifier:
272 case nir_intrinsic_load_core_id_agx:
273 case nir_intrinsic_load_samples_log2_agx:
274 case nir_intrinsic_load_active_subgroup_count_agx:
275 case nir_intrinsic_load_constant_base_ptr:
276 is_divergent = false;
277 break;
278
279 /* This is divergent because it specifically loads sequential values into
280 * successive SIMD lanes.
281 */
282 case nir_intrinsic_load_global_block_intel:
283 is_divergent = true;
284 break;
285
286 case nir_intrinsic_decl_reg:
287 is_divergent = nir_intrinsic_divergent(instr);
288 break;
289
290 /* Intrinsics with divergence depending on shader stage and hardware */
291 case nir_intrinsic_load_shader_record_ptr:
292 is_divergent = !(options & nir_divergence_shader_record_ptr_uniform);
293 break;
294 case nir_intrinsic_load_frag_shading_rate:
295 is_divergent = !(options & nir_divergence_single_frag_shading_rate_per_subgroup);
296 break;
297 case nir_intrinsic_load_input:
298 case nir_intrinsic_load_per_primitive_input:
299 is_divergent = instr->src[0].ssa->divergent;
300
301 if (stage == MESA_SHADER_FRAGMENT) {
302 is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
303 } else if (stage == MESA_SHADER_TESS_EVAL) {
304 /* Patch input loads are uniform between vertices of the same primitive. */
305 if (state->vertex_divergence)
306 is_divergent = false;
307 else
308 is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
309 } else {
310 is_divergent = true;
311 }
312 break;
313 case nir_intrinsic_load_per_vertex_input:
314 is_divergent = instr->src[0].ssa->divergent ||
315 instr->src[1].ssa->divergent;
316 if (stage == MESA_SHADER_TESS_CTRL)
317 is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup);
318 if (stage == MESA_SHADER_TESS_EVAL)
319 is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
320 else
321 is_divergent = true;
322 break;
323 case nir_intrinsic_load_input_vertex:
324 is_divergent = instr->src[1].ssa->divergent;
325 assert(stage == MESA_SHADER_FRAGMENT);
326 is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
327 break;
328 case nir_intrinsic_load_output:
329 is_divergent = instr->src[0].ssa->divergent;
330 switch (stage) {
331 case MESA_SHADER_TESS_CTRL:
332 is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup);
333 break;
334 case MESA_SHADER_FRAGMENT:
335 is_divergent = true;
336 break;
337 case MESA_SHADER_TASK:
338 case MESA_SHADER_MESH:
339 /* NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs).
340 * Divergent if src[0] is, so nothing else to do.
341 */
342 break;
343 default:
344 unreachable("Invalid stage for load_output");
345 }
346 break;
347 case nir_intrinsic_load_per_vertex_output:
348 /* TCS and NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs). */
349 assert(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_MESH);
350 is_divergent = instr->src[0].ssa->divergent ||
351 instr->src[1].ssa->divergent ||
352 (stage == MESA_SHADER_TESS_CTRL &&
353 !(options & nir_divergence_single_patch_per_tcs_subgroup));
354 break;
355 case nir_intrinsic_load_per_primitive_output:
356 /* NV_mesh_shader only (EXT_mesh_shader does not allow loading outputs). */
357 assert(stage == MESA_SHADER_MESH);
358 is_divergent = instr->src[0].ssa->divergent ||
359 instr->src[1].ssa->divergent;
360 break;
361 case nir_intrinsic_load_layer_id:
362 case nir_intrinsic_load_front_face:
363 case nir_intrinsic_load_back_face_agx:
364 assert(stage == MESA_SHADER_FRAGMENT || state->shader->info.internal);
365 is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
366 break;
367 case nir_intrinsic_load_view_index:
368 assert(stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL);
369 if (options & nir_divergence_view_index_uniform)
370 is_divergent = false;
371 else if (stage == MESA_SHADER_FRAGMENT)
372 is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
373 else
374 is_divergent = true;
375 break;
376 case nir_intrinsic_load_fs_input_interp_deltas:
377 assert(stage == MESA_SHADER_FRAGMENT);
378 is_divergent = instr->src[0].ssa->divergent;
379 is_divergent |= !(options & nir_divergence_single_prim_per_subgroup);
380 break;
381 case nir_intrinsic_load_instance_id:
382 is_divergent = !state->vertex_divergence;
383 break;
384 case nir_intrinsic_load_primitive_id:
385 if (stage == MESA_SHADER_FRAGMENT)
386 is_divergent = !(options & nir_divergence_single_prim_per_subgroup);
387 else if (stage == MESA_SHADER_TESS_CTRL)
388 is_divergent = !state->vertex_divergence &&
389 !(options & nir_divergence_single_patch_per_tcs_subgroup);
390 else if (stage == MESA_SHADER_TESS_EVAL)
391 is_divergent = !state->vertex_divergence &&
392 !(options & nir_divergence_single_patch_per_tes_subgroup);
393 else if (stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_VERTEX)
394 is_divergent = !state->vertex_divergence;
395 else if (stage == MESA_SHADER_ANY_HIT ||
396 stage == MESA_SHADER_CLOSEST_HIT ||
397 stage == MESA_SHADER_INTERSECTION)
398 is_divergent = true;
399 else
400 unreachable("Invalid stage for load_primitive_id");
401 break;
402 case nir_intrinsic_load_tess_level_inner:
403 case nir_intrinsic_load_tess_level_outer:
404 if (stage == MESA_SHADER_TESS_CTRL)
405 is_divergent = !(options & nir_divergence_single_patch_per_tcs_subgroup);
406 else if (stage == MESA_SHADER_TESS_EVAL)
407 is_divergent = !(options & nir_divergence_single_patch_per_tes_subgroup);
408 else
409 unreachable("Invalid stage for load_primitive_tess_level_*");
410 break;
411
412 case nir_intrinsic_load_workgroup_index:
413 case nir_intrinsic_load_workgroup_id:
414 assert(gl_shader_stage_uses_workgroup(stage) || stage == MESA_SHADER_TESS_CTRL);
415 if (stage == MESA_SHADER_COMPUTE)
416 is_divergent |= (options & nir_divergence_multiple_workgroup_per_compute_subgroup);
417 break;
418
419 /* Clustered reductions are uniform if cluster_size == subgroup_size, or if
420 * the source is uniform and the operation is invariant.
421 * Inclusive scans are uniform only if the source is uniform
422 * and the operation is invariant.
423 */
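/* For instance (an illustrative sketch, assuming %v is uniform):
 *
 *    %r = reduce %v (op=iand, cluster_size=4)  ; uniform: iand of equal values
 *    %s = reduce %v (op=iadd, cluster_size=4)  ; divergent: the sum depends on
 *                                              ;   how many lanes are active per cluster
 *    %t = inclusive_scan %v (op=iadd)          ; divergent: each lane gets a
 *                                              ;   different prefix sum
 */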
424 case nir_intrinsic_reduce:
425 if (nir_intrinsic_cluster_size(instr) == 0) {
426 /* Cluster size of 0 means the subgroup size.
427 * This is uniform within a subgroup, but divergent between
428 * vertices of the same primitive because they may be in
429 * different subgroups.
430 */
431 is_divergent = state->vertex_divergence;
432 break;
433 }
434 FALLTHROUGH;
435 case nir_intrinsic_inclusive_scan:
436 case nir_intrinsic_inclusive_scan_clusters_ir3: {
437 nir_op op = nir_intrinsic_reduction_op(instr);
438 is_divergent = instr->src[0].ssa->divergent || state->vertex_divergence;
439 if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
440 op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
441 op != nir_op_iand && op != nir_op_ior)
442 is_divergent = true;
443 break;
444 }
445
446 case nir_intrinsic_reduce_clusters_ir3:
447 /* This reduces the last invocations in all 8-wide clusters. It should
448 * behave the same as reduce with cluster_size == subgroup_size.
449 */
450 is_divergent = state->vertex_divergence;
451 break;
452
453 case nir_intrinsic_load_ubo:
454 case nir_intrinsic_load_ubo_vec4:
455 case nir_intrinsic_ldc_nv:
456 case nir_intrinsic_ldcx_nv:
457 is_divergent = (instr->src[0].ssa->divergent && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
458 instr->src[1].ssa->divergent;
459 break;
460
461 case nir_intrinsic_load_ssbo:
462 case nir_intrinsic_load_ssbo_ir3:
463 is_divergent = (instr->src[0].ssa->divergent && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
464 instr->src[1].ssa->divergent ||
465 load_may_tear(state->shader, instr);
466 break;
467
468 case nir_intrinsic_load_shared:
469 case nir_intrinsic_load_shared_ir3:
470 is_divergent = instr->src[0].ssa->divergent ||
471 (state->shader->options->divergence_analysis_options &
472 nir_divergence_uniform_load_tears);
473 break;
474
475 case nir_intrinsic_load_global:
476 case nir_intrinsic_load_global_2x32:
477 case nir_intrinsic_load_global_ir3:
478 case nir_intrinsic_load_deref: {
479 if (load_may_tear(state->shader, instr)) {
480 is_divergent = true;
481 break;
482 }
483
484 unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
485 for (unsigned i = 0; i < num_srcs; i++) {
486 if (instr->src[i].ssa->divergent) {
487 is_divergent = true;
488 break;
489 }
490 }
491 break;
492 }
493
494 case nir_intrinsic_get_ssbo_size:
495 case nir_intrinsic_deref_buffer_array_length:
496 is_divergent = instr->src[0].ssa->divergent && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
497 break;
498
499 case nir_intrinsic_image_samples_identical:
500 case nir_intrinsic_image_deref_samples_identical:
501 case nir_intrinsic_bindless_image_samples_identical:
502 case nir_intrinsic_image_fragment_mask_load_amd:
503 case nir_intrinsic_image_deref_fragment_mask_load_amd:
504 case nir_intrinsic_bindless_image_fragment_mask_load_amd:
505 is_divergent = (instr->src[0].ssa->divergent && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
506 instr->src[1].ssa->divergent ||
507 load_may_tear(state->shader, instr);
508 break;
509
510 case nir_intrinsic_image_texel_address:
511 case nir_intrinsic_image_deref_texel_address:
512 case nir_intrinsic_bindless_image_texel_address:
513 is_divergent = (instr->src[0].ssa->divergent && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
514 instr->src[1].ssa->divergent || instr->src[2].ssa->divergent;
515 break;
516
517 case nir_intrinsic_image_load:
518 case nir_intrinsic_image_deref_load:
519 case nir_intrinsic_bindless_image_load:
520 case nir_intrinsic_image_sparse_load:
521 case nir_intrinsic_image_deref_sparse_load:
522 case nir_intrinsic_bindless_image_sparse_load:
523 is_divergent = (instr->src[0].ssa->divergent && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) ||
524 instr->src[1].ssa->divergent || instr->src[2].ssa->divergent || instr->src[3].ssa->divergent ||
525 load_may_tear(state->shader, instr);
526 break;
527
528 case nir_intrinsic_optimization_barrier_vgpr_amd:
529 is_divergent = instr->src[0].ssa->divergent;
530 break;
531
532 /* Intrinsics with divergence depending on sources */
533 case nir_intrinsic_convert_alu_types:
534 case nir_intrinsic_ddx:
535 case nir_intrinsic_ddx_fine:
536 case nir_intrinsic_ddx_coarse:
537 case nir_intrinsic_ddy:
538 case nir_intrinsic_ddy_fine:
539 case nir_intrinsic_ddy_coarse:
540 case nir_intrinsic_ballot_bitfield_extract:
541 case nir_intrinsic_ballot_find_lsb:
542 case nir_intrinsic_ballot_find_msb:
543 case nir_intrinsic_ballot_bit_count_reduce:
544 case nir_intrinsic_rotate:
545 case nir_intrinsic_shuffle_xor:
546 case nir_intrinsic_shuffle_up:
547 case nir_intrinsic_shuffle_down:
548 case nir_intrinsic_quad_broadcast:
549 case nir_intrinsic_quad_swap_horizontal:
550 case nir_intrinsic_quad_swap_vertical:
551 case nir_intrinsic_quad_swap_diagonal:
552 case nir_intrinsic_quad_vote_any:
553 case nir_intrinsic_quad_vote_all:
554 case nir_intrinsic_load_shared2_amd:
555 case nir_intrinsic_load_global_constant:
556 case nir_intrinsic_load_global_amd:
557 case nir_intrinsic_load_uniform:
558 case nir_intrinsic_load_constant:
559 case nir_intrinsic_load_sample_pos_from_id:
560 case nir_intrinsic_load_kernel_input:
561 case nir_intrinsic_load_task_payload:
562 case nir_intrinsic_load_buffer_amd:
563 case nir_intrinsic_load_typed_buffer_amd:
564 case nir_intrinsic_image_levels:
565 case nir_intrinsic_image_deref_levels:
566 case nir_intrinsic_bindless_image_levels:
567 case nir_intrinsic_image_samples:
568 case nir_intrinsic_image_deref_samples:
569 case nir_intrinsic_bindless_image_samples:
570 case nir_intrinsic_image_size:
571 case nir_intrinsic_image_deref_size:
572 case nir_intrinsic_bindless_image_size:
573 case nir_intrinsic_image_descriptor_amd:
574 case nir_intrinsic_image_deref_descriptor_amd:
575 case nir_intrinsic_bindless_image_descriptor_amd:
576 case nir_intrinsic_strict_wqm_coord_amd:
577 case nir_intrinsic_copy_deref:
578 case nir_intrinsic_vulkan_resource_index:
579 case nir_intrinsic_vulkan_resource_reindex:
580 case nir_intrinsic_load_vulkan_descriptor:
581 case nir_intrinsic_atomic_counter_read:
582 case nir_intrinsic_atomic_counter_read_deref:
583 case nir_intrinsic_quad_swizzle_amd:
584 case nir_intrinsic_masked_swizzle_amd:
585 case nir_intrinsic_is_sparse_texels_resident:
586 case nir_intrinsic_is_sparse_resident_zink:
587 case nir_intrinsic_sparse_residency_code_and:
588 case nir_intrinsic_bvh64_intersect_ray_amd:
589 case nir_intrinsic_image_deref_load_param_intel:
590 case nir_intrinsic_image_load_raw_intel:
591 case nir_intrinsic_get_ubo_size:
592 case nir_intrinsic_load_ssbo_address:
593 case nir_intrinsic_load_global_constant_bounded:
594 case nir_intrinsic_load_global_constant_offset:
595 case nir_intrinsic_load_reg:
596 case nir_intrinsic_load_constant_agx:
597 case nir_intrinsic_load_reg_indirect:
598 case nir_intrinsic_load_const_ir3:
599 case nir_intrinsic_load_frag_size_ir3:
600 case nir_intrinsic_load_frag_offset_ir3:
601 case nir_intrinsic_bindless_resource_ir3: {
602 unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
603 for (unsigned i = 0; i < num_srcs; i++) {
604 if (instr->src[i].ssa->divergent) {
605 is_divergent = true;
606 break;
607 }
608 }
609 break;
610 }
611
612 case nir_intrinsic_resource_intel:
613 /* Not having the non_uniform flag with divergent sources is undefined
614 * behavior. The Intel driver defines it to pick the lowest-numbered live
615 * SIMD lane (via emit_uniformize).
616 */
617 if ((nir_intrinsic_resource_access_intel(instr) &
618 nir_resource_intel_non_uniform) != 0) {
619 unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
620 for (unsigned i = 0; i < num_srcs; i++) {
621 if (instr->src[i].ssa->divergent) {
622 is_divergent = true;
623 break;
624 }
625 }
626 }
627 break;
628
629 case nir_intrinsic_shuffle:
630 is_divergent = instr->src[0].ssa->divergent &&
631 instr->src[1].ssa->divergent;
632 break;
633
634 /* Intrinsics which are always divergent */
635 case nir_intrinsic_inverse_ballot:
636 case nir_intrinsic_load_color0:
637 case nir_intrinsic_load_color1:
638 case nir_intrinsic_load_param:
639 case nir_intrinsic_load_sample_id:
640 case nir_intrinsic_load_sample_id_no_per_sample:
641 case nir_intrinsic_load_sample_mask_in:
642 case nir_intrinsic_load_interpolated_input:
643 case nir_intrinsic_load_point_coord_maybe_flipped:
644 case nir_intrinsic_load_barycentric_pixel:
645 case nir_intrinsic_load_barycentric_centroid:
646 case nir_intrinsic_load_barycentric_sample:
647 case nir_intrinsic_load_barycentric_model:
648 case nir_intrinsic_load_barycentric_at_sample:
649 case nir_intrinsic_load_barycentric_at_offset:
650 case nir_intrinsic_load_barycentric_at_offset_nv:
651 case nir_intrinsic_load_barycentric_coord_pixel:
652 case nir_intrinsic_load_barycentric_coord_centroid:
653 case nir_intrinsic_load_barycentric_coord_sample:
654 case nir_intrinsic_load_barycentric_coord_at_sample:
655 case nir_intrinsic_load_barycentric_coord_at_offset:
656 case nir_intrinsic_load_persp_center_rhw_ir3:
657 case nir_intrinsic_interp_deref_at_offset:
658 case nir_intrinsic_interp_deref_at_sample:
659 case nir_intrinsic_interp_deref_at_centroid:
660 case nir_intrinsic_interp_deref_at_vertex:
661 case nir_intrinsic_load_tess_coord:
662 case nir_intrinsic_load_tess_coord_xy:
663 case nir_intrinsic_load_point_coord:
664 case nir_intrinsic_load_line_coord:
665 case nir_intrinsic_load_frag_coord:
666 case nir_intrinsic_load_frag_coord_zw:
667 case nir_intrinsic_load_frag_coord_unscaled_ir3:
668 case nir_intrinsic_load_pixel_coord:
669 case nir_intrinsic_load_fully_covered:
670 case nir_intrinsic_load_sample_pos:
671 case nir_intrinsic_load_sample_pos_or_center:
672 case nir_intrinsic_load_vertex_id_zero_base:
673 case nir_intrinsic_load_vertex_id:
674 case nir_intrinsic_load_invocation_id:
675 case nir_intrinsic_load_local_invocation_id:
676 case nir_intrinsic_load_local_invocation_index:
677 case nir_intrinsic_load_global_invocation_id:
678 case nir_intrinsic_load_global_invocation_index:
679 case nir_intrinsic_load_subgroup_invocation:
680 case nir_intrinsic_load_subgroup_eq_mask:
681 case nir_intrinsic_load_subgroup_ge_mask:
682 case nir_intrinsic_load_subgroup_gt_mask:
683 case nir_intrinsic_load_subgroup_le_mask:
684 case nir_intrinsic_load_subgroup_lt_mask:
685 case nir_intrinsic_load_helper_invocation:
686 case nir_intrinsic_is_helper_invocation:
687 case nir_intrinsic_load_scratch:
688 case nir_intrinsic_deref_atomic:
689 case nir_intrinsic_deref_atomic_swap:
690 case nir_intrinsic_ssbo_atomic:
691 case nir_intrinsic_ssbo_atomic_swap:
692 case nir_intrinsic_ssbo_atomic_ir3:
693 case nir_intrinsic_ssbo_atomic_swap_ir3:
694 case nir_intrinsic_image_deref_atomic:
695 case nir_intrinsic_image_deref_atomic_swap:
696 case nir_intrinsic_image_atomic:
697 case nir_intrinsic_image_atomic_swap:
698 case nir_intrinsic_bindless_image_atomic:
699 case nir_intrinsic_bindless_image_atomic_swap:
700 case nir_intrinsic_shared_atomic:
701 case nir_intrinsic_shared_atomic_swap:
702 case nir_intrinsic_task_payload_atomic:
703 case nir_intrinsic_task_payload_atomic_swap:
704 case nir_intrinsic_global_atomic:
705 case nir_intrinsic_global_atomic_swap:
706 case nir_intrinsic_global_atomic_amd:
707 case nir_intrinsic_global_atomic_swap_amd:
708 case nir_intrinsic_global_atomic_2x32:
709 case nir_intrinsic_global_atomic_swap_2x32:
710 case nir_intrinsic_global_atomic_ir3:
711 case nir_intrinsic_global_atomic_swap_ir3:
712 case nir_intrinsic_atomic_counter_add:
713 case nir_intrinsic_atomic_counter_min:
714 case nir_intrinsic_atomic_counter_max:
715 case nir_intrinsic_atomic_counter_and:
716 case nir_intrinsic_atomic_counter_or:
717 case nir_intrinsic_atomic_counter_xor:
718 case nir_intrinsic_atomic_counter_inc:
719 case nir_intrinsic_atomic_counter_pre_dec:
720 case nir_intrinsic_atomic_counter_post_dec:
721 case nir_intrinsic_atomic_counter_exchange:
722 case nir_intrinsic_atomic_counter_comp_swap:
723 case nir_intrinsic_atomic_counter_add_deref:
724 case nir_intrinsic_atomic_counter_min_deref:
725 case nir_intrinsic_atomic_counter_max_deref:
726 case nir_intrinsic_atomic_counter_and_deref:
727 case nir_intrinsic_atomic_counter_or_deref:
728 case nir_intrinsic_atomic_counter_xor_deref:
729 case nir_intrinsic_atomic_counter_inc_deref:
730 case nir_intrinsic_atomic_counter_pre_dec_deref:
731 case nir_intrinsic_atomic_counter_post_dec_deref:
732 case nir_intrinsic_atomic_counter_exchange_deref:
733 case nir_intrinsic_atomic_counter_comp_swap_deref:
734 case nir_intrinsic_exclusive_scan:
735 case nir_intrinsic_exclusive_scan_clusters_ir3:
736 case nir_intrinsic_ballot_bit_count_exclusive:
737 case nir_intrinsic_ballot_bit_count_inclusive:
738 case nir_intrinsic_write_invocation_amd:
739 case nir_intrinsic_mbcnt_amd:
740 case nir_intrinsic_lane_permute_16_amd:
741 case nir_intrinsic_dpp16_shift_amd:
742 case nir_intrinsic_elect:
743 case nir_intrinsic_elect_any_ir3:
744 case nir_intrinsic_load_tlb_color_brcm:
745 case nir_intrinsic_load_tess_rel_patch_id_amd:
746 case nir_intrinsic_load_gs_vertex_offset_amd:
747 case nir_intrinsic_is_subgroup_invocation_lt_amd:
748 case nir_intrinsic_load_packed_passthrough_primitive_amd:
749 case nir_intrinsic_load_initial_edgeflags_amd:
750 case nir_intrinsic_gds_atomic_add_amd:
751 case nir_intrinsic_load_rt_arg_scratch_offset_amd:
752 case nir_intrinsic_load_intersection_opaque_amd:
753 case nir_intrinsic_load_vector_arg_amd:
754 case nir_intrinsic_load_btd_stack_id_intel:
755 case nir_intrinsic_load_topology_id_intel:
756 case nir_intrinsic_load_scratch_base_ptr:
757 case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
758 case nir_intrinsic_ordered_add_loop_gfx12_amd:
759 case nir_intrinsic_xfb_counter_sub_gfx11_amd:
760 case nir_intrinsic_unit_test_divergent_amd:
761 case nir_intrinsic_load_stack:
762 case nir_intrinsic_load_ray_launch_id:
763 case nir_intrinsic_load_ray_instance_custom_index:
764 case nir_intrinsic_load_ray_geometry_index:
765 case nir_intrinsic_load_ray_world_direction:
766 case nir_intrinsic_load_ray_world_origin:
767 case nir_intrinsic_load_ray_object_origin:
768 case nir_intrinsic_load_ray_object_direction:
769 case nir_intrinsic_load_ray_t_min:
770 case nir_intrinsic_load_ray_t_max:
771 case nir_intrinsic_load_ray_object_to_world:
772 case nir_intrinsic_load_ray_world_to_object:
773 case nir_intrinsic_load_ray_hit_kind:
774 case nir_intrinsic_load_ray_flags:
775 case nir_intrinsic_load_cull_mask:
776 case nir_intrinsic_load_sysval_nv:
777 case nir_intrinsic_emit_vertex_nv:
778 case nir_intrinsic_end_primitive_nv:
779 case nir_intrinsic_report_ray_intersection:
780 case nir_intrinsic_rq_proceed:
781 case nir_intrinsic_rq_load:
782 case nir_intrinsic_load_ray_triangle_vertex_positions:
783 case nir_intrinsic_cmat_extract:
784 case nir_intrinsic_cmat_muladd_amd:
785 case nir_intrinsic_dpas_intel:
786 case nir_intrinsic_isberd_nv:
787 case nir_intrinsic_al2p_nv:
788 case nir_intrinsic_ald_nv:
789 case nir_intrinsic_ipa_nv:
790 case nir_intrinsic_ldtram_nv:
791 case nir_intrinsic_printf:
792 case nir_intrinsic_load_gs_header_ir3:
793 case nir_intrinsic_load_tcs_header_ir3:
794 case nir_intrinsic_load_rel_patch_id_ir3:
795 case nir_intrinsic_brcst_active_ir3:
796 case nir_intrinsic_load_helper_op_id_agx:
797 case nir_intrinsic_load_helper_arg_lo_agx:
798 case nir_intrinsic_load_helper_arg_hi_agx:
799 case nir_intrinsic_stack_map_agx:
800 case nir_intrinsic_stack_unmap_agx:
801 case nir_intrinsic_load_exported_agx:
802 case nir_intrinsic_load_local_pixel_agx:
803 case nir_intrinsic_load_coefficients_agx:
804 case nir_intrinsic_load_active_subgroup_invocation_agx:
805 case nir_intrinsic_load_sample_mask:
806 case nir_intrinsic_quad_ballot_agx:
807 case nir_intrinsic_load_agx:
808 is_divergent = true;
809 break;
810
811 default:
812 #ifdef NDEBUG
813 is_divergent = true;
814 break;
815 #else
816 nir_print_instr(&instr->instr, stderr);
817 unreachable("\nNIR divergence analysis: Unhandled intrinsic.");
818 #endif
819 }
820
821 instr->def.divergent = is_divergent;
822 return is_divergent;
823 }
824
825 static bool
826 visit_tex(nir_tex_instr *instr, struct divergence_state *state)
827 {
828 if (instr->def.divergent)
829 return false;
830
831 bool is_divergent = false;
832
833 for (unsigned i = 0; i < instr->num_srcs; i++) {
834 switch (instr->src[i].src_type) {
835 case nir_tex_src_sampler_deref:
836 case nir_tex_src_sampler_handle:
837 case nir_tex_src_sampler_offset:
838 is_divergent |= instr->src[i].src.ssa->divergent &&
839 instr->sampler_non_uniform;
840 break;
841 case nir_tex_src_texture_deref:
842 case nir_tex_src_texture_handle:
843 case nir_tex_src_texture_offset:
844 is_divergent |= instr->src[i].src.ssa->divergent &&
845 instr->texture_non_uniform;
846 break;
847 default:
848 is_divergent |= instr->src[i].src.ssa->divergent;
849 break;
850 }
851 }
852
853 instr->def.divergent = is_divergent;
854 return is_divergent;
855 }
856
857 static bool
858 visit_def(nir_def *def, struct divergence_state *state)
859 {
860 return false;
861 }
862
863 static bool
864 nir_variable_mode_is_uniform(nir_variable_mode mode)
865 {
866 switch (mode) {
867 case nir_var_uniform:
868 case nir_var_mem_ubo:
869 case nir_var_mem_ssbo:
870 case nir_var_mem_shared:
871 case nir_var_mem_task_payload:
872 case nir_var_mem_global:
873 case nir_var_image:
874 return true;
875 default:
876 return false;
877 }
878 }
879
880 static bool
881 nir_variable_is_uniform(nir_shader *shader, nir_variable *var,
882 struct divergence_state *state)
883 {
884 if (nir_variable_mode_is_uniform(var->data.mode))
885 return true;
886
887 /* Handle system value variables. */
888 if (var->data.mode == nir_var_system_value) {
889 /* Fake the instruction to reuse visit_intrinsic for all sysvals. */
890 nir_intrinsic_instr fake_instr;
891
892 memset(&fake_instr, 0, sizeof(fake_instr));
893 fake_instr.intrinsic =
894 nir_intrinsic_from_system_value(var->data.location);
895
896 visit_intrinsic(&fake_instr, state);
897 return !fake_instr.def.divergent;
898 }
899
900 nir_divergence_options options = shader->options->divergence_analysis_options;
901 gl_shader_stage stage = shader->info.stage;
902
903 if (stage == MESA_SHADER_FRAGMENT &&
904 (options & nir_divergence_single_prim_per_subgroup) &&
905 var->data.mode == nir_var_shader_in &&
906 var->data.interpolation == INTERP_MODE_FLAT)
907 return true;
908
909 if (stage == MESA_SHADER_TESS_CTRL &&
910 (options & nir_divergence_single_patch_per_tcs_subgroup) &&
911 var->data.mode == nir_var_shader_out && var->data.patch)
912 return true;
913
914 if (stage == MESA_SHADER_TESS_EVAL &&
915 (options & nir_divergence_single_patch_per_tes_subgroup) &&
916 var->data.mode == nir_var_shader_in && var->data.patch)
917 return true;
918
919 return false;
920 }
921
922 static bool
923 visit_deref(nir_shader *shader, nir_deref_instr *deref,
924 struct divergence_state *state)
925 {
926 if (deref->def.divergent)
927 return false;
928
929 bool is_divergent = false;
930 switch (deref->deref_type) {
931 case nir_deref_type_var:
932 is_divergent = !nir_variable_is_uniform(shader, deref->var, state);
933 break;
934 case nir_deref_type_array:
935 case nir_deref_type_ptr_as_array:
936 is_divergent = deref->arr.index.ssa->divergent;
937 FALLTHROUGH;
938 case nir_deref_type_struct:
939 case nir_deref_type_array_wildcard:
940 is_divergent |= deref->parent.ssa->divergent;
941 break;
942 case nir_deref_type_cast:
943 is_divergent = !nir_variable_mode_is_uniform(deref->var->data.mode) ||
944 deref->parent.ssa->divergent;
945 break;
946 }
947
948 deref->def.divergent = is_divergent;
949 return is_divergent;
950 }
951
952 static bool
953 visit_jump(nir_jump_instr *jump, struct divergence_state *state)
954 {
955 switch (jump->type) {
956 case nir_jump_continue:
957 if (state->divergent_loop_continue)
958 return false;
959 if (state->divergent_loop_cf)
960 state->divergent_loop_continue = true;
961 return state->divergent_loop_continue;
962 case nir_jump_break:
963 if (state->divergent_loop_break)
964 return false;
965 if (state->divergent_loop_cf)
966 state->divergent_loop_break = true;
967 return state->divergent_loop_break;
968 case nir_jump_halt:
969 /* This totally kills invocations so it doesn't add divergence */
970 break;
971 case nir_jump_return:
972 unreachable("NIR divergence analysis: Unsupported return instruction.");
973 break;
974 case nir_jump_goto:
975 case nir_jump_goto_if:
976 unreachable("NIR divergence analysis: Unsupported goto_if instruction.");
977 break;
978 }
979 return false;
980 }
981
982 static bool
983 set_ssa_def_not_divergent(nir_def *def, UNUSED void *_state)
984 {
985 def->divergent = false;
986 return true;
987 }
988
989 static bool
990 update_instr_divergence(nir_instr *instr, struct divergence_state *state)
991 {
992 switch (instr->type) {
993 case nir_instr_type_alu:
994 return visit_alu(nir_instr_as_alu(instr), state);
995 case nir_instr_type_intrinsic:
996 return visit_intrinsic(nir_instr_as_intrinsic(instr), state);
997 case nir_instr_type_tex:
998 return visit_tex(nir_instr_as_tex(instr), state);
999 case nir_instr_type_load_const:
1000 return visit_def(&nir_instr_as_load_const(instr)->def, state);
1001 case nir_instr_type_undef:
1002 return visit_def(&nir_instr_as_undef(instr)->def, state);
1003 case nir_instr_type_deref:
1004 return visit_deref(state->shader, nir_instr_as_deref(instr), state);
1005 case nir_instr_type_debug_info:
1006 return false;
1007 case nir_instr_type_jump:
1008 case nir_instr_type_phi:
1009 case nir_instr_type_call:
1010 case nir_instr_type_parallel_copy:
1011 default:
1012 unreachable("NIR divergence analysis: Unsupported instruction type.");
1013 }
1014 }
1015
1016 static bool
1017 visit_block(nir_block *block, struct divergence_state *state)
1018 {
1019 bool has_changed = false;
1020
1021 nir_foreach_instr(instr, block) {
1022 /* phis are handled when processing the branches */
1023 if (instr->type == nir_instr_type_phi)
1024 continue;
1025
1026 if (state->first_visit)
1027 nir_foreach_def(instr, set_ssa_def_not_divergent, NULL);
1028
1029 if (instr->type == nir_instr_type_jump) {
1030 has_changed |= visit_jump(nir_instr_as_jump(instr), state);
1031 } else {
1032 has_changed |= update_instr_divergence(instr, state);
1033 }
1034 }
1035
1036 bool divergent = state->divergent_loop_cf ||
1037 state->divergent_loop_continue ||
1038 state->divergent_loop_break;
1039 if (divergent != block->divergent) {
1040 block->divergent = divergent;
1041 has_changed = true;
1042 }
1043
1044 return has_changed;
1045 }
1046
1047 /* There are 3 types of phi instructions:
1048 * (1) gamma: represent the joining point of different paths
1049 * created by an “if-then-else” branch.
1050 * The resulting value is divergent if the branch condition
1051 * or any of the source values is divergent. */
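/* A small illustration (NIR-like pseudocode, names are made up):
 *
 *    if (%cond) { %a = ... } else { %b = ... }
 *    %c = phi %a, %b
 *
 * %c is divergent if %a, %b or %cond is divergent: with a divergent %cond,
 * invocations arrive from different predecessors and pick different sources.
 */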
1052 static bool
1053 visit_if_merge_phi(nir_phi_instr *phi, bool if_cond_divergent)
1054 {
1055 if (phi->def.divergent)
1056 return false;
1057
1058 unsigned defined_srcs = 0;
1059 nir_foreach_phi_src(src, phi) {
1060 /* if any source value is divergent, the resulting value is divergent */
1061 if (src->src.ssa->divergent) {
1062 phi->def.divergent = true;
1063 return true;
1064 }
1065 if (src->src.ssa->parent_instr->type != nir_instr_type_undef) {
1066 defined_srcs++;
1067 }
1068 }
1069
1070 /* if the condition is divergent and at least two sources are defined, the definition is divergent */
1071 if (defined_srcs > 1 && if_cond_divergent) {
1072 phi->def.divergent = true;
1073 return true;
1074 }
1075
1076 return false;
1077 }
1078
1079 /* There are 3 types of phi instructions:
1080 * (2) mu: exist only at loop headers and
1081 * merge initial and loop-carried values.
1082 * The resulting value is divergent if any source value
1083 * is divergent or a divergent loop continue condition
1084 * is associated with a different ssa-def. */
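/* A small illustration (NIR-like pseudocode, names are made up):
 *
 *    loop {
 *       %i = phi %zero (preheader), %i (continue path), %i_next (loop end)
 *       ...
 *       if (%divergent_cond)
 *          continue;
 *       %i_next = iadd %i, %one
 *    }
 *
 * Under a divergent continue the loop-carried values (%i and %i_next) are
 * different ssa-defs, so %i is marked divergent even though each individual
 * source is uniform: only some invocations execute the iadd per iteration.
 */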
1085 static bool
1086 visit_loop_header_phi(nir_phi_instr *phi, nir_block *preheader, bool divergent_continue)
1087 {
1088 if (phi->def.divergent)
1089 return false;
1090
1091 nir_def *same = NULL;
1092 nir_foreach_phi_src(src, phi) {
1093 /* if any source value is divergent, the resulting value is divergent */
1094 if (src->src.ssa->divergent) {
1095 phi->def.divergent = true;
1096 return true;
1097 }
1098 /* if this loop is uniform, we're done here */
1099 if (!divergent_continue)
1100 continue;
1101 /* skip the loop preheader */
1102 if (src->pred == preheader)
1103 continue;
1104 /* skip undef values */
1105 if (nir_src_is_undef(src->src))
1106 continue;
1107
1108 /* check if all loop-carried values are from the same ssa-def */
1109 if (!same)
1110 same = src->src.ssa;
1111 else if (same != src->src.ssa) {
1112 phi->def.divergent = true;
1113 return true;
1114 }
1115 }
1116
1117 return false;
1118 }
1119
1120 /* There are 3 types of phi instructions:
1121 * (3) eta: represent values that leave a loop.
1122 * The resulting value is divergent if the source value is divergent
1123 * or any loop exit condition is divergent for a value which is
1124 * not loop-invariant.
1125 * (note: there should be no phi for loop-invariant variables.) */
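/* A small illustration (NIR-like pseudocode, names are made up):
 *
 *    loop {
 *       %x = ...
 *       if (%divergent_cond)
 *          break;
 *    }
 *    %y = phi %x   ; LCSSA phi at the loop exit
 *
 * Invocations leave the loop in different iterations and thus observe
 * different values of %x, so %y is divergent whenever the break is
 * divergent, even if %x is uniform within each iteration.
 */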
1126 static bool
1127 visit_loop_exit_phi(nir_phi_instr *phi, bool divergent_break)
1128 {
1129 if (phi->def.divergent)
1130 return false;
1131
1132 if (divergent_break) {
1133 phi->def.divergent = true;
1134 return true;
1135 }
1136
1137 /* if any source value is divergent, the resulting value is divergent */
1138 nir_foreach_phi_src(src, phi) {
1139 if (src->src.ssa->divergent) {
1140 phi->def.divergent = true;
1141 return true;
1142 }
1143 }
1144
1145 return false;
1146 }
1147
1148 static bool
1149 visit_if(nir_if *if_stmt, struct divergence_state *state)
1150 {
1151 bool progress = false;
1152
1153 struct divergence_state then_state = *state;
1154 then_state.divergent_loop_cf |= if_stmt->condition.ssa->divergent;
1155 progress |= visit_cf_list(&if_stmt->then_list, &then_state);
1156
1157 struct divergence_state else_state = *state;
1158 else_state.divergent_loop_cf |= if_stmt->condition.ssa->divergent;
1159 progress |= visit_cf_list(&if_stmt->else_list, &else_state);
1160
1161 /* handle phis after the IF */
1162 nir_foreach_phi(phi, nir_cf_node_cf_tree_next(&if_stmt->cf_node)) {
1163 if (state->first_visit)
1164 phi->def.divergent = false;
1165 progress |= visit_if_merge_phi(phi, if_stmt->condition.ssa->divergent);
1166 }
1167
1168 /* join loop divergence information from both branch legs */
1169 state->divergent_loop_continue |= then_state.divergent_loop_continue ||
1170 else_state.divergent_loop_continue;
1171 state->divergent_loop_break |= then_state.divergent_loop_break ||
1172 else_state.divergent_loop_break;
1173
1174 /* A divergent continue makes succeeding loop CF divergent:
1175 * not all loop-active invocations participate in the remaining loop body,
1176 * which means that a following break might be taken only by some invocations. */
1177 state->divergent_loop_cf |= state->divergent_loop_continue;
1178
1179 return progress;
1180 }
1181
1182 static bool
1183 visit_loop(nir_loop *loop, struct divergence_state *state)
1184 {
1185 assert(!nir_loop_has_continue_construct(loop));
1186 bool progress = false;
1187 nir_block *loop_header = nir_loop_first_block(loop);
1188 nir_block *loop_preheader = nir_block_cf_tree_prev(loop_header);
1189
1190 /* handle loop header phis first: we have no knowledge yet about
1191 * the loop's control flow or any loop-carried sources. */
1192 nir_foreach_phi(phi, loop_header) {
1193 if (!state->first_visit && phi->def.divergent)
1194 continue;
1195
1196 nir_foreach_phi_src(src, phi) {
1197 if (src->pred == loop_preheader) {
1198 phi->def.divergent = src->src.ssa->divergent;
1199 break;
1200 }
1201 }
1202 progress |= phi->def.divergent;
1203 }
1204
1205 /* setup loop state */
1206 struct divergence_state loop_state = *state;
1207 loop_state.divergent_loop_cf = false;
1208 loop_state.divergent_loop_continue = false;
1209 loop_state.divergent_loop_break = false;
1210
1211 /* process loop body until no further changes are made */
1212 bool repeat;
1213 do {
1214 progress |= visit_cf_list(&loop->body, &loop_state);
1215 repeat = false;
1216
1217 /* revisit loop header phis to see if something has changed */
1218 nir_foreach_phi(phi, loop_header) {
1219 repeat |= visit_loop_header_phi(phi, loop_preheader,
1220 loop_state.divergent_loop_continue);
1221 }
1222
1223 loop_state.divergent_loop_cf = false;
1224 loop_state.first_visit = false;
1225 } while (repeat);
1226
1227 /* handle phis after the loop */
1228 nir_foreach_phi(phi, nir_cf_node_cf_tree_next(&loop->cf_node)) {
1229 if (state->first_visit)
1230 phi->def.divergent = false;
1231 progress |= visit_loop_exit_phi(phi, loop_state.divergent_loop_break);
1232 }
1233
1234 loop->divergent = (loop_state.divergent_loop_break || loop_state.divergent_loop_continue);
1235
1236 return progress;
1237 }
1238
1239 static bool
1240 visit_cf_list(struct exec_list *list, struct divergence_state *state)
1241 {
1242 bool has_changed = false;
1243
1244 foreach_list_typed(nir_cf_node, node, node, list) {
1245 switch (node->type) {
1246 case nir_cf_node_block:
1247 has_changed |= visit_block(nir_cf_node_as_block(node), state);
1248 break;
1249 case nir_cf_node_if:
1250 has_changed |= visit_if(nir_cf_node_as_if(node), state);
1251 break;
1252 case nir_cf_node_loop:
1253 has_changed |= visit_loop(nir_cf_node_as_loop(node), state);
1254 break;
1255 case nir_cf_node_function:
1256 unreachable("NIR divergence analysis: Unsupported cf_node type.");
1257 }
1258 }
1259
1260 return has_changed;
1261 }
1262
1263 void
1264 nir_divergence_analysis(nir_shader *shader)
1265 {
1266 shader->info.divergence_analysis_run = true;
1267
1268 struct divergence_state state = {
1269 .stage = shader->info.stage,
1270 .shader = shader,
1271 .divergent_loop_cf = false,
1272 .divergent_loop_continue = false,
1273 .divergent_loop_break = false,
1274 .first_visit = true,
1275 };
1276
1277 visit_cf_list(&nir_shader_get_entrypoint(shader)->body, &state);
1278 }
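/* Example of how a backend might consume the results (an illustrative
 * sketch; emit_scalar_load/emit_vector_load are hypothetical helpers):
 *
 *    nir_divergence_analysis(shader);
 *    ...
 *    if (intrin->src[0].ssa->divergent)
 *       emit_vector_load(intrin);
 *    else
 *       emit_scalar_load(intrin);
 */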
1279
1280 /* Compute divergence between vertices of the same primitive. This uses
1281 * the same divergent field in nir_def and nir_loop as the regular divergence
1282 * pass.
1283 */
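/* For example (NIR-like pseudocode; illustrative):
 *
 *    %i = load_instance_id  ; uniform here: all vertices of a primitive
 *                           ;   share the same instance
 *    %b = ballot %cond      ; divergent here: those vertices may be spread
 *                           ;   across different subgroups
 *
 * whereas the regular subgroup analysis marks %i divergent and %b uniform.
 */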
1284 void
1285 nir_vertex_divergence_analysis(nir_shader *shader)
1286 {
1287 shader->info.divergence_analysis_run = false;
1288
1289 struct divergence_state state = {
1290 .stage = shader->info.stage,
1291 .shader = shader,
1292 .vertex_divergence = true,
1293 .first_visit = true,
1294 };
1295
1296 visit_cf_list(&nir_shader_get_entrypoint(shader)->body, &state);
1297 }
1298
1299 bool
1300 nir_update_instr_divergence(nir_shader *shader, nir_instr *instr)
1301 {
1302 nir_foreach_def(instr, set_ssa_def_not_divergent, NULL);
1303
1304 if (instr->type == nir_instr_type_phi) {
1305 nir_cf_node *prev = nir_cf_node_prev(&instr->block->cf_node);
1306 /* can only update gamma/if phis */
1307 if (!prev || prev->type != nir_cf_node_if)
1308 return false;
1309
1310 nir_if *nif = nir_cf_node_as_if(prev);
1311
1312 visit_if_merge_phi(nir_instr_as_phi(instr), nir_src_is_divergent(nif->condition));
1313 return true;
1314 }
1315
1316 struct divergence_state state = {
1317 .stage = shader->info.stage,
1318 .shader = shader,
1319 .first_visit = true,
1320 };
1321 update_instr_divergence(instr, &state);
1322 return true;
1323 }
1324
1325 bool
1326 nir_has_divergent_loop(nir_shader *shader)
1327 {
1328 bool divergent_loop = false;
1329 nir_function_impl *func = nir_shader_get_entrypoint(shader);
1330
1331 foreach_list_typed(nir_cf_node, node, node, &func->body) {
1332 if (node->type == nir_cf_node_loop && nir_cf_node_as_loop(node)->divergent) {
1333 divergent_loop = true;
1334 break;
1335 }
1336 }
1337
1338 return divergent_loop;
1339 }
1340