1 /*
2 * Copyright © 2023 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 /* Introduction
8 * ============
9 *
10 * This pass optimizes varyings between 2 shaders, which means dead input/
11 * output removal, constant and uniform load propagation, deduplication,
12 * compaction, and inter-shader code motion. This is used during the shader
13 * linking process.
14 *
15 *
16 * Notes on behavior
17 * =================
18 *
19 * The pass operates on scalar varyings using 32-bit and 16-bit types. Vector
20 * varyings are not allowed.
21 *
22 * Indirectly-indexed varying slots (not vertices) are not optimized or
23 * compacted, but unused slots of indirectly-indexed varyings are still filled
24 * with directly-indexed varyings during compaction. Indirectly-indexed
25 * varyings are still removed if they are unused by the other shader.
26 *
27 * Indirectly-indexed vertices don't disallow optimizations, but compromises
28 * are made depending on how they are accessed. They are common in TCS, TES,
29 * and GS, so there is a desire to optimize them as much as possible. More on
30 * that in various sections below.
31 *
32 * Transform feedback doesn't prevent most optimizations such as constant
33 * propagation and compaction. Shaders can be left with output stores that set
34 * the no_varying flag, meaning the output is not consumed by the next shader,
35 * which means that optimizations did their job and now the output is only
36 * consumed by transform feedback.
37 *
38 * All legacy varying slots are optimized when it's allowed.
39 *
40 *
41 * Convergence property of shader outputs
42 * ======================================
43 *
44 * When an output stores an SSA that is convergent and all stores of that
45 * output appear in unconditional blocks or conditional blocks with
46 * a convergent entry condition and the shader is not GS, it implies that all
47 * vertices of that output have the same value, therefore the output can be
48 * promoted to flat because all interpolation modes lead to the same result
49 * as flat. Such outputs are opportunistically compacted with both flat and
50 * non-flat varyings based on whichever has unused slots in their vec4s. This
51 * pass refers to such inputs, outputs, and varyings as "convergent" (meaning
52 * all vertices are always equal).
53 *
54 * By default, flat varyings are the only ones that are not considered convergent
55 * because we want the flexibility to pack convergent varyings with both flat
56 * and non-flat varyings, and since flat varyings can contain integers and
57 * doubles, we can never interpolate them as FP32 or FP16. Optimizations start
58 * with separate interpolated, flat, and convergent groups of varyings, and
59 * they choose whether they want to promote convergent to interpolated or
60 * flat, or whether to leave that decision to the end when the compaction
61 * happens.
62 *
63 * The above default behavior doesn't apply when the hw supports convergent
64 * flat loads with interpolated vec4 slots. (there is a NIR option)
65 *
66 * TES patch inputs are always convergent because they are uniform within
67 * a primitive.
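 *
 * For illustration, consider a hypothetical vertex shader (not taken from
 * this pass):
 * ```
 * out0 = uniform0 * 2.0;   // same value for every vertex -> convergent
 * out1 = in_pos.x;         // varies per vertex -> divergent
 * ```
 * out0 can be promoted to flat and opportunistically packed with either
 * interpolated or flat varyings, while out1 keeps its interpolation
 * qualifier.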
68 *
69 *
70 * Optimization steps
71 * ==================
72 *
73 * 1. Determine which varying slots can be optimized and how.
74 *
75 * * When a varying is said to be "optimized" in the following text, it
76 * means all optimizations are performed, such as removal, constant
77 * propagation, and deduplication.
78 * * All VARn, PATCHn, and FOGC varyings are always optimized and
79 * compacted.
80 * * PRIMITIVE_ID is treated as VARn in (GS, FS).
81 * * TEXn are removed if they are dead (except TEXn inputs, which can't be
 *      removed due to being affected by the coord replace state). TEXn also
 *      can't be optimized or compacted because of the coord replace state.
 *      TEXn not consumed by FS are treated as VARn.
85 * * COLn and BFCn only propagate constants if they are between 0 and 1
86 * because of the clamp vertex color state, and they are only
87 * deduplicated and compacted among themselves because they are affected
88 * by the flat shade, provoking vertex, two-side color selection, and
89 * clamp vertex color states. COLn and BFCn not consumed by FS are
90 * treated as VARn.
 *    * All system value outputs like POS, PSIZ, CLIP_DISTn, etc. can't be
92 * removed, but they are demoted to sysval-only outputs by setting
93 * the "no_varying" flag (i.e. they can be removed as varyings), so
94 * drivers should look at the "no_varying" flag. If an output is not
95 * a sysval output in a specific stage, it's treated as VARn. (such as
96 * POS in TCS)
 *    * TESS_LEVEL_* inputs in TES can't be touched if TCS is missing.
98 *
99 * 2. Remove unused inputs and outputs
100 *
101 * * Outputs not used in the next shader are removed.
102 * * Inputs not initialized by the previous shader are replaced with undef
103 * except:
104 * * LAYER and VIEWPORT are replaced with 0 in FS.
105 * * TEXn.xy is untouched because the coord replace state can set it, and
106 * TEXn.zw is replaced by (0, 1), which is equal to the coord replace
107 * value.
108 * * Output loads that have no output stores anywhere in the shader are
109 * replaced with undef. (for TCS, though it works with any shader)
110 * * Output stores with transform feedback are preserved, but get
 *      the "no_varying" flag, meaning they are not consumed by the next
112 * shader stage. Later, transform-feedback-only varyings are compacted
113 * (relocated) such that they are always last.
114 * * TCS outputs that are read by TCS, but not used by TES get
115 * the "no_varying" flag to indicate that they are only read by TCS and
116 * not consumed by TES. Later, such TCS outputs are compacted (relocated)
117 * such that they are always last to keep all outputs consumed by TES
118 * consecutive without holes.
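 *
 *    A hypothetical example (illustrative only):
 *    ```
 *    VS: out0 = a;          // read by FS -> kept
 *        out1 = b;          // not read by FS -> removed
 *    FS: x = load(out0);
 *        y = load(out2);    // out2 is never written -> replaced with undef
 *    ```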
119 *
120 * 3. Constant, uniform, UBO load, and uniform expression propagation
121 *
 *    * Define "uniform expressions" as ALU expressions only sourcing
123 * constants, uniforms, and UBO loads.
124 * * Constants, uniforms, UBO loads, and uniform expressions stored
125 * in outputs are moved into the next shader, and the outputs are removed.
126 * * The same propagation is done from output stores to output loads.
127 * (for TCS, though it works with any shader)
128 * * If there are multiple stores to the same output, all such stores
129 * should store the same constant, uniform, UBO load, or uniform
130 * expression for the expression to be propagated. If an output has
131 * multiple vertices, all vertices should store the same expression.
132 * * nir->options has callbacks that are used to estimate the cost of
133 * uniform expressions that drivers can set to control the complexity of
134 * uniform expressions that are propagated. This is to ensure that
135 * we don't increase the GPU overhead measurably by moving code across
136 * pipeline stages that amplify GPU work.
137 * * Special cases:
138 * * Constant COLn and BFCn are propagated only if the constants are
139 * in the [0, 1] range because of the clamp vertex color state.
140 * If both COLn and BFCn are written, they must write the same
141 * constant. If BFCn is written but not COLn, the constant is
142 * propagated from BFCn to COLn.
143 * * TEX.xy is untouched because of the coord replace state.
144 * If TEX.zw is (0, 1), only those constants are propagated because
145 * they match the coord replace values.
146 * * CLIP_DISTn, LAYER and VIEWPORT are always propagated.
147 * Eliminated output stores get the "no_varying" flag if they are also
148 * xfb stores or write sysval outputs.
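 *
 *    A hypothetical example of propagation (illustrative only):
 *    ```
 *    VS: out0 = uniform0 * 2.0 + 1.0;   // uniform expression
 *    FS: x = load(out0);
 *    ```
 *    becomes:
 *    ```
 *    VS: (out0 removed)
 *    FS: x = uniform0 * 2.0 + 1.0;      // recomputed in the consumer
 *    ```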
149 *
150 * 4. Remove duplicated output components
151 *
152 * * By comparing SSA defs.
153 * * If there are multiple stores to the same output, all such stores
154 * should store the same SSA as all stores of another output for
155 * the output to be considered duplicated. If an output has multiple
156 * vertices, all vertices should store the same SSA.
157 * * Deduplication can only be done between outputs of the same category.
158 * Those are: interpolated, patch, flat, interpolated color, flat color,
159 * and conditionally interpolated color based on the flat
160 * shade state
161 * * Everything is deduplicated except TEXn due to the coord replace state.
162 * * Eliminated output stores get the "no_varying" flag if they are also
163 * xfb stores or write sysval outputs.
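 *
 *    A hypothetical example (illustrative only):
 *    ```
 *    VS: out0 = value;      // the same SSA def stored into 2 outputs
 *        out1 = value;
 *    FS: a = load(out0);
 *        b = load(out1);
 *    ```
 *    out1 is removed and the load of out1 is rewritten to load out0 instead.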
164 *
165 * 5. Backward inter-shader code motion
166 *
167 * "Backward" refers to moving code in the opposite direction that shaders
168 * are executed, i.e. moving code from the consumer to the producer.
169 *
170 * Fragment shader example:
171 * ```
172 * result = input0 * uniform + input1 * constant + UBO.variable;
173 * ```
174 *
175 * The computation of "result" in the above example can be moved into
176 * the previous shader and both inputs can be replaced with a new input
177 * holding the value of "result", thus making the shader smaller and
178 * possibly reducing the number of inputs, uniforms, and UBOs by 1.
179 *
180 * Such code motion can be performed for any expression sourcing only
181 * inputs, constants, and uniforms except for fragment shaders, which can
182 * also do it but with the following limitations:
 *      * Only these transformations can be performed with interpolated inputs
 *        and any composition of these transformations, such as lerp (see
 *        the example after this list), which can all be proven mathematically:
186 * * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
187 * * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
188 * * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
189 * * all of these transformations are considered "inexact" in NIR
190 * * interp interpolates an input according to the barycentric
191 * coordinates (i, j), which are different for perspective,
192 * noperspective, center, centroid, sample, at_offset, and at_sample
193 * modes.
194 * * convergent_expr is any expression sourcing only constants,
195 * uniforms, and convergent inputs. The only requirement on
196 * convergent_expr is that it doesn't vary between vertices of
197 * the same primitive, but it can vary between primitives.
198 * * If inputs are flat or convergent, there are no limitations on
199 * expressions that can be moved.
200 * * Interpolated and flat inputs can't mix in the same expression, but
201 * convergent inputs can mix with both.
202 * * The interpolation qualifier of the new input is inherited from
203 * the removed non-convergent inputs that should all have the same (i, j).
204 * If there are no non-convergent inputs, then the new input is declared
205 * as flat (for simplicity; we can't choose the barycentric coordinates
 *        at random because AMD doesn't like it when there are multiple sets of
207 * barycentric coordinates in the same shader unnecessarily).
208 * * Inf values break code motion across interpolation. See the section
209 * discussing how we handle it near the end.
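 *
 *    For example, a lerp with a convergent factor "s" can be moved across
 *    interpolation by composing the rules above (illustrative only):
 *    ```
 *    interp(x, i, j) * s + interp(y, i, j) * (1.0 - s)
 *       = interp(x * s + y * (1.0 - s), i, j)
 *    ```
 *    so the whole expression can be replaced by a single interpolated input
 *    whose value is computed in the producer.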
210 *
211 * The above rules also apply to open-coded TES input interpolation, which
212 * is handled the same as FS input interpolation. The only differences are:
213 * * Open-coded TES input interpolation must match one of the allowed
214 * equations. Different interpolation equations are treated the same as
215 * different interpolation qualifiers in FS.
216 * * Patch varyings are always treated as convergent.
217 *
218 * Prerequisites:
219 * * We need a post-dominator tree that is constructed from a graph where
220 * vertices are instructions and directed edges going into them are
221 * the values of their source operands. This is different from how NIR
222 * dominance works, which represents all instructions within a basic
223 * block as a linear chain of vertices in the graph.
224 * In our graph, all loads without source operands and all constants are
225 * entry nodes in the graph, and all stores and discards are exit nodes
226 * in the graph. Each shader can have multiple disjoint graphs where
227 * the Lowest Common Ancestor of 2 instructions doesn't exist.
228 * * Given the above definition, the instruction whose result is the best
229 * candidate for a new input is the farthest instruction that
 *      post-dominates one or more inputs and is movable between shaders.
231 *
232 * Algorithm Idea Part 1: Search
233 * * Pick any input load that is hypothetically movable and call it
234 * the iterator.
235 * * Get the immediate post-dominator of the iterator, and if it's movable,
236 * replace the iterator with it.
237 * * Repeat the previous step until the obtained immediate post-dominator
238 * is not movable.
239 * * The iterator now contains the farthest post-dominator that is movable.
240 * * Gather all input loads that the post-dominator consumes.
241 * * For each of those input loads, all matching output stores must be
242 * in the same block (because they will be replaced by a single store).
243 *
244 * Algorithm Idea Part 2: Code Motion
245 * * Clone the post-dominator in the producer except input loads, which
246 * should be replaced by stored output values. Uniform and UBO loads,
247 * if any, should be cloned too.
248 * * Remove the original output stores.
249 * * Replace the post-dominator from the consumer with a new input load.
250 * * The step above makes the post-dominated input load that we picked
251 * at the beginning dead, but other input loads used by the post-
252 * dominator might still have other uses (shown in the example below).
253 *
254 * Example SSA-use graph - initial shader and the result:
255 * ```
256 * input0 input1 input0 input1
257 * \ / \ | \
258 * constant alu ... ======> | ...
259 * \ /
260 * alu
261 * (post-dominator)
262 * ```
263 *
264 * Description:
265 * On the right, the algorithm moved the constant and both ALU opcodes
266 * into the previous shader and input0 now contains the value of
267 * the post-dominator. input1 stays the same because it still has one
268 * use left. If input1 hadn't had the other use, it would have been
269 * removed.
270 *
271 * If the algorithm moves any code, the algorithm is repeated until there
272 * is no code that it can move.
273 *
274 * Which shader pairs are supported:
275 * * (VS, FS), (TES, FS): yes, fully
276 * * Limitation: If Infs must be preserved, no code is moved across
277 * interpolation, so only flat varyings are optimized.
278 * * (VS, TCS), (VS, GS), (TES, GS): no, but possible -- TODO
279 * * Current behavior:
280 * * Per-vertex inputs are rejected.
281 * * Possible solution:
282 * * All input loads used by an accepted post-dominator must use
283 * the same vertex index. The post-dominator must use all loads with
284 * that vertex index.
285 * * If a post-dominator is found for an input load from a specific
286 * slot, all other input loads from that slot must also have
287 * an accepted post-dominator, and all such post-dominators should
288 * be identical expressions.
289 * * (TCS, TES), (VS, TES): yes, with limitations
290 * * Limitations:
291 * * Only 1 store and 1 load per slot allowed.
292 * * No output loads allowed.
293 * * All stores used by an accepted post-dominator must be in
294 * the same block.
295 * * TCS barriers don't matter because there are no output loads.
296 * * Patch varyings are handled trivially with the above constraints.
297 * * Per-vertex outputs should only be indexed by gl_InvocationID.
298 * * An interpolated TES load is any ALU instruction that computes
299 * the result of linear interpolation of per-vertex inputs from
300 * the same slot using gl_TessCoord. If such an ALU instruction is
301 * found, it must be the only one, and all per-vertex input loads
302 * from that slot must feed into it. The interpolation equation must
303 * be equal to one of the allowed equations. Then the same rules as
 *        be equal to one of the allowed equations (see the example after
 *        this list). Then the same rules as
305 * interpolation equations just like different interpolation
306 * qualifiers.
307 * * Patch inputs are treated as convergent, which means they are
308 * allowed to be in the same movable expression as interpolated TES
309 * inputs, and the same rules as for convergent FS inputs apply.
310 * * (GS, FS), (MS, FS): no
311 * * Workaround: Add a passthrough VS between GS/MS and FS, run
312 * the pass on the (VS, FS) pair to move code out of FS,
313 * and inline that VS at the end of your hw-specific
314 * GS/MS if it's possible.
315 * * (TS, MS): no
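 *
 *    As an example of open-coded TES interpolation that the (TCS, TES) and
 *    (VS, TES) paths can match (one of the allowed triangle equations,
 *    illustrative only):
 *    ```
 *    x = in[0] * gl_TessCoord.x + in[1] * gl_TessCoord.y +
 *        in[2] * gl_TessCoord.z;
 *    ```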
316 *
317 * The disadvantage of using the post-dominator tree is that it's a tree,
318 * which means there is only 1 post-dominator of each input. This example
319 * shows a case that could be optimized by replacing 3 inputs with 2 inputs,
320 * reducing the number of inputs by 1, but the immediate post-dominator of
321 * all input loads is NULL:
322 * ```
323 * temp0 = input0 + input1 + input2;
324 * temp1 = input0 + input1 * const1 + input2 * const2;
325 * ```
326 *
327 * If there is a graph algorithm that returns the best solution to
328 * the above case (which is temp0 and temp1 to replace all 3 inputs), let
329 * us know.
330 *
331 * 6. Forward inter-shader code motion
332 *
333 * TODO: Not implemented. The text below is a draft of the description.
334 *
335 * "Forward" refers to moving code in the direction that shaders are
336 * executed, i.e. moving code from the producer to the consumer.
337 *
338 * Vertex shader example:
339 * ```
340 * output0 = value + 1;
341 * output1 = value * 2;
342 * ```
343 *
344 * Both outputs can be replaced by 1 output storing "value", and both ALU
345 * operations can be moved into the next shader.
346 *
347 * The same dominance algorithm as in the previous optimization is used,
348 * except that:
349 * * Instead of inputs, we use outputs.
350 * * Instead of a post-dominator tree, we use a dominator tree of the exact
351 * same graph.
352 *
353 * The algorithm idea is: For each pair of 2 output stores, find their
354 * Lowest Common Ancestor in the dominator tree, and that's a candidate
355 * for a new output. All movable loads like load_const should be removed
356 * from the graph, otherwise the LCA wouldn't exist.
357 *
358 * The limitations on instructions that can be moved between shaders across
359 * interpolated loads are exactly the same as the previous optimization.
360 *
361 * nir->options has callbacks that are used to estimate the cost of
362 * expressions that drivers can set to control the complexity of
363 * expressions that can be moved to later shaders. This is to ensure that
364 * we don't increase the GPU overhead measurably by moving code across
365 * pipeline stages that amplify GPU work.
366 *
367 * 7. Compaction to vec4 slots (AKA packing)
368 *
369 * First, varyings are divided into these groups, and each group is
370 * compacted separately with some exceptions listed below:
371 *
372 * Non-FS groups (patch and non-patch are packed separately):
373 * * 32-bit flat
374 * * 16-bit flat
375 * * 32-bit no-varying (TCS outputs read by TCS but not TES)
376 * * 16-bit no-varying (TCS outputs read by TCS but not TES)
377 *
378 * FS groups:
379 * * 32-bit interpolated (always FP32)
380 * * 32-bit flat
381 * * 32-bit convergent (always FP32)
382 * * 16-bit interpolated (always FP16)
383 * * 16-bit flat
384 * * 16-bit convergent (always FP16)
385 * * 32-bit transform feedback only
386 * * 16-bit transform feedback only
387 *
388 * Then, all scalar varyings are relocated into new slots, starting from
389 * VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit
390 * increments. Rules:
391 * * Both 32-bit and 16-bit flat varyings are packed in the same vec4.
392 * * Convergent varyings can be packed with interpolated varyings of
393 * the same type or flat. The group to pack with is chosen based on
394 * whichever has unused scalar slots because we want to reduce the total
395 * number of vec4s. After filling all unused scalar slots, the remaining
396 * convergent varyings are packed as flat.
397 * * Transform-feedback-only slots and no-varying slots are packed last,
398 * so that they are consecutive and not intermixed with varyings consumed
399 * by the next shader stage, and 32-bit and 16-bit slots are packed in
400 * the same vec4. This allows reducing memory for outputs by ignoring
401 * the trailing outputs that the next shader stage doesn't read.
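 *
 *    A hypothetical packing result (illustrative only): with 3 interpolated
 *    FP32 varyings and 1 convergent FP32 varying, the convergent varying
 *    fills the unused channel of the interpolated vec4 instead of starting
 *    a new flat vec4:
 *    ```
 *    VAR0 = (interp0, interp1, interp2, convergent0)
 *    ```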
402 *
403 * In the end, we should end up with these groups for FS:
404 * * 32-bit interpolated (always FP32) on separate vec4s
405 * * 16-bit interpolated (always FP16) on separate vec4s
406 * * 32-bit flat and 16-bit flat, mixed in the same vec4
407 * * 32-bit and 16-bit transform feedback only, sharing vec4s with flat
408 *
409 * Colors are compacted the same but separately because they can't be mixed
410 * with VARn. Colors are divided into 3 FS groups. They are:
411 * * 32-bit maybe-interpolated (affected by the flat-shade state)
412 * * 32-bit interpolated (not affected by the flat-shade state)
413 * * 32-bit flat (not affected by the flat-shade state)
414 *
415 * To facilitate driver-specific output merging, color channels are
 *    assigned in a rotated order depending on which VARn channel is the first
 *    unused one. For example, if the first unused VARn channel is VAR0.z,
418 * color channels are allocated in this order:
419 * COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
420 * The reason is that some drivers merge outputs if each output sets
421 * different components, for example 2 outputs defining VAR0.xy and COL0.z.
422 * If drivers do interpolation in the fragment shader and color
 *    interpolation can differ for each component, VAR0.xy and COL0.z can be
424 * stored in the same output storage slot, and the consumer can load VAR0
425 * and COL0 from the same slot.
426 *
427 * If COLn, BFCn, and TEXn are transform-feedback-only, they are moved to
428 * VARn. PRIMITIVE_ID in (GS, FS) and FOGC in (xx, FS) are always moved to
429 * VARn for better packing.
430 *
431 *
432 * Issue: Interpolation converts Infs to NaNs
433 * ==========================================
434 *
435 * Interpolation converts Infs to NaNs, i.e. interp(Inf, i, j) = NaN, which
436 * impacts and limits backward inter-shader code motion, uniform expression
437 * propagation, and compaction.
438 *
439 * When we decide not to interpolate a varying, we need to convert Infs to
440 * NaNs manually. Infs can be converted to NaNs like this: x*0 + x
441 * (suggested by Ian Romanick, the multiplication must be "exact")
442 *
443 * Changes to optimizations:
444 * - When we propagate a uniform expression and NaNs must be preserved,
445 * convert Infs in the result to NaNs using "x*0 + x" in the consumer.
446 * - When we change interpolation to flat for convergent varyings and NaNs
447 * must be preserved, apply "x*0 + x" to the stored output value
448 * in the producer.
449 * - There is no solution for backward inter-shader code motion with
450 * interpolation if Infs must be preserved. As an alternative, we can allow
451 * code motion across interpolation only for specific shader hashes in
452 * can_move_alu_across_interp. We can use shader-db to automatically produce
453 * a list of shader hashes that benefit from this optimization.
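 *
 * The conversion itself is a single exact FMA (see build_convert_inf_to_nan
 * below):
 * ```
 * x = fma(x, 0.0, x);   // exact; Inf * 0 = NaN, finite values are unchanged
 * ```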
454 *
455 *
456 * Usage
457 * =====
458 *
459 * Requirements:
460 * - ALUs should be scalarized
461 * - Dot products and other vector opcodes should be lowered (recommended)
462 * - Input loads and output stores should be scalarized
463 * - 64-bit varyings should be lowered to 32 bits
464 * - nir_vertex_divergence_analysis must be called on the producer if
 *   the consumer is a fragment shader
466 *
 * It's recommended to first run this for all shader pairs from the first
 * shader to the last shader (to propagate constants etc.). If the
 * optimization of a (S1, S2) pair leads to changes in S1, remember the
 * highest such S1. Then re-run this for all shader pairs in descending order
 * from S1 to VS.
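 *
 * A rough sketch of that order (run_opt_varyings() is a hypothetical wrapper
 * standing in for this pass and is assumed to return whether the producer
 * changed; stages[] holds the pipeline's shaders in execution order; the real
 * entry point takes more parameters):
 * ```
 * int highest = -1;
 *
 * for (unsigned i = 0; i + 1 < num_stages; i++) {
 *    if (run_opt_varyings(stages[i], stages[i + 1]))
 *       highest = i;                   // highest changed producer (S1)
 * }
 * for (int i = highest; i > 0; i--)
 *    run_opt_varyings(stages[i - 1], stages[i]);
 * ```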
471 *
472 * NIR optimizations should be performed after every run that changes the IR.
473 *
474 *
475 * Analyzing the optimization potential of linking separate shaders
476 * ================================================================
477 *
478 * We can use this pass in an analysis pass that decides whether a separate
479 * shader has the potential to benefit from full draw-time linking. The way
480 * it would work is that we would create a passthrough shader adjacent to
481 * the separate shader, run this pass on both shaders, and check if the number
482 * of varyings decreased. This way we can decide to perform the draw-time
483 * linking only if we are confident that it would help performance.
484 *
485 * TODO: not implemented, mention the pass that implements it
486 */
487
488 #include "nir.h"
489 #include "nir_builder.h"
490 #include "util/u_math.h"
491 #include "util/u_memory.h"
492
493 /* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
494 *
495 * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
496 * Slots (i % 8 == 1,3,5,7) are high bits of 16-bit channels. 32-bit channels
497 * don't set these slots as used in bitmasks.
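 *
 * For example (see get_scalar_16bit_slot below), the high 16-bit half of
 * the .z channel of VAR0 maps to scalar slot
 * VARYING_SLOT_VAR0 * 8 + 2 * 2 + 1.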
498 */
499 #define NUM_SCALAR_SLOTS (NUM_TOTAL_VARYING_SLOTS * 8)
500
501 /* Fragment shader input slots can be packed with indirectly-indexed vec4
502 * slots if there are unused components, but only if the vec4 slot has
503 * the same interpolation type. There are only 3 types: FLAT, FP32, FP16.
504 */
505 enum fs_vec4_type {
506 FS_VEC4_TYPE_NONE = 0,
507 FS_VEC4_TYPE_FLAT,
508 FS_VEC4_TYPE_INTERP_FP32,
509 FS_VEC4_TYPE_INTERP_FP16,
510 FS_VEC4_TYPE_INTERP_COLOR,
511 FS_VEC4_TYPE_INTERP_EXPLICIT,
512 FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
513 FS_VEC4_TYPE_PER_PRIMITIVE,
514 };
515
516 #if PRINT_RELOCATE_SLOT
517 static const char *fs_vec4_type_strings[] = {
518 "NONE",
519 "FLAT",
520 "INTERP_FP32",
521 "INTERP_FP16",
522 "INTERP_COLOR",
523 "INTERP_EXPLICIT",
524 "INTERP_EXPLICIT_STRICT",
525 "PER_PRIMITIVE",
526 };
527 #endif // PRINT_RELOCATE_SLOT
528
529 static unsigned
get_scalar_16bit_slot(nir_io_semantics sem, unsigned component)
531 {
532 return sem.location * 8 + component * 2 + sem.high_16bits;
533 }
534
535 static unsigned
intr_get_scalar_16bit_slot(nir_intrinsic_instr *intr)
537 {
538 return get_scalar_16bit_slot(nir_intrinsic_io_semantics(intr),
539 nir_intrinsic_component(intr));
540 }
541
542 static unsigned
vec4_slot(unsigned scalar_slot)
544 {
545 return scalar_slot / 8;
546 }
547
548 struct list_node {
549 struct list_head head;
550 nir_intrinsic_instr *instr;
551 };
552
553 /* Information about 1 scalar varying slot for both shader stages. */
554 struct scalar_slot {
555 struct {
556 /* Linked list of all store instructions writing into the scalar slot
557 * in the producer.
558 */
559 struct list_head stores;
560
      /* Only for TCS: Linked list of all load instructions that read the
       * scalar slot in the producer.
563 */
564 struct list_head loads;
565
566 /* If there is only one store instruction or if all store instructions
567 * store the same value in the producer, this is the instruction
568 * computing the stored value. Used by constant and uniform propagation
569 * to the next shader.
570 */
571 nir_instr *value;
572 } producer;
573
574 struct {
575 /* Linked list of all load instructions loading from the scalar slot
576 * in the consumer.
577 */
578 struct list_head loads;
579
580 /* The result of TES input interpolation. */
581 nir_alu_instr *tes_interp_load;
582 unsigned tes_interp_mode; /* FLAG_INTERP_TES_* */
583 nir_def *tes_load_tess_coord;
584 } consumer;
585
586 /* The number of accessed slots if this slot has indirect indexing. */
587 unsigned num_slots;
588 };
589
590 struct linkage_info {
591 struct scalar_slot slot[NUM_SCALAR_SLOTS];
592
593 bool spirv;
594 bool can_move_uniforms;
595 bool can_move_ubos;
596 bool can_mix_convergent_flat_with_interpolated;
597
598 gl_shader_stage producer_stage;
599 gl_shader_stage consumer_stage;
600 nir_builder producer_builder;
601 nir_builder consumer_builder;
602 unsigned max_varying_expression_cost;
603
604 /* Memory context for linear_alloc_child (fast allocation). */
605 void *linear_mem_ctx;
606
607 /* If any component of a vec4 slot is accessed indirectly, this is its
608 * FS vec4 qualifier type, which is either FLAT, FP32, or FP16.
609 * Components with different qualifier types can't be compacted
610 * in the same vec4.
611 */
612 uint8_t fs_vec4_type[NUM_TOTAL_VARYING_SLOTS];
613
614 /* Mask of all varyings that can be removed. Only a few non-VARn non-PATCHn
615 * varyings can't be removed.
616 */
617 BITSET_DECLARE(removable_mask, NUM_SCALAR_SLOTS);
618
619 /* Mask of all slots that have transform feedback info. */
620 BITSET_DECLARE(xfb_mask, NUM_SCALAR_SLOTS);
621
622 /* Mask of all slots that have transform feedback info, but are not used
623 * by the next shader. Separate masks for 32-bit and 16-bit outputs.
624 */
625 BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS);
626 BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS);
627
628 /* Mask of all TCS->TES slots that are read by TCS, but not TES. */
629 BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS);
630 BITSET_DECLARE(no_varying16_mask, NUM_SCALAR_SLOTS);
631
632 /* Mask of all slots accessed with indirect indexing. */
633 BITSET_DECLARE(indirect_mask, NUM_SCALAR_SLOTS);
634
635 /* The following masks only contain slots that can be compacted and
636 * describe the groups in which they should be compacted. Non-fragment
637 * shaders only use the flat bitmasks.
638 *
639 * Some legacy varyings are excluded when they can't be compacted due to
640 * being affected by pipeline states (like coord replace). That only
641 * applies to xx->FS shader pairs. Other shader pairs get all legacy
642 * varyings compacted and relocated to VARn.
643 *
644 * Indirectly-indexed varyings are also excluded because they are not
645 * compacted.
646 */
647 BITSET_DECLARE(interp_fp32_mask, NUM_SCALAR_SLOTS);
648 BITSET_DECLARE(interp_fp16_mask, NUM_SCALAR_SLOTS);
649 BITSET_DECLARE(flat32_mask, NUM_SCALAR_SLOTS);
650 BITSET_DECLARE(flat16_mask, NUM_SCALAR_SLOTS);
651 BITSET_DECLARE(interp_explicit32_mask, NUM_SCALAR_SLOTS);
652 BITSET_DECLARE(interp_explicit16_mask, NUM_SCALAR_SLOTS);
653 BITSET_DECLARE(interp_explicit_strict32_mask, NUM_SCALAR_SLOTS);
654 BITSET_DECLARE(interp_explicit_strict16_mask, NUM_SCALAR_SLOTS);
655 BITSET_DECLARE(per_primitive32_mask, NUM_SCALAR_SLOTS);
656 BITSET_DECLARE(per_primitive16_mask, NUM_SCALAR_SLOTS);
657
658 /* Color interpolation unqualified (follows the flat-shade state). */
659 BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS);
660
661 /* Mask of output components that have only one store instruction, or if
662 * they have multiple store instructions, all those instructions store
663 * the same value. If the output has multiple vertices, all vertices store
664 * the same value. This is a useful property for:
665 * - constant and uniform propagation to the next shader
666 * - deduplicating outputs
667 */
668 BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
669
670 /* Mask of output components that store values that are convergent,
671 * i.e. all values stored into the outputs are equal within a primitive.
672 *
673 * This is different from output_equal_mask, which says that all stores
674 * to the same slot in the same thread are equal, while this says that
675 * each store to the same slot can be different, but it always stores
676 * a convergent value, which means the stored value is equal among all
677 * threads within a primitive.
678 *
679 * The advantage is that these varyings can always be promoted to flat
680 * regardless of the original interpolation mode, and they can always be
681 * compacted with both interpolated and flat varyings.
682 */
683 BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
684 BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
685 };
686
687 /******************************************************************
688 * HELPERS
689 ******************************************************************/
690
/* Return whether the bit for either the low or the high 16-bit half of
 * the 32-bit slot is set.
 */
692 #define BITSET_TEST32(m, b) \
693 (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1))
694
695 static void
print_linkage(struct linkage_info *linkage)
697 {
698 printf("Linkage: %s -> %s\n",
699 _mesa_shader_stage_to_abbrev(linkage->producer_stage),
700 _mesa_shader_stage_to_abbrev(linkage->consumer_stage));
701
702 for (unsigned i = 0; i < NUM_SCALAR_SLOTS; i++) {
703 struct scalar_slot *slot = &linkage->slot[i];
704
705 if (!slot->num_slots &&
706 list_is_empty(&slot->producer.stores) &&
707 list_is_empty(&slot->producer.loads) &&
708 list_is_empty(&slot->consumer.loads) &&
709 !BITSET_TEST(linkage->removable_mask, i) &&
710 !BITSET_TEST(linkage->indirect_mask, i) &&
711 !BITSET_TEST(linkage->xfb32_only_mask, i) &&
712 !BITSET_TEST(linkage->xfb16_only_mask, i) &&
713 !BITSET_TEST(linkage->no_varying32_mask, i) &&
714 !BITSET_TEST(linkage->no_varying16_mask, i) &&
715 !BITSET_TEST(linkage->interp_fp32_mask, i) &&
716 !BITSET_TEST(linkage->interp_fp16_mask, i) &&
717 !BITSET_TEST(linkage->flat32_mask, i) &&
718 !BITSET_TEST(linkage->flat16_mask, i) &&
719 !BITSET_TEST(linkage->interp_explicit32_mask, i) &&
720 !BITSET_TEST(linkage->interp_explicit16_mask, i) &&
721 !BITSET_TEST(linkage->interp_explicit_strict32_mask, i) &&
722 !BITSET_TEST(linkage->interp_explicit_strict16_mask, i) &&
723 !BITSET_TEST(linkage->per_primitive32_mask, i) &&
724 !BITSET_TEST(linkage->per_primitive16_mask, i) &&
725 !BITSET_TEST(linkage->convergent32_mask, i) &&
726 !BITSET_TEST(linkage->convergent16_mask, i) &&
727 !BITSET_TEST(linkage->output_equal_mask, i))
728 continue;
729
730 printf(" %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
731 gl_varying_slot_name_for_stage(vec4_slot(i),
732 linkage->producer_stage) + 13,
733 "xyzw"[(i / 2) % 4],
734 i % 2 ? "hi" : "lo",
735 slot->num_slots,
736 BITSET_TEST(linkage->removable_mask, i) ? " removable" : "",
737 BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "",
738 BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "",
739 BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "",
740 BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "",
741 BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "",
742 BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "",
743 BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "",
744 BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "",
745 BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "",
746 BITSET_TEST(linkage->interp_explicit32_mask, i) ? " interp_explicit32" : "",
747 BITSET_TEST(linkage->interp_explicit16_mask, i) ? " interp_explicit16" : "",
748 BITSET_TEST(linkage->interp_explicit_strict32_mask, i) ? " interp_explicit_strict32" : "",
749 BITSET_TEST(linkage->interp_explicit_strict16_mask, i) ? " interp_explicit_strict16" : "",
750 BITSET_TEST(linkage->per_primitive32_mask, i) ? " per_primitive32" : "",
             BITSET_TEST(linkage->per_primitive16_mask, i) ? " per_primitive16" : "",
752 BITSET_TEST(linkage->convergent32_mask, i) ? " convergent32" : "",
753 BITSET_TEST(linkage->convergent16_mask, i) ? " convergent16" : "",
754 BITSET_TEST(linkage->output_equal_mask, i) ? " output_equal" : "",
755 !list_is_empty(&slot->producer.stores) ? " producer_stores" : "",
756 !list_is_empty(&slot->producer.loads) ? " producer_loads" : "",
757 !list_is_empty(&slot->consumer.loads) ? " consumer_loads" : "");
758 }
759 }
760
761 static void
slot_disable_optimizations_and_compaction(struct linkage_info *linkage,
763 unsigned i)
764 {
765 BITSET_CLEAR(linkage->output_equal_mask, i);
766 BITSET_CLEAR(linkage->convergent32_mask, i);
767 BITSET_CLEAR(linkage->convergent16_mask, i);
768 BITSET_CLEAR(linkage->interp_fp32_mask, i);
769 BITSET_CLEAR(linkage->interp_fp16_mask, i);
770 BITSET_CLEAR(linkage->flat32_mask, i);
771 BITSET_CLEAR(linkage->flat16_mask, i);
772 BITSET_CLEAR(linkage->interp_explicit32_mask, i);
773 BITSET_CLEAR(linkage->interp_explicit16_mask, i);
774 BITSET_CLEAR(linkage->interp_explicit_strict32_mask, i);
775 BITSET_CLEAR(linkage->interp_explicit_strict16_mask, i);
776 BITSET_CLEAR(linkage->per_primitive32_mask, i);
777 BITSET_CLEAR(linkage->per_primitive16_mask, i);
778 BITSET_CLEAR(linkage->no_varying32_mask, i);
779 BITSET_CLEAR(linkage->no_varying16_mask, i);
780 BITSET_CLEAR(linkage->color32_mask, i);
781 }
782
783 static void
clear_slot_info_after_removal(struct linkage_info *linkage, unsigned i, bool uses_xfb)
785 {
786 slot_disable_optimizations_and_compaction(linkage, i);
787
788 if (uses_xfb)
789 return;
790
791 linkage->slot[i].num_slots = 0;
792
793 BITSET_CLEAR(linkage->indirect_mask, i);
794 BITSET_CLEAR(linkage->removable_mask, i);
795
796 /* Transform feedback stores can't be removed. */
797 assert(!BITSET_TEST(linkage->xfb32_only_mask, i));
798 assert(!BITSET_TEST(linkage->xfb16_only_mask, i));
799 }
800
801 static bool
has_xfb(nir_intrinsic_instr *intr)
803 {
   /* This only tells whether the intrinsic is ABLE to have xfb info. */
805 if (!nir_intrinsic_has_io_xfb(intr))
806 return false;
807
808 unsigned comp = nir_intrinsic_component(intr);
809
810 if (comp >= 2)
811 return nir_intrinsic_io_xfb2(intr).out[comp - 2].num_components > 0;
812 else
813 return nir_intrinsic_io_xfb(intr).out[comp].num_components > 0;
814 }
815
816 static bool
is_interpolated_color(struct linkage_info *linkage, unsigned i)
818 {
819 if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
820 return false;
821
822 /* BFCn stores are bunched in the COLn slots with COLn, so we should never
823 * get BFCn here.
824 */
825 assert(vec4_slot(i) != VARYING_SLOT_BFC0 &&
826 vec4_slot(i) != VARYING_SLOT_BFC1);
827
828 return vec4_slot(i) == VARYING_SLOT_COL0 ||
829 vec4_slot(i) == VARYING_SLOT_COL1;
830 }
831
832 static bool
is_interpolated_texcoord(struct linkage_info *linkage, unsigned i)
834 {
835 if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
836 return false;
837
838 return vec4_slot(i) >= VARYING_SLOT_TEX0 &&
839 vec4_slot(i) <= VARYING_SLOT_TEX7;
840 }
841
842 static bool
color_uses_shade_model(struct linkage_info *linkage, unsigned i)
844 {
845 if (!is_interpolated_color(linkage, i))
846 return false;
847
848 list_for_each_entry(struct list_node, iter,
849 &linkage->slot[i].consumer.loads, head) {
850 assert(iter->instr->intrinsic == nir_intrinsic_load_interpolated_input);
851
852 nir_intrinsic_instr *baryc =
853 nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
854 if (nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE)
855 return true;
856 }
857
858 return false;
859 }
860
861 static bool
preserve_infs_nans(nir_shader *nir, unsigned bit_size)
863 {
864 unsigned mode = nir->info.float_controls_execution_mode;
865
866 return nir_is_float_control_inf_preserve(mode, bit_size) ||
867 nir_is_float_control_nan_preserve(mode, bit_size);
868 }
869
870 static bool
preserve_nans(nir_shader *nir, unsigned bit_size)
872 {
873 unsigned mode = nir->info.float_controls_execution_mode;
874
875 return nir_is_float_control_nan_preserve(mode, bit_size);
876 }
877
878 static nir_def *
build_convert_inf_to_nan(nir_builder *b, nir_def *x)
880 {
   /* Compute x*0 + x. The multiplication is marked exact below so that it
    * can't be optimized out.
    */
882 nir_def *fma = nir_ffma_imm1(b, x, 0, x);
883 nir_instr_as_alu(fma->parent_instr)->exact = true;
884 return fma;
885 }
886
887 /******************************************************************
888 * GATHERING INPUTS & OUTPUTS
889 ******************************************************************/
890
891 static bool
is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
893 nir_intrinsic_instr *intr)
894 {
895 return nir_slot_is_sysval_output(vec4_slot(slot),
896 linkage->consumer_stage) &&
897 !nir_intrinsic_io_semantics(intr).no_sysval_output;
898 }
899
900 /**
901 * This function acts like a filter. The pass won't touch varyings that
902 * return false here, and the return value is saved in the linkage bitmasks,
903 * so that all subpasses will *automatically* skip such varyings.
904 */
905 static bool
can_remove_varying(struct linkage_info *linkage, gl_varying_slot location)
907 {
908 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
909 /* User-defined varyings and fog coordinates can always be removed. */
910 if (location >= VARYING_SLOT_VAR0 ||
911 location == VARYING_SLOT_FOGC)
912 return true;
913
914 /* Workaround for mesh shader multiview in RADV.
915 * A layer output is inserted by ac_nir_lower_ngg which is called later.
916 * Prevent removing the layer input from FS when producer is MS.
917 */
918 if (linkage->producer_stage == MESA_SHADER_MESH &&
919 location == VARYING_SLOT_LAYER)
920 return false;
921
922 /* These can be removed as varyings, which means they will be demoted to
923 * sysval-only outputs keeping their culling/rasterization functions
924 * while not passing the values to FS. Drivers should handle
925 * the "no_varying" semantic to benefit from this.
926 *
927 * Note: When removing unset LAYER and VIEWPORT FS inputs, they will
928 * be replaced by 0 instead of undef.
929 */
930 if (location == VARYING_SLOT_CLIP_DIST0 ||
931 location == VARYING_SLOT_CLIP_DIST1 ||
932 location == VARYING_SLOT_CULL_DIST0 ||
933 location == VARYING_SLOT_CULL_DIST1 ||
934 location == VARYING_SLOT_LAYER ||
935 location == VARYING_SLOT_VIEWPORT)
936 return true;
937
938 /* COLn inputs can be removed only if both COLn and BFCn are not
939 * written. Both COLn and BFCn outputs can be removed if COLn inputs
940 * aren't read.
941 *
942 * TEXn inputs can never be removed in FS because of the coord replace
943 * state, but TEXn outputs can be removed if they are not read by FS.
944 */
945 if (location == VARYING_SLOT_COL0 ||
946 location == VARYING_SLOT_COL1 ||
947 location == VARYING_SLOT_BFC0 ||
948 location == VARYING_SLOT_BFC1 ||
949 (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7))
950 return true;
951
952 /* "GS -> FS" can remove the primitive ID if not written or not read. */
953 if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
954 linkage->producer_stage == MESA_SHADER_MESH) &&
955 location == VARYING_SLOT_PRIMITIVE_ID)
956 return true;
957
958 /* No other varyings can be removed. */
959 return false;
960 } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
961 /* Only VS->TES shouldn't remove TESS_LEVEL_* inputs because the values
962 * come from glPatchParameterfv.
963 *
964 * For TCS->TES, TESS_LEVEL_* outputs can be removed as varyings, which
965 * means they will be demoted to sysval-only outputs, so that drivers
966 * know that TES doesn't read them.
967 */
968 if (linkage->producer_stage == MESA_SHADER_VERTEX &&
969 (location == VARYING_SLOT_TESS_LEVEL_INNER ||
970 location == VARYING_SLOT_TESS_LEVEL_OUTER))
971 return false;
972
973 return true;
974 }
975
976 /* All other varyings can be removed. */
977 return true;
978 }
979
980 struct opt_options {
981 bool propagate_uniform_expr:1;
982 bool deduplicate:1;
983 bool inter_shader_code_motion:1;
984 bool compact:1;
985 bool disable_all:1;
986 };
987
988 /**
989 * Return which optimizations are allowed.
990 */
991 static struct opt_options
can_optimize_varying(struct linkage_info *linkage, gl_varying_slot location)
993 {
994 struct opt_options options_var = {
995 .propagate_uniform_expr = true,
996 .deduplicate = true,
997 .inter_shader_code_motion = true,
998 .compact = true,
999 };
1000 struct opt_options options_color = {
1001 .propagate_uniform_expr = true, /* only constants in [0, 1] */
1002 .deduplicate = true,
1003 .compact = true,
1004 };
1005 struct opt_options options_tex = {
1006 .propagate_uniform_expr = true, /* only TEX.zw if equal to (0, 1) */
1007 };
1008 struct opt_options options_sysval_output = {
1009 .propagate_uniform_expr = true,
1010 .deduplicate = true,
1011 };
1012 struct opt_options options_tess_levels = {
1013 .propagate_uniform_expr = true,
1014 .deduplicate = true,
1015 };
1016 struct opt_options options_disable_all = {
1017 .disable_all = true,
1018 };
1019
1020 assert(can_remove_varying(linkage, location));
1021
1022 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1023 /* xx -> FS */
1024 /* User-defined varyings and fog coordinates can always be optimized. */
1025 if (location >= VARYING_SLOT_VAR0 ||
1026 location == VARYING_SLOT_FOGC)
1027 return options_var;
1028
1029 /* The primitive ID can always be optimized in GS -> FS and MS -> FS. */
1030 if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
1031 linkage->producer_stage == MESA_SHADER_MESH) &&
1032 location == VARYING_SLOT_PRIMITIVE_ID)
1033 return options_var;
1034
1035 /* Colors can only do constant propagation if COLn and BFCn store the
1036 * same constant and the constant is between 0 and 1 (because clamp
1037 * vertex color state is unknown). Uniform propagation isn't possible
1038 * because of the clamping.
1039 *
1040 * Color components can only be deduplicated and compacted among
1041 * themselves if they have the same interpolation qualifier, and can't
1042 * be mixed with other varyings.
1043 */
1044 if (location == VARYING_SLOT_COL0 ||
1045 location == VARYING_SLOT_COL1 ||
1046 location == VARYING_SLOT_BFC0 ||
1047 location == VARYING_SLOT_BFC1)
1048 return options_color;
1049
1050 /* TEXn.zw can only be constant-propagated if the value is (0, 1)
1051 * because it matches the coord replace values.
1052 */
1053 if (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)
1054 return options_tex;
1055
1056 /* LAYER, VIEWPORT, CLIP_DISTn, and CULL_DISTn can only propagate
1057 * uniform expressions and be compacted (moved to VARn while keeping
1058 * the sysval outputs where they are).
1059 */
1060 if (location == VARYING_SLOT_LAYER ||
1061 location == VARYING_SLOT_VIEWPORT ||
1062 location == VARYING_SLOT_CLIP_DIST0 ||
1063 location == VARYING_SLOT_CLIP_DIST1 ||
1064 location == VARYING_SLOT_CULL_DIST0 ||
1065 location == VARYING_SLOT_CULL_DIST1)
1066 return options_sysval_output;
1067
1068 /* Everything else can't be read by the consumer, such as POS, PSIZ,
1069 * CLIP_VERTEX, EDGE, PRIMITIVE_SHADING_RATE, etc.
1070 */
1071 return options_disable_all;
1072 }
1073
1074 if (linkage->producer_stage == MESA_SHADER_TESS_CTRL) {
1075 /* TESS_LEVEL_* can only propagate uniform expressions.
1076 * Compaction is disabled because AMD doesn't want the varying to be
1077 * moved to PATCHn while keeping the sysval output where it is.
1078 */
1079 if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1080 location == VARYING_SLOT_TESS_LEVEL_OUTER)
1081 return options_tess_levels;
1082 }
1083
1084 /* All other shader pairs, which are (VS, TCS), (TCS, TES), (VS, TES),
1085 * (TES, GS), and (VS, GS) can compact and optimize all varyings.
1086 */
1087 return options_var;
1088 }
1089
1090 static bool
gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1092 {
1093 struct linkage_info *linkage = (struct linkage_info *)cb_data;
1094
1095 if (intr->intrinsic != nir_intrinsic_load_input &&
1096 intr->intrinsic != nir_intrinsic_load_per_vertex_input &&
1097 intr->intrinsic != nir_intrinsic_load_per_primitive_input &&
1098 intr->intrinsic != nir_intrinsic_load_interpolated_input &&
1099 intr->intrinsic != nir_intrinsic_load_input_vertex)
1100 return false;
1101
1102 /* nir_lower_io_to_scalar is required before this */
1103 assert(intr->def.num_components == 1);
1104 /* Non-zero constant offsets should have been folded by
1105 * nir_io_add_const_offset_to_base.
1106 */
1107 nir_src offset = *nir_get_io_offset_src(intr);
1108 assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1109
1110 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1111
1112 if (!can_remove_varying(linkage, sem.location))
1113 return false;
1114
1115 /* Insert the load into the list of loads for this scalar slot. */
1116 unsigned slot = intr_get_scalar_16bit_slot(intr);
1117 struct scalar_slot *in = &linkage->slot[slot];
1118 struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1119 sizeof(struct list_node));
1120 node->instr = intr;
1121 list_addtail(&node->head, &in->consumer.loads);
1122 in->num_slots = MAX2(in->num_slots, sem.num_slots);
1123
1124 BITSET_SET(linkage->removable_mask, slot);
1125
1126 enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
1127
1128 /* Determine the type of the input for compaction. Other inputs
1129 * can be compacted with indirectly-indexed vec4 slots if they
1130 * have unused components, but only if they are of the same type.
1131 */
1132 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1133 switch (intr->intrinsic) {
1134 case nir_intrinsic_load_input:
1135 fs_vec4_type = FS_VEC4_TYPE_FLAT;
1136 break;
1137 case nir_intrinsic_load_per_primitive_input:
1138 fs_vec4_type = FS_VEC4_TYPE_PER_PRIMITIVE;
1139 break;
1140 case nir_intrinsic_load_input_vertex:
1141 if (sem.interp_explicit_strict)
1142 fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT;
1143 else
1144 fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT;
1145 break;
1146 case nir_intrinsic_load_interpolated_input:
1147 if (color_uses_shade_model(linkage, slot))
1148 fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR;
1149 else if (intr->def.bit_size == 32)
1150 fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32;
1151 else if (intr->def.bit_size == 16)
1152 fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16;
1153 else
1154 unreachable("invalid load_interpolated_input type");
1155 break;
1156 default:
1157 unreachable("unexpected input load intrinsic");
1158 }
1159
1160 linkage->fs_vec4_type[sem.location] = fs_vec4_type;
1161 }
1162
1163 /* Indirect indexing. */
1164 if (!nir_src_is_const(offset)) {
1165 /* Only the indirectly-indexed component is marked as indirect. */
1166 for (unsigned i = 0; i < sem.num_slots; i++)
1167 BITSET_SET(linkage->indirect_mask, slot + i * 8);
1168
1169 /* Set the same vec4 type as the first element in all slots. */
1170 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1171 for (unsigned i = 1; i < sem.num_slots; i++)
1172 linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1173 }
1174 return false;
1175 }
1176
1177 if (!can_optimize_varying(linkage, sem.location).compact)
1178 return false;
1179
1180 /* Record inputs that can be compacted. */
1181 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1182 switch (intr->intrinsic) {
1183 case nir_intrinsic_load_input:
1184 if (intr->def.bit_size == 32)
1185 BITSET_SET(linkage->flat32_mask, slot);
1186 else if (intr->def.bit_size == 16)
1187 BITSET_SET(linkage->flat16_mask, slot);
1188 else
1189 unreachable("invalid load_input type");
1190 break;
1191 case nir_intrinsic_load_per_primitive_input:
1192 if (intr->def.bit_size == 32)
1193 BITSET_SET(linkage->per_primitive32_mask, slot);
1194 else if (intr->def.bit_size == 16)
1195 BITSET_SET(linkage->per_primitive16_mask, slot);
1196 else
1197 unreachable("invalid load_input type");
1198 break;
1199 case nir_intrinsic_load_input_vertex:
1200 if (sem.interp_explicit_strict) {
1201 if (intr->def.bit_size == 32)
1202 BITSET_SET(linkage->interp_explicit_strict32_mask, slot);
1203 else if (intr->def.bit_size == 16)
1204 BITSET_SET(linkage->interp_explicit_strict16_mask, slot);
1205 else
1206 unreachable("invalid load_input_vertex type");
1207 } else {
1208 if (intr->def.bit_size == 32)
1209 BITSET_SET(linkage->interp_explicit32_mask, slot);
1210 else if (intr->def.bit_size == 16)
1211 BITSET_SET(linkage->interp_explicit16_mask, slot);
1212 else
1213 unreachable("invalid load_input_vertex type");
1214 }
1215 break;
1216 case nir_intrinsic_load_interpolated_input:
1217 if (color_uses_shade_model(linkage, slot))
1218 BITSET_SET(linkage->color32_mask, slot);
1219 else if (intr->def.bit_size == 32)
1220 BITSET_SET(linkage->interp_fp32_mask, slot);
1221 else if (intr->def.bit_size == 16)
1222 BITSET_SET(linkage->interp_fp16_mask, slot);
1223 else
1224 unreachable("invalid load_interpolated_input type");
1225 break;
1226 default:
1227 unreachable("unexpected input load intrinsic");
1228 }
1229 } else {
1230 if (intr->def.bit_size == 32)
1231 BITSET_SET(linkage->flat32_mask, slot);
1232 else if (intr->def.bit_size == 16)
1233 BITSET_SET(linkage->flat16_mask, slot);
1234 else
1235 unreachable("invalid load_input type");
1236 }
1237 return false;
1238 }
1239
1240 static bool
gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1242 {
1243 struct linkage_info *linkage = (struct linkage_info *)cb_data;
1244
1245 if (intr->intrinsic != nir_intrinsic_store_output &&
1246 intr->intrinsic != nir_intrinsic_load_output &&
1247 intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
1248 intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
1249 intr->intrinsic != nir_intrinsic_load_per_vertex_output &&
1250 intr->intrinsic != nir_intrinsic_load_per_primitive_output)
1251 return false;
1252
1253 bool is_store =
1254 intr->intrinsic == nir_intrinsic_store_output ||
1255 intr->intrinsic == nir_intrinsic_store_per_vertex_output ||
1256 intr->intrinsic == nir_intrinsic_store_per_primitive_output;
1257
1258 if (is_store) {
1259 /* nir_lower_io_to_scalar is required before this */
1260 assert(intr->src[0].ssa->num_components == 1);
      /* nir_opt_undef is required before this. */
1262 assert(intr->src[0].ssa->parent_instr->type !=
1263 nir_instr_type_undef);
1264 } else {
1265 /* nir_lower_io_to_scalar is required before this */
1266 assert(intr->def.num_components == 1);
1267 /* Outputs loads are only allowed in TCS. */
1268 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1269 }
1270
1271 /* Non-zero constant offsets should have been folded by
1272 * nir_io_add_const_offset_to_base.
1273 */
1274 nir_src offset = *nir_get_io_offset_src(intr);
1275 assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1276
1277 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1278
1279 if (!can_remove_varying(linkage, sem.location))
1280 return false;
1281
1282 /* For "xx -> FS", treat BFCn stores as COLn to make dead varying
1283 * elimination do the right thing automatically. The rules are:
1284 * - COLn inputs can be removed only if both COLn and BFCn are not
1285 * written.
1286 * - Both COLn and BFCn outputs can be removed if COLn inputs
1287 * aren't read.
1288 */
1289 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1290 if (sem.location == VARYING_SLOT_BFC0)
1291 sem.location = VARYING_SLOT_COL0;
1292 else if (sem.location == VARYING_SLOT_BFC1)
1293 sem.location = VARYING_SLOT_COL1;
1294 }
1295
1296 /* Insert the instruction into the list of stores or loads for this
1297 * scalar slot.
1298 */
1299 unsigned slot =
1300 get_scalar_16bit_slot(sem, nir_intrinsic_component(intr));
1301
1302 struct scalar_slot *out = &linkage->slot[slot];
1303 struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1304 sizeof(struct list_node));
1305 node->instr = intr;
1306 out->num_slots = MAX2(out->num_slots, sem.num_slots);
1307
1308 if (is_store) {
1309 list_addtail(&node->head, &out->producer.stores);
1310
1311 if (has_xfb(intr)) {
1312 BITSET_SET(linkage->xfb_mask, slot);
1313
1314 if (sem.no_varying &&
1315 !is_active_sysval_output(linkage, slot, intr)) {
1316 if (intr->src[0].ssa->bit_size == 32)
1317 BITSET_SET(linkage->xfb32_only_mask, slot);
1318 else if (intr->src[0].ssa->bit_size == 16)
1319 BITSET_SET(linkage->xfb16_only_mask, slot);
1320 else
               unreachable("invalid store_output type");
1322 }
1323 }
1324 } else {
1325 list_addtail(&node->head, &out->producer.loads);
1326 }
1327
1328 BITSET_SET(linkage->removable_mask, slot);
1329
1330 /* Indirect indexing. */
1331 if (!nir_src_is_const(offset)) {
1332 /* Only the indirectly-indexed component is marked as indirect. */
1333 for (unsigned i = 0; i < sem.num_slots; i++)
1334 BITSET_SET(linkage->indirect_mask, slot + i * 8);
1335
1336 /* Set the same vec4 type as the first element in all slots. */
1337 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1338 enum fs_vec4_type fs_vec4_type =
1339 linkage->fs_vec4_type[sem.location];
1340
1341 for (unsigned i = 1; i < sem.num_slots; i++)
1342 linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1343 }
1344 return false;
1345 }
1346
1347 if (can_optimize_varying(linkage, sem.location).disable_all)
1348 return false;
1349
1350 if (is_store) {
1351 nir_def *value = intr->src[0].ssa;
1352
1353 const bool constant = value->parent_instr->type == nir_instr_type_load_const;
1354
1355 /* If the store instruction is executed in a divergent block, the value
1356 * that's stored in the output becomes divergent.
1357 *
1358 * Mesh shaders get special treatment because we can't follow their topology,
1359 * so we only propagate constants.
1360 * TODO: revisit this when workgroup divergence analysis is merged.
1361 */
1362 const bool divergent = value->divergent ||
1363 intr->instr.block->divergent ||
1364 (!constant && linkage->producer_stage == MESA_SHADER_MESH);
1365
1366 if (!out->producer.value) {
1367 /* This is the first store to this output. */
1368 BITSET_SET(linkage->output_equal_mask, slot);
1369 out->producer.value = value->parent_instr;
1370
1371 /* Set whether the value is convergent. Such varyings can be
1372 * promoted to flat regardless of their original interpolation
1373 * mode.
1374 */
1375 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && !divergent) {
1376 if (value->bit_size == 32)
1377 BITSET_SET(linkage->convergent32_mask, slot);
1378 else if (value->bit_size == 16)
1379 BITSET_SET(linkage->convergent16_mask, slot);
1380 else
1381 unreachable("invalid store_output type");
1382 }
1383 } else {
1384 /* There are multiple stores to the same output. If they store
1385 * different values, clear the mask.
1386 */
1387 if (out->producer.value != value->parent_instr)
1388 BITSET_CLEAR(linkage->output_equal_mask, slot);
1389
1390 /* Update divergence information. */
1391 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && divergent) {
1392 if (value->bit_size == 32)
1393 BITSET_CLEAR(linkage->convergent32_mask, slot);
1394 else if (value->bit_size == 16)
1395 BITSET_CLEAR(linkage->convergent16_mask, slot);
1396 else
1397 unreachable("invalid store_output type");
1398 }
1399 }
1400 } else {
1401 /* Only TCS output loads can get here.
1402 *
1403 * We need to record output loads as flat32 or flat16, otherwise
1404 * compaction will think that the slot is free and will put some
1405 * other output in its place.
1406 */
1407 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1408
1409 if (!can_optimize_varying(linkage, sem.location).compact)
1410 return false;
1411
1412 if (intr->def.bit_size == 32)
1413 BITSET_SET(linkage->flat32_mask, slot);
1414 else if (intr->def.bit_size == 16)
1415 BITSET_SET(linkage->flat16_mask, slot);
1416 else
1417          unreachable("invalid load_output type");
1418 }
1419 return false;
1420 }
1421
1422 /******************************************************************
1423 * TIDYING UP INDIRECT VARYINGS (BEFORE DEAD VARYINGS REMOVAL)
1424 ******************************************************************/
1425
1426 static void
1427 tidy_up_indirect_varyings(struct linkage_info *linkage)
1428 {
1429 unsigned i;
1430
1431    /* Indirectly-indexed slots can also have direct accesses and thus set
1432     * various bitmasks, so clear those bitmasks to make sure such slots are
1433     * neither optimized nor compacted.
1434     */
1435 BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1436 slot_disable_optimizations_and_compaction(linkage, i);
1437 }
1438
1439 /* If some slots have both direct and indirect accesses, move instructions
1440 * of such slots to the slot representing the first array element, so that
1441 * we can remove all loads/stores of dead indirectly-indexed varyings
1442 * by only looking at the first element.
1443 */
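   /* For example, if an indirectly-indexed array "out vec4 foo[3]" starts at
    * VAR0, the stores/loads recorded for the VAR1 and VAR2 components are
    * spliced into the lists of the matching VAR0 components below, so dead
    * varying removal only has to inspect the first element.
    */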
1444 BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1445 struct scalar_slot *first = &linkage->slot[i];
1446
1447 /* Skip if this is not the first array element. The first element
1448 * always sets num_slots to at least 2.
1449 */
1450 if (first->num_slots <= 1)
1451 continue;
1452
1453 /* Move instructions from other elements of the indirectly-accessed
1454 * array to the first element (by merging the linked lists).
1455 */
1456 for (unsigned elem = 1; elem < first->num_slots; elem++) {
1457 /* The component slots are at 16-bit granularity, so we need to
1458 * increment by 8 to get the same component in the next vec4 slot.
1459 */
1460 struct scalar_slot *other = &linkage->slot[i + elem * 8];
1461
1462 list_splicetail(&other->producer.stores, &first->producer.stores);
1463 list_splicetail(&other->producer.loads, &first->producer.loads);
1464 list_splicetail(&other->consumer.loads, &first->consumer.loads);
1465 list_inithead(&other->producer.stores);
1466 list_inithead(&other->producer.loads);
1467 list_inithead(&other->consumer.loads);
1468 }
1469 }
1470 }
1471
1472 /******************************************************************
1473 * TIDYING UP CONVERGENT VARYINGS
1474 ******************************************************************/
1475
1476 /**
1477  * Reorganize bitmasks for FS. They are initialized such that they may
1478  * intersect the convergent bitmasks, but we want the interpolated, flat,
1479  * and convergent masks to be mutually disjoint.
1480 */
1481 static void
1482 tidy_up_convergent_varyings(struct linkage_info *linkage)
1483 {
1484 if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
1485 return;
1486
1487 unsigned i;
1488 /* Whether to promote convergent interpolated slots to flat if it
1489 * doesn't lead to worse compaction.
1490 */
1491 bool optimize_convergent_slots = true; /* only turn off for debugging */
1492
1493 if (optimize_convergent_slots) {
1494 /* If a slot is flat and convergent and the driver can't load as flat
1495 * from interpolated vec4 slots, keep the flat bit and remove
1496 * the convergent bit. If the driver can load as flat from interpolated
1497 * vec4 slots, keep the convergent bit.
1498 *
1499 * If a slot is interpolated and convergent, remove the interpolated
1500 * bit and keep the convergent bit, which means that it's interpolated,
1501 * but can be promoted to flat.
1502 *
1503 * Since the geometry shader is the only shader that can store values
1504 * in multiple vertices before FS, it's required that all stores are
1505 * equal to be considered convergent (output_equal_mask), otherwise
1506 * the promotion to flat would be incorrect.
1507 */
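      /* For example, a slot with both interp_fp32 and convergent32 set keeps
       * only convergent32 after this loop: it is still interpolated for now,
       * but may be promoted to flat later.
       */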
1508 BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) {
1509 if (!BITSET_TEST(linkage->interp_fp32_mask, i) &&
1510 !BITSET_TEST(linkage->flat32_mask, i) &&
1511 !BITSET_TEST(linkage->color32_mask, i)) {
1512 /* Clear the flag - not used by FS. */
1513 BITSET_CLEAR(linkage->convergent32_mask, i);
1514 } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1515 BITSET_TEST(linkage->flat32_mask, i)) ||
1516 (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1517 !BITSET_TEST(linkage->output_equal_mask, i))) {
1518 /* Keep the original qualifier. */
1519 BITSET_CLEAR(linkage->convergent32_mask, i);
1520 } else {
1521 /* Keep it convergent. */
1522 BITSET_CLEAR(linkage->interp_fp32_mask, i);
1523 BITSET_CLEAR(linkage->color32_mask, i);
1524 BITSET_CLEAR(linkage->flat32_mask, i);
1525 }
1526 }
1527 BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) {
1528 if (!BITSET_TEST(linkage->interp_fp16_mask, i) &&
1529 !BITSET_TEST(linkage->flat16_mask, i)) {
1530 /* Clear the flag - not used by FS. */
1531 BITSET_CLEAR(linkage->convergent16_mask, i);
1532 } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1533 BITSET_TEST(linkage->flat16_mask, i)) ||
1534 (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1535 !BITSET_TEST(linkage->output_equal_mask, i))) {
1536 /* Keep the original qualifier. */
1537 BITSET_CLEAR(linkage->convergent16_mask, i);
1538 } else {
1539 /* Keep it convergent. */
1540 BITSET_CLEAR(linkage->interp_fp16_mask, i);
1541 BITSET_CLEAR(linkage->flat16_mask, i);
1542 }
1543 }
1544 } else {
1545 /* Don't do anything with convergent slots. */
1546 BITSET_ZERO(linkage->convergent32_mask);
1547 BITSET_ZERO(linkage->convergent16_mask);
1548 }
1549 }
1550
1551 /******************************************************************
1552 * DETERMINING UNIFORM AND UBO MOVABILITY BASED ON DRIVER LIMITS
1553 ******************************************************************/
1554
1555 static bool
1556 is_variable_present(nir_shader *nir, nir_variable *var,
1557 nir_variable_mode mode, bool spirv)
1558 {
1559 nir_foreach_variable_with_modes(it, nir, mode) {
1560 if ((spirv && it->data.binding == var->data.binding) ||
1561 (!spirv && !strcmp(it->name, var->name)))
1562 return true;
1563 }
1564 return false;
1565 }
1566
1567 /* TODO: this should be a helper in common code */
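/* Example: a uniform declared as "mat3 m[2]" has an array-of-arrays size of
 * 2 and 3 matrix columns with no dual-slot doubling, so it counts as
 * 2 * 3 * 4 = 24 scalar components (the count is in vec4 granularity, hence
 * the final multiplication by 4).
 */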
1568 static unsigned
1569 get_uniform_components(const struct glsl_type *type)
1570 {
1571 unsigned size = glsl_get_aoa_size(type);
1572 size = MAX2(size, 1);
1573 size *= glsl_get_matrix_columns(glsl_without_array(type));
1574
1575 if (glsl_type_is_dual_slot(glsl_without_array(type)))
1576 size *= 2;
1577
1578 /* Convert from vec4 to scalar. */
1579 return size * 4;
1580 }
1581
1582 static unsigned
1583 get_ubo_slots(const nir_variable *var)
1584 {
1585 if (glsl_type_is_interface(glsl_without_array(var->type))) {
1586 unsigned slots = glsl_get_aoa_size(var->type);
1587 return MAX2(slots, 1);
1588 }
1589
1590 return 1;
1591 }
1592
1593 /**
1594 * Count uniforms and see if the combined uniform component count is over
1595 * the limit. If it is, don't move any uniforms. It's sufficient if drivers
1596 * declare a very high limit.
1597 */
1598 static void
1599 determine_uniform_movability(struct linkage_info *linkage,
1600 unsigned max_uniform_components)
1601 {
1602 nir_shader *producer = linkage->producer_builder.shader;
1603 nir_shader *consumer = linkage->consumer_builder.shader;
1604 unsigned num_producer_uniforms = 0;
1605 unsigned num_consumer_uniforms = 0;
1606 unsigned num_shared_uniforms = 0;
1607
1608 nir_foreach_variable_with_modes(var, producer, nir_var_uniform) {
1609 if (is_variable_present(consumer, var, nir_var_uniform, linkage->spirv))
1610 num_shared_uniforms += get_uniform_components(var->type);
1611 else
1612 num_producer_uniforms += get_uniform_components(var->type);
1613 }
1614
1615 nir_foreach_variable_with_modes(var, consumer, nir_var_uniform) {
1616 if (!is_variable_present(producer, var, nir_var_uniform, linkage->spirv))
1617 num_consumer_uniforms += get_uniform_components(var->type);
1618 }
1619
1620 linkage->can_move_uniforms =
1621 num_producer_uniforms + num_consumer_uniforms + num_shared_uniforms <=
1622 max_uniform_components;
1623 }
1624
1625 /**
1626 * Count UBOs and see if the combined UBO count is over the limit. If it is,
1627 * don't move any UBOs. It's sufficient if drivers declare a very high limit.
1628 */
1629 static void
1630 determine_ubo_movability(struct linkage_info *linkage,
1631 unsigned max_ubos_per_stage)
1632 {
1633 nir_shader *producer = linkage->producer_builder.shader;
1634 nir_shader *consumer = linkage->consumer_builder.shader;
1635 unsigned num_producer_ubos = 0;
1636 unsigned num_consumer_ubos = 0;
1637 unsigned num_shared_ubos = 0;
1638
1639 nir_foreach_variable_with_modes(var, producer, nir_var_mem_ubo) {
1640 if (is_variable_present(consumer, var, nir_var_mem_ubo, linkage->spirv))
1641 num_shared_ubos += get_ubo_slots(var);
1642 else
1643 num_producer_ubos += get_ubo_slots(var);
1644 }
1645
1646 nir_foreach_variable_with_modes(var, consumer, nir_var_mem_ubo) {
1647 if (!is_variable_present(producer, var, nir_var_mem_ubo,
1648 linkage->spirv))
1649 num_consumer_ubos += get_ubo_slots(var);
1650 }
1651
1652 linkage->can_move_ubos =
1653 num_producer_ubos + num_consumer_ubos + num_shared_ubos <=
1654 max_ubos_per_stage;
1655 }
1656
1657 /******************************************************************
1658 * DEAD VARYINGS REMOVAL
1659 ******************************************************************/
1660
1661 static void
1662 remove_all_stores(struct linkage_info *linkage, unsigned i,
1663 bool *uses_xfb, nir_opt_varyings_progress *progress)
1664 {
1665 struct scalar_slot *slot = &linkage->slot[i];
1666
1667 assert(!list_is_empty(&slot->producer.stores) &&
1668 list_is_empty(&slot->producer.loads) &&
1669 list_is_empty(&slot->consumer.loads));
1670
1671 /* Remove all stores. */
1672 list_for_each_entry_safe(struct list_node, iter, &slot->producer.stores, head) {
1673 if (nir_remove_varying(iter->instr, linkage->consumer_stage)) {
1674 list_del(&iter->head);
1675 *progress |= nir_progress_producer;
1676 } else {
1677 if (has_xfb(iter->instr)) {
1678 *uses_xfb = true;
1679
1680 if (!is_active_sysval_output(linkage, i, iter->instr)) {
1681 if (iter->instr->src[0].ssa->bit_size == 32)
1682 BITSET_SET(linkage->xfb32_only_mask, i);
1683 else if (iter->instr->src[0].ssa->bit_size == 16)
1684 BITSET_SET(linkage->xfb16_only_mask, i);
1685 else
1686                      unreachable("invalid store_output type");
1687 }
1688 }
1689 }
1690 }
1691 }
1692
1693 static void
1694 remove_dead_varyings(struct linkage_info *linkage,
1695 nir_opt_varyings_progress *progress)
1696 {
1697 unsigned i;
1698
1699 /* Remove dead inputs and outputs. */
1700 BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
1701 struct scalar_slot *slot = &linkage->slot[i];
1702
1703       /* Only indirectly-accessed slots can have neither loads nor stores,
1704        * because tidy_up_indirect_varyings() moved them to the first element.
1705        */
1706 assert(!list_is_empty(&slot->producer.stores) ||
1707 !list_is_empty(&slot->producer.loads) ||
1708 !list_is_empty(&slot->consumer.loads) ||
1709 BITSET_TEST(linkage->indirect_mask, i));
1710
1711 /* Nothing to do if there are no loads and stores. */
1712 if (list_is_empty(&slot->producer.stores) &&
1713 list_is_empty(&slot->producer.loads) &&
1714 list_is_empty(&slot->consumer.loads))
1715 continue;
1716
1717 /* If there are producer loads (e.g. TCS) but no consumer loads
1718 * (e.g. TES), set the "no_varying" flag to indicate that the outputs
1719 * are not consumed by the next shader stage (e.g. TES).
1720 */
1721 if (!list_is_empty(&slot->producer.stores) &&
1722 !list_is_empty(&slot->producer.loads) &&
1723 list_is_empty(&slot->consumer.loads)) {
1724 for (unsigned list_index = 0; list_index < 2; list_index++) {
1725 struct list_head *list = list_index ? &slot->producer.stores :
1726 &slot->producer.loads;
1727
1728 list_for_each_entry(struct list_node, iter, list, head) {
1729 nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr);
1730 sem.no_varying = 1;
1731 nir_intrinsic_set_io_semantics(iter->instr, sem);
1732 }
1733 }
1734
1735 /* This tells the compaction to move these varyings to the end. */
1736 if (BITSET_TEST(linkage->flat32_mask, i)) {
1737 assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1738 BITSET_CLEAR(linkage->flat32_mask, i);
1739 BITSET_SET(linkage->no_varying32_mask, i);
1740 }
1741 if (BITSET_TEST(linkage->flat16_mask, i)) {
1742 assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1743 BITSET_CLEAR(linkage->flat16_mask, i);
1744 BITSET_SET(linkage->no_varying16_mask, i);
1745 }
1746 continue;
1747 }
1748
1749 /* The varyings aren't dead if both loads and stores are present. */
1750 if (!list_is_empty(&slot->producer.stores) &&
1751 (!list_is_empty(&slot->producer.loads) ||
1752 !list_is_empty(&slot->consumer.loads)))
1753 continue;
1754
1755 bool uses_xfb = false;
1756
1757 if (list_is_empty(&slot->producer.stores)) {
1758 /* There are no stores. */
1759 assert(!list_is_empty(&slot->producer.loads) ||
1760 !list_is_empty(&slot->consumer.loads));
1761
1762 /* TEXn.xy loads can't be removed in FS because of the coord
1763 * replace state, but TEXn outputs can be removed if they are
1764 * not read by FS.
1765 *
1766 * TEXn.zw loads can be eliminated and replaced by (0, 1), which
1767 * is equal to the coord replace value.
1768 */
1769 if (is_interpolated_texcoord(linkage, i)) {
1770 assert(i % 2 == 0); /* high 16-bit slots disallowed */
1771 /* Keep TEXn.xy. */
1772 if (i % 8 < 4)
1773 continue;
1774 }
1775
1776 /* Replace all loads with undef. Do that for both input loads
1777 * in the consumer stage and output loads in the producer stage
1778 * because we also want to eliminate TCS loads that have no
1779 * corresponding TCS stores.
1780 */
1781 for (unsigned list_index = 0; list_index < 2; list_index++) {
1782 struct list_head *list = list_index ? &slot->producer.loads :
1783 &slot->consumer.loads;
1784 nir_builder *b = list_index ? &linkage->producer_builder :
1785 &linkage->consumer_builder;
1786
1787 list_for_each_entry(struct list_node, iter, list, head) {
1788 nir_intrinsic_instr *loadi = iter->instr;
1789 nir_def *replacement = NULL;
1790
1791 b->cursor = nir_before_instr(&loadi->instr);
1792
1793 /* LAYER and VIEWPORT FS inputs should be replaced by 0
1794 * instead of undef.
1795 */
1796 gl_varying_slot location = (gl_varying_slot)(vec4_slot(i));
1797
1798 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
1799 (location == VARYING_SLOT_LAYER ||
1800 location == VARYING_SLOT_VIEWPORT ||
1801 /* TEXn.z is replaced by 0 (matching coord replace) */
1802 (is_interpolated_texcoord(linkage, i) && i % 8 == 4)))
1803 replacement = nir_imm_intN_t(b, 0, loadi->def.bit_size);
1804 else if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
1805 /* TEXn.w is replaced by 1 (matching coord replace) */
1806 is_interpolated_texcoord(linkage, i) && i % 8 == 6)
1807 replacement = nir_imm_floatN_t(b, 1, loadi->def.bit_size);
1808 else
1809 replacement = nir_undef(b, 1, loadi->def.bit_size);
1810
1811 nir_def_replace(&loadi->def, replacement);
1812
1813 *progress |= list_index ? nir_progress_producer :
1814 nir_progress_consumer;
1815 }
1816 }
1817
1818 /* Clear the lists. */
1819 list_inithead(&slot->producer.loads);
1820 list_inithead(&slot->consumer.loads);
1821 } else {
1822 /* There are no loads. */
1823 remove_all_stores(linkage, i, &uses_xfb, progress);
1824 }
1825
1826 /* Clear bitmasks associated with this varying slot or array. */
1827 for (unsigned elem = 0; elem < slot->num_slots; elem++)
1828 clear_slot_info_after_removal(linkage, i + elem, uses_xfb);
1829 }
1830 }
1831
1832 /******************************************************************
1833 * SSA CLONING HELPERS
1834 ******************************************************************/
1835
1836 /* Pass flags for inter-shader code motion. Also used by helpers. */
1837 #define FLAG_ALU_IS_TES_INTERP_LOAD BITFIELD_BIT(0)
1838 #define FLAG_MOVABLE BITFIELD_BIT(1)
1839 #define FLAG_UNMOVABLE BITFIELD_BIT(2)
1840 #define FLAG_POST_DOMINATOR_PROCESSED BITFIELD_BIT(3)
1841 #define FLAG_GATHER_LOADS_VISITED BITFIELD_BIT(4)
1842
1843 #define FLAG_INTERP_MASK BITFIELD_RANGE(5, 3)
1844 #define FLAG_INTERP_CONVERGENT (0 << 5)
1845 #define FLAG_INTERP_FLAT (1 << 5)
1846 /* FS-only interpolation modes. */
1847 #define FLAG_INTERP_PERSP_PIXEL (2 << 5)
1848 #define FLAG_INTERP_PERSP_CENTROID (3 << 5)
1849 #define FLAG_INTERP_PERSP_SAMPLE (4 << 5)
1850 #define FLAG_INTERP_LINEAR_PIXEL (5 << 5)
1851 #define FLAG_INTERP_LINEAR_CENTROID (6 << 5)
1852 #define FLAG_INTERP_LINEAR_SAMPLE (7 << 5)
1853 /* TES-only interpolation modes. (these were found in shaders) */
1854 #define FLAG_INTERP_TES_TRIANGLE_UVW (2 << 5) /* v0*u + v1*v + v2*w */
1855 #define FLAG_INTERP_TES_TRIANGLE_WUV (3 << 5) /* v0*w + v1*u + v2*v */
1856 /* TODO: Feel free to insert more TES interpolation equations here. */
1857
1858 static bool
1859 can_move_deref_between_shaders(struct linkage_info *linkage, nir_instr *instr)
1860 {
1861 nir_deref_instr *deref = nir_instr_as_deref(instr);
1862 unsigned allowed_modes =
1863 (linkage->can_move_uniforms ? nir_var_uniform : 0) |
1864 (linkage->can_move_ubos ? nir_var_mem_ubo : 0);
1865
1866 if (!nir_deref_mode_is_one_of(deref, allowed_modes))
1867 return false;
1868
1869 /* Indirectly-indexed uniforms and UBOs are not moved into later shaders
1870 * due to performance concerns, and they are not moved into previous shaders
1871 * because it's unimplemented (TODO).
1872 */
1873 if (nir_deref_instr_has_indirect(deref))
1874 return false;
1875
1876 nir_variable *var = nir_deref_instr_get_variable(deref);
1877
1878    /* Subroutine uniforms are not moved. Moving them does work (subroutines
1879     * have been inlined at this point), but subroutine functions aren't
1880     * moved along with them, and the linker doesn't accept a shader that
1881     * contains a subroutine uniform without any subroutine functions. This
1882     * could be fixed in the linker, but for now, don't move subroutine
1883     * uniforms.
1884     */
1885 if (var->name && strstr(var->name, "__subu_") == var->name)
1886 return false;
1887
1888 return true;
1889 }
1890
1891 static nir_intrinsic_instr *
1892 find_per_vertex_load_for_tes_interp(nir_instr *instr)
1893 {
1894 switch (instr->type) {
1895 case nir_instr_type_alu: {
1896 nir_alu_instr *alu = nir_instr_as_alu(instr);
1897 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
1898
1899 for (unsigned i = 0; i < num_srcs; i++) {
1900 nir_instr *src = alu->src[i].src.ssa->parent_instr;
1901 nir_intrinsic_instr *intr = find_per_vertex_load_for_tes_interp(src);
1902
1903 if (intr)
1904 return intr;
1905 }
1906 return NULL;
1907 }
1908
1909 case nir_instr_type_intrinsic: {
1910 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1911
1912 return intr->intrinsic == nir_intrinsic_load_per_vertex_input ?
1913 intr : NULL;
1914 }
1915
1916 default:
1917 unreachable("unexpected instruction type");
1918 }
1919 }
1920
1921 static nir_def *
1922 get_stored_value_for_load(struct linkage_info *linkage, nir_instr *instr)
1923 {
1924 nir_intrinsic_instr *intr;
1925
1926 if (instr->type == nir_instr_type_intrinsic) {
1927 intr = nir_instr_as_intrinsic(instr);
1928 } else {
1929 assert(instr->type == nir_instr_type_alu &&
1930 instr->pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD);
1931 intr = find_per_vertex_load_for_tes_interp(instr);
1932 }
1933
1934 unsigned slot_index = intr_get_scalar_16bit_slot(intr);
1935 assert(list_is_singular(&linkage->slot[slot_index].producer.stores));
1936
1937 nir_def *stored_value =
1938 list_first_entry(&linkage->slot[slot_index].producer.stores,
1939 struct list_node, head)->instr->src[0].ssa;
1940 assert(stored_value->num_components == 1);
1941 return stored_value;
1942 }
1943
1944 /* Clone the SSA, which can be in a different shader. */
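/* The expression rooted at 'ssa' is rebuilt recursively at the builder's
 * cursor: constants and undefs are re-materialized, uniform/UBO deref loads
 * are re-created (declaring the variable in the target shader if needed),
 * and, when cloning into the producer, input loads are replaced by the value
 * stored in the corresponding output.
 */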
1945 static nir_def *
1946 clone_ssa(struct linkage_info *linkage, nir_builder *b, nir_def *ssa)
1947 {
1948 switch (ssa->parent_instr->type) {
1949 case nir_instr_type_load_const:
1950 return nir_build_imm(b, ssa->num_components, ssa->bit_size,
1951 nir_instr_as_load_const(ssa->parent_instr)->value);
1952
1953 case nir_instr_type_undef:
1954 return nir_undef(b, ssa->num_components, ssa->bit_size);
1955
1956 case nir_instr_type_alu: {
1957 nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr);
1958
1959 if (alu->instr.pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD) {
1960 /* We are cloning an interpolated TES load in the producer for
1961 * backward inter-shader code motion.
1962 */
1963 assert(&linkage->producer_builder == b);
1964 return get_stored_value_for_load(linkage, &alu->instr);
1965 }
1966
1967 nir_def *src[4] = {0};
1968 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
1969 assert(num_srcs <= ARRAY_SIZE(src));
1970
1971 for (unsigned i = 0; i < num_srcs; i++)
1972 src[i] = clone_ssa(linkage, b, alu->src[i].src.ssa);
1973
1974 nir_def *clone = nir_build_alu(b, alu->op, src[0], src[1], src[2], src[3]);
1975 nir_alu_instr *alu_clone = nir_instr_as_alu(clone->parent_instr);
1976
1977 alu_clone->exact = alu->exact;
1978 alu_clone->no_signed_wrap = alu->no_signed_wrap;
1979 alu_clone->no_unsigned_wrap = alu->no_unsigned_wrap;
1980 alu_clone->def.num_components = alu->def.num_components;
1981 alu_clone->def.bit_size = alu->def.bit_size;
1982
1983 for (unsigned i = 0; i < num_srcs; i++) {
1984 memcpy(alu_clone->src[i].swizzle, alu->src[i].swizzle,
1985 NIR_MAX_VEC_COMPONENTS);
1986 }
1987
1988 return clone;
1989 }
1990
1991 case nir_instr_type_intrinsic: {
1992 /* Clone load_deref of uniform or ubo. It's the only thing that can
1993 * occur here.
1994 */
1995 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr);
1996
1997 switch (intr->intrinsic) {
1998 case nir_intrinsic_load_deref: {
1999 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
2000
2001 assert(deref);
2002 assert(nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo));
2003 /* Indirect uniform indexing is disallowed here. */
2004 assert(!nir_deref_instr_has_indirect(deref));
2005
2006 /* Get the uniform from the original shader. */
2007 nir_variable *var = nir_deref_instr_get_variable(deref);
2008 assert(!(var->data.mode & nir_var_mem_ubo) || linkage->can_move_ubos);
2009
2010 /* Declare the uniform in the target shader. If it's the same shader
2011 * (in the case of replacing output loads with a uniform), this has
2012 * no effect.
2013 */
2014 var = nir_clone_uniform_variable(b->shader, var, linkage->spirv);
2015
2016 /* Re-build the uniform deref load before the load. */
2017 nir_deref_instr *load_uniform_deref =
2018 nir_clone_deref_instr(b, var, deref);
2019
2020 return nir_load_deref(b, load_uniform_deref);
2021 }
2022
2023 case nir_intrinsic_load_input:
2024 case nir_intrinsic_load_per_primitive_input:
2025 case nir_intrinsic_load_interpolated_input: {
2026 /* We are cloning load_input in the producer for backward
2027 * inter-shader code motion. Replace the input load with the stored
2028 * output value. That way we can clone any expression using inputs
2029 * from the consumer in the producer.
2030 */
2031 assert(&linkage->producer_builder == b);
2032 return get_stored_value_for_load(linkage, &intr->instr);
2033 }
2034
2035 default:
2036 unreachable("unexpected intrinsic");
2037 }
2038 }
2039
2040 default:
2041 unreachable("unexpected instruction type");
2042 }
2043 }
2044
2045 /******************************************************************
2046 * UNIFORM EXPRESSION PROPAGATION (CONSTANTS, UNIFORMS, UBO LOADS)
2047 ******************************************************************/
2048
2049 static void
2050 remove_all_stores_and_clear_slot(struct linkage_info *linkage, unsigned slot,
2051 nir_opt_varyings_progress *progress)
2052 {
2053 bool uses_xfb = false;
2054 remove_all_stores(linkage, slot, &uses_xfb, progress);
2055 clear_slot_info_after_removal(linkage, slot, uses_xfb);
2056 }
2057
2058 struct is_uniform_expr_state {
2059 struct linkage_info *linkage;
2060 unsigned cost;
2061 };
2062
2063 static bool
2064 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state);
2065
2066 static bool
2067 src_is_uniform_expression(nir_src *src, void *data)
2068 {
2069 return is_uniform_expression(src->ssa->parent_instr,
2070 (struct is_uniform_expr_state*)data);
2071 }
2072
2073 /**
2074 * Return whether instr is a uniform expression that can be moved into
2075 * the next shader.
2076 */
2077 static bool
2078 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state)
2079 {
2080 const nir_shader_compiler_options *options =
2081 state->linkage->producer_builder.shader->options;
2082
2083 switch (instr->type) {
2084 case nir_instr_type_load_const:
2085 case nir_instr_type_undef:
2086 return true;
2087
2088 case nir_instr_type_alu:
2089 state->cost += options->varying_estimate_instr_cost ?
2090 options->varying_estimate_instr_cost(instr) : 1;
2091 return nir_foreach_src(instr, src_is_uniform_expression, state);
2092
2093 case nir_instr_type_intrinsic:
2094 if (nir_instr_as_intrinsic(instr)->intrinsic ==
2095 nir_intrinsic_load_deref) {
2096 state->cost += options->varying_estimate_instr_cost ?
2097 options->varying_estimate_instr_cost(instr) : 1;
2098 return nir_foreach_src(instr, src_is_uniform_expression, state);
2099 }
2100 return false;
2101
2102 case nir_instr_type_deref:
2103 return can_move_deref_between_shaders(state->linkage, instr);
2104
2105 default:
2106 return false;
2107 }
2108 }
2109
2110 /**
2111 * Propagate constants, uniforms, UBO loads, and uniform expressions
2112 * in output components to inputs loads in the next shader and output
2113 * loads in the current stage, and remove the output components.
2114 *
2115 * Uniform expressions are ALU expressions only sourcing constants, uniforms,
2116 * and UBO loads.
2117 */
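/* Example: if the only value a producer ever stores in an output is
 * a hypothetical uniform expression like "u.scale * 2.0 + 1.0" (u being a
 * UBO), the whole expression is rebuilt in front of every load of the
 * corresponding input (and of every TCS output load), and the output stores
 * are then removed.
 */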
2118 static void
2119 propagate_uniform_expressions(struct linkage_info *linkage,
2120 nir_opt_varyings_progress *progress)
2121 {
2122 unsigned i;
2123
2124 /* Clear pass_flags, which is used by clone_ssa. */
2125 nir_shader_clear_pass_flags(linkage->consumer_builder.shader);
2126
2127 /* Find uniform expressions. If there are multiple stores, they should all
2128 * store the same value. That's guaranteed by output_equal_mask.
2129 */
2130 BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2131 if (!can_optimize_varying(linkage, vec4_slot(i)).propagate_uniform_expr)
2132 continue;
2133
2134 struct scalar_slot *slot = &linkage->slot[i];
2135 assert(!list_is_empty(&slot->producer.loads) ||
2136 !list_is_empty(&slot->consumer.loads));
2137
2138 struct is_uniform_expr_state state = {
2139 .linkage = linkage,
2140 .cost = 0,
2141 };
2142
2143 if (!is_uniform_expression(slot->producer.value, &state))
2144 continue;
2145
2146 if (state.cost > linkage->max_varying_expression_cost)
2147 continue;
2148
2149       /* Colors can be propagated only if they are constants within [0, 1]
2150        * because that's the only case in which the clamp vertex color state
2151        * has no effect.
2152 */
2153 if (is_interpolated_color(linkage, i) &&
2154 (slot->producer.value->type != nir_instr_type_load_const ||
2155 nir_instr_as_load_const(slot->producer.value)->value[0].f32 < 0 ||
2156 nir_instr_as_load_const(slot->producer.value)->value[0].f32 > 1))
2157 continue;
2158
2159 /* TEXn.zw can be propagated only if it's equal to (0, 1) because it's
2160 * the coord replace value.
2161 */
2162 if (is_interpolated_texcoord(linkage, i)) {
2163 assert(i % 2 == 0); /* high 16-bit slots disallowed */
2164
2165 if (i % 8 == 0 || /* TEXn.x */
2166 i % 8 == 2 || /* TEXn.y */
2167 slot->producer.value->type != nir_instr_type_load_const)
2168 continue;
2169
2170 float value =
2171 nir_instr_as_load_const(slot->producer.value)->value[0].f32;
2172
2173 /* This ignores signed zeros, but those are destroyed by
2174 * interpolation, so it doesn't matter.
2175 */
2176 if ((i % 8 == 4 && value != 0) ||
2177 (i % 8 == 6 && value != 1))
2178 continue;
2179 }
2180
2181 /* Replace all loads. Do that for both input and output loads. */
2182 for (unsigned list_index = 0; list_index < 2; list_index++) {
2183 struct list_head *load = list_index ? &slot->producer.loads :
2184 &slot->consumer.loads;
2185 nir_builder *b = list_index ? &linkage->producer_builder :
2186 &linkage->consumer_builder;
2187
2188 list_for_each_entry(struct list_node, node, load, head) {
2189 nir_intrinsic_instr *loadi = node->instr;
2190 b->cursor = nir_before_instr(&loadi->instr);
2191
2192 /* Copy the uniform expression before the load. */
2193 nir_def *clone = clone_ssa(linkage, b,
2194 nir_instr_def(slot->producer.value));
2195
2196 /* Interpolation converts Infs to NaNs. If we skip it, we need to
2197 * convert Infs to NaNs manually.
2198 */
2199 if (loadi->intrinsic == nir_intrinsic_load_interpolated_input &&
2200 preserve_nans(b->shader, clone->bit_size))
2201 clone = build_convert_inf_to_nan(b, clone);
2202
2203 /* Replace the original load. */
2204 nir_def_replace(&loadi->def, clone);
2205 *progress |= list_index ? nir_progress_producer :
2206 nir_progress_consumer;
2207 }
2208 }
2209
2210 /* Clear the lists. */
2211 list_inithead(&slot->producer.loads);
2212 list_inithead(&slot->consumer.loads);
2213
2214 /* Remove all stores now that loads have been replaced. */
2215 remove_all_stores_and_clear_slot(linkage, i, progress);
2216 }
2217 }
2218
2219 /******************************************************************
2220 * OUTPUT DEDUPLICATION
2221 ******************************************************************/
2222
2223 /* We can only deduplicate outputs that have the same qualifier, and color
2224 * components must be deduplicated separately because they are affected by GL
2225 * states.
2226 *
2227 * QUAL_*_INTERP_ANY means that the interpolation qualifier doesn't matter for
2228 * deduplication as long as it's not flat.
2229 *
2230 * QUAL_COLOR_SHADEMODEL_ANY is the same, but can be switched to flat
2231 * by the flatshade state, so it can't be deduplicated with
2232 * QUAL_COLOR_INTERP_ANY, which is never flat.
2233 */
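/* For example, two outputs that store the same SSA value and are both read
 * with QUAL_VAR_PERSP_PIXEL can be merged into one, while an identical value
 * read as a color cannot be merged with them, because the clamp-vertex-color
 * and flat-shade states apply only to colors.
 */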
2234 enum var_qualifier {
2235 QUAL_PATCH,
2236 QUAL_VAR_FLAT,
2237 QUAL_COLOR_FLAT,
2238 QUAL_EXPLICIT,
2239 QUAL_EXPLICIT_STRICT,
2240 QUAL_PER_PRIMITIVE,
2241 /* When nir_io_has_flexible_input_interpolation_except_flat is set: */
2242 QUAL_VAR_INTERP_ANY,
2243 QUAL_COLOR_INTERP_ANY,
2244 QUAL_COLOR_SHADEMODEL_ANY,
2245 /* When nir_io_has_flexible_input_interpolation_except_flat is unset: */
2246 QUAL_VAR_PERSP_PIXEL,
2247 QUAL_VAR_PERSP_CENTROID,
2248 QUAL_VAR_PERSP_SAMPLE,
2249 QUAL_VAR_LINEAR_PIXEL,
2250 QUAL_VAR_LINEAR_CENTROID,
2251 QUAL_VAR_LINEAR_SAMPLE,
2252 QUAL_COLOR_PERSP_PIXEL,
2253 QUAL_COLOR_PERSP_CENTROID,
2254 QUAL_COLOR_PERSP_SAMPLE,
2255 QUAL_COLOR_LINEAR_PIXEL,
2256 QUAL_COLOR_LINEAR_CENTROID,
2257 QUAL_COLOR_LINEAR_SAMPLE,
2258 QUAL_COLOR_SHADEMODEL_PIXEL,
2259 QUAL_COLOR_SHADEMODEL_CENTROID,
2260 QUAL_COLOR_SHADEMODEL_SAMPLE,
2261 NUM_DEDUP_QUALIFIERS,
2262
2263 QUAL_SKIP,
2264 QUAL_UNKNOWN,
2265 };
2266
2267 /* Return the input qualifier if all loads use the same one, else skip.
2268 * This is only used by output deduplication to determine input compatibility.
2269 */
2270 static enum var_qualifier
2271 get_input_qualifier(struct linkage_info *linkage, unsigned i)
2272 {
2273 assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
2274 struct scalar_slot *slot = &linkage->slot[i];
2275 bool is_color = is_interpolated_color(linkage, i);
2276 nir_intrinsic_instr *load =
2277 list_first_entry(&slot->consumer.loads, struct list_node, head)->instr;
2278
2279 if (load->intrinsic == nir_intrinsic_load_input)
2280 return is_color ? QUAL_COLOR_FLAT : QUAL_VAR_FLAT;
2281
2282 if (load->intrinsic == nir_intrinsic_load_per_primitive_input)
2283 return QUAL_PER_PRIMITIVE;
2284
2285 if (load->intrinsic == nir_intrinsic_load_input_vertex) {
2286 return nir_intrinsic_io_semantics(load).interp_explicit_strict ?
2287 QUAL_EXPLICIT_STRICT : QUAL_EXPLICIT;
2288 }
2289
2290 assert(load->intrinsic == nir_intrinsic_load_interpolated_input);
2291 nir_intrinsic_instr *baryc =
2292 nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
2293
2294 if (linkage->consumer_builder.shader->options->io_options &
2295 nir_io_has_flexible_input_interpolation_except_flat) {
2296 if (is_color) {
2297 return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ?
2298 QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY;
2299 } else {
2300 return QUAL_VAR_INTERP_ANY;
2301 }
2302 }
2303
2304 /* Get the exact interpolation qualifier. */
2305 unsigned pixel_location;
2306 enum var_qualifier qual;
2307
2308 switch (baryc->intrinsic) {
2309 case nir_intrinsic_load_barycentric_pixel:
2310 pixel_location = 0;
2311 break;
2312 case nir_intrinsic_load_barycentric_centroid:
2313 pixel_location = 1;
2314 break;
2315 case nir_intrinsic_load_barycentric_sample:
2316 pixel_location = 2;
2317 break;
2318 case nir_intrinsic_load_barycentric_at_offset:
2319 case nir_intrinsic_load_barycentric_at_sample:
2320 /* Don't deduplicate outputs that are interpolated at offset/sample. */
2321 return QUAL_SKIP;
2322 default:
2323 unreachable("unexpected barycentric src");
2324 }
2325
2326 switch (nir_intrinsic_interp_mode(baryc)) {
2327 case INTERP_MODE_NONE:
2328 qual = is_color ? QUAL_COLOR_SHADEMODEL_PIXEL :
2329 QUAL_VAR_PERSP_PIXEL;
2330 break;
2331 case INTERP_MODE_SMOOTH:
2332 qual = is_color ? QUAL_COLOR_PERSP_PIXEL : QUAL_VAR_PERSP_PIXEL;
2333 break;
2334 case INTERP_MODE_NOPERSPECTIVE:
2335 qual = is_color ? QUAL_COLOR_LINEAR_PIXEL : QUAL_VAR_LINEAR_PIXEL;
2336 break;
2337 default:
2338 unreachable("unexpected interp mode");
2339 }
2340
2341 /* The ordering of the "qual" enum was carefully chosen to make this
2342 * addition correct.
2343 */
2344 STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID);
2345 STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 2 == QUAL_VAR_PERSP_SAMPLE);
2346 STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 1 == QUAL_VAR_LINEAR_CENTROID);
2347 STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 2 == QUAL_VAR_LINEAR_SAMPLE);
2348 STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 1 == QUAL_COLOR_PERSP_CENTROID);
2349 STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 2 == QUAL_COLOR_PERSP_SAMPLE);
2350 STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 1 == QUAL_COLOR_LINEAR_CENTROID);
2351 STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 2 == QUAL_COLOR_LINEAR_SAMPLE);
2352 STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 1 ==
2353 QUAL_COLOR_SHADEMODEL_CENTROID);
2354 STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 2 ==
2355 QUAL_COLOR_SHADEMODEL_SAMPLE);
2356 return qual + pixel_location;
2357 }
2358
2359 static void
2360 deduplicate_outputs(struct linkage_info *linkage,
2361 nir_opt_varyings_progress *progress)
2362 {
2363 struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = {NULL};
2364 unsigned i;
2365
2366 /* Find duplicated outputs. If there are multiple stores, they should all
2367 * store the same value as all stores of some other output. That's
2368 * guaranteed by output_equal_mask.
2369 */
2370 BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2371 if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate)
2372 continue;
2373
2374 struct scalar_slot *slot = &linkage->slot[i];
2375 enum var_qualifier qualifier;
2376 gl_varying_slot var_slot = vec4_slot(i);
2377
2378 /* Determine which qualifier this slot has. */
2379 if ((var_slot >= VARYING_SLOT_PATCH0 &&
2380 var_slot <= VARYING_SLOT_PATCH31) ||
2381 var_slot == VARYING_SLOT_TESS_LEVEL_INNER ||
2382 var_slot == VARYING_SLOT_TESS_LEVEL_OUTER)
2383 qualifier = QUAL_PATCH;
2384 else if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
2385 qualifier = QUAL_VAR_FLAT;
2386 else
2387 qualifier = get_input_qualifier(linkage, i);
2388
2389 if (qualifier == QUAL_SKIP)
2390 continue;
2391
2392 struct hash_table **table = &tables[qualifier];
2393 if (!*table)
2394 *table = _mesa_pointer_hash_table_create(NULL);
2395
2396 nir_instr *value = slot->producer.value;
2397
2398 struct hash_entry *entry = _mesa_hash_table_search(*table, value);
2399 if (!entry) {
2400 _mesa_hash_table_insert(*table, value, (void*)(uintptr_t)i);
2401 continue;
2402 }
2403
2404 /* We've found a duplicate. Redirect loads and remove stores. */
2405 struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data];
2406 nir_intrinsic_instr *store =
2407 list_first_entry(&found_slot->producer.stores,
2408 struct list_node, head)->instr;
2409 nir_io_semantics sem = nir_intrinsic_io_semantics(store);
2410 unsigned component = nir_intrinsic_component(store);
2411
2412 /* Redirect loads. */
2413 for (unsigned list_index = 0; list_index < 2; list_index++) {
2414 struct list_head *src_loads = list_index ? &slot->producer.loads :
2415 &slot->consumer.loads;
2416 struct list_head *dst_loads = list_index ? &found_slot->producer.loads :
2417 &found_slot->consumer.loads;
2418 bool has_progress = !list_is_empty(src_loads);
2419
2420 list_for_each_entry(struct list_node, iter, src_loads, head) {
2421 nir_intrinsic_instr *loadi = iter->instr;
2422
2423 nir_intrinsic_set_io_semantics(loadi, sem);
2424 nir_intrinsic_set_component(loadi, component);
2425
2426 /* We also need to set the base to match the duplicate load, so
2427 * that CSE can eliminate it.
2428 */
2429 if (!list_is_empty(dst_loads)) {
2430 struct list_node *first =
2431 list_first_entry(dst_loads, struct list_node, head);
2432 nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr));
2433 } else {
2434 /* Use the base of the found store if there are no loads (it can
2435 * only happen with TCS).
2436 */
2437 assert(list_index == 0);
2438 nir_intrinsic_set_base(loadi, nir_intrinsic_base(store));
2439 }
2440 }
2441
2442 if (has_progress) {
2443 /* Move the redirected loads to the found slot, so that compaction
2444 * can find them.
2445 */
2446 list_splicetail(src_loads, dst_loads);
2447 list_inithead(src_loads);
2448
2449 *progress |= list_index ? nir_progress_producer :
2450 nir_progress_consumer;
2451 }
2452 }
2453
2454 /* Remove all duplicated stores now that loads have been redirected. */
2455 remove_all_stores_and_clear_slot(linkage, i, progress);
2456 }
2457
2458 for (unsigned i = 0; i < ARRAY_SIZE(tables); i++)
2459 _mesa_hash_table_destroy(tables[i], NULL);
2460 }
2461
2462 /******************************************************************
2463 * FIND OPEN-CODED TES INPUT INTERPOLATION
2464 ******************************************************************/
2465
2466 static bool
2467 is_sysval(nir_instr *instr, gl_system_value sysval)
2468 {
2469 if (instr->type == nir_instr_type_intrinsic) {
2470 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2471
2472 if (intr->intrinsic == nir_intrinsic_from_system_value(sysval))
2473 return true;
2474
2475 if (intr->intrinsic == nir_intrinsic_load_deref) {
2476 nir_deref_instr *deref =
2477 nir_instr_as_deref(intr->src[0].ssa->parent_instr);
2478
2479 return nir_deref_mode_is_one_of(deref, nir_var_system_value) &&
2480 deref->var->data.location == sysval;
2481 }
2482 }
2483
2484 return false;
2485 }
2486
2487 static nir_alu_instr *
2488 get_single_use_as_alu(nir_def *def)
2489 {
2490 /* Only 1 use allowed. */
2491 if (!list_is_singular(&def->uses))
2492 return NULL;
2493
2494 nir_instr *instr =
2495 nir_src_parent_instr(list_first_entry(&def->uses, nir_src, use_link));
2496 if (instr->type != nir_instr_type_alu)
2497 return NULL;
2498
2499 return nir_instr_as_alu(instr);
2500 }
2501
2502 static nir_alu_instr *
2503 check_tes_input_load_get_single_use_alu(nir_intrinsic_instr *load,
2504 unsigned *vertex_index,
2505 unsigned *vertices_used,
2506 unsigned max_vertices)
2507 {
2508 if (load->intrinsic != nir_intrinsic_load_per_vertex_input)
2509 return NULL;
2510
2511 /* Check the vertex index. Each vertex can be loaded only once. */
2512 if (!nir_src_is_const(load->src[0]))
2513       return NULL;
2514
2515 *vertex_index = nir_src_as_uint(load->src[0]);
2516 if (*vertex_index >= max_vertices ||
2517 *vertices_used & BITFIELD_BIT(*vertex_index))
2518       return NULL;
2519
2520 *vertices_used |= BITFIELD_BIT(*vertex_index);
2521
2522 return get_single_use_as_alu(&load->def);
2523 }
2524
2525 static bool
2526 gather_fmul_tess_coord(nir_intrinsic_instr *load, nir_alu_instr *fmul,
2527 unsigned vertex_index, unsigned *tess_coord_swizzle,
2528 unsigned *tess_coord_used, nir_def **load_tess_coord)
2529 {
2530 unsigned other_src = fmul->src[0].src.ssa == &load->def;
2531 nir_instr *other_instr = fmul->src[other_src].src.ssa->parent_instr;
2532
2533 assert(fmul->src[!other_src].swizzle[0] == 0);
2534
2535 if (!is_sysval(other_instr, SYSTEM_VALUE_TESS_COORD))
2536 return false;
2537
2538 unsigned tess_coord_component = fmul->src[other_src].swizzle[0];
2539 /* Each tesscoord component can be used only once. */
2540 if (*tess_coord_used & BITFIELD_BIT(tess_coord_component))
2541 return false;
2542
2543 *tess_coord_swizzle |= tess_coord_component << (4 * vertex_index);
2544 *tess_coord_used |= BITFIELD_BIT(tess_coord_component);
2545 *load_tess_coord = &nir_instr_as_intrinsic(other_instr)->def;
2546 return true;
2547 }
2548
2549 /**
2550 * Find interpolation of the form:
2551 * input[0].slot * TessCoord.a +
2552 * input[1].slot * TessCoord.b +
2553 * input[2].slot * TessCoord.c;
2554 *
2555 * a,b,c can be any of x,y,z, but each can occur only once.
2556 */
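/* In source terms, this is what a TES expression such as
 *    p0 * gl_TessCoord.x + p1 * gl_TessCoord.y + p2 * gl_TessCoord.z
 * (p0..p2 being per-vertex inputs) typically scalarizes to: 3 fmuls feeding
 * 2 fadds per component.
 */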
2557 static bool
2558 find_tes_triangle_interp_3fmul_2fadd(struct linkage_info *linkage, unsigned i)
2559 {
2560 struct scalar_slot *slot = &linkage->slot[i];
2561 unsigned vertices_used = 0;
2562 unsigned tess_coord_used = 0;
2563 unsigned tess_coord_swizzle = 0;
2564 unsigned num_fmuls = 0, num_fadds = 0;
2565 nir_alu_instr *fadds[2];
2566 nir_def *load_tess_coord = NULL;
2567
2568 /* Find 3 multiplications by TessCoord and their uses, which must be
2569 * fadds.
2570 */
2571 list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2572 unsigned vertex_index;
2573 nir_alu_instr *fmul =
2574 check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2575 &vertices_used, 3);
2576       /* At most 3 loads are expected. Also reject exact ops because we
2577        * are going to apply an inexact transformation.
2578 */
2579 if (!fmul || fmul->op != nir_op_fmul || fmul->exact || num_fmuls == 3 ||
2580 !gather_fmul_tess_coord(iter->instr, fmul, vertex_index,
2581 &tess_coord_swizzle, &tess_coord_used,
2582 &load_tess_coord))
2583 return false;
2584
2585 num_fmuls++;
2586
2587 /* The multiplication must only be used by fadd. Also reject exact ops.
2588 */
2589 nir_alu_instr *fadd = get_single_use_as_alu(&fmul->def);
2590 if (!fadd || fadd->op != nir_op_fadd || fadd->exact)
2591 return false;
2592
2593 /* The 3 fmuls must only be used by 2 fadds. */
2594 unsigned i;
2595 for (i = 0; i < num_fadds; i++) {
2596 if (fadds[i] == fadd)
2597 break;
2598 }
2599 if (i == num_fadds) {
2600 if (num_fadds == 2)
2601 return false;
2602
2603 fadds[num_fadds++] = fadd;
2604 }
2605 }
2606
2607 if (num_fmuls != 3 || num_fadds != 2)
2608 return false;
2609
2610 assert(tess_coord_used == 0x7);
2611
2612 /* We have found that the only uses of the 3 fmuls are 2 fadds, which
2613 * implies that at least 2 fmuls are used by the same fadd.
2614 *
2615 * Check that 1 fadd is used by the other fadd, which can only be
2616 * the result of the TessCoord interpolation.
2617 */
2618 for (unsigned i = 0; i < 2; i++) {
2619 if (get_single_use_as_alu(&fadds[i]->def) == fadds[!i]) {
2620 switch (tess_coord_swizzle) {
2621 case 0x210:
2622 slot->consumer.tes_interp_load = fadds[!i];
2623 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
2624 slot->consumer.tes_load_tess_coord = load_tess_coord;
2625 return true;
2626
2627 case 0x102:
2628 slot->consumer.tes_interp_load = fadds[!i];
2629 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
2630 slot->consumer.tes_load_tess_coord = load_tess_coord;
2631 return true;
2632
2633 default:
2634 return false;
2635 }
2636 }
2637 }
2638
2639 return false;
2640 }
2641
2642 /**
2643 * Find interpolation of the form:
2644 * fma(input[0].slot, TessCoord.a,
2645 * fma(input[1].slot, TessCoord.b,
2646 * input[2].slot * TessCoord.c))
2647 *
2648 * a,b,c can be any of x,y,z, but each can occur only once.
2649 */
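/* This matches the same interpolation as above after one fmul+fadd pair has
 * been fused into ffma, e.g. by earlier algebraic optimizations.
 */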
2650 static bool
2651 find_tes_triangle_interp_1fmul_2ffma(struct linkage_info *linkage, unsigned i)
2652 {
2653 struct scalar_slot *slot = &linkage->slot[i];
2654 unsigned vertices_used = 0;
2655 unsigned tess_coord_used = 0;
2656 unsigned tess_coord_swizzle = 0;
2657 unsigned num_fmuls = 0, num_ffmas = 0;
2658 nir_alu_instr *ffmas[2], *fmul = NULL;
2659 nir_def *load_tess_coord = NULL;
2660
2661 list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2662 unsigned vertex_index;
2663 nir_alu_instr *alu =
2664 check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2665 &vertices_used, 3);
2666
2667 /* Reject exact ops because we are going to do an inexact transformation
2668 * with it.
2669 */
2670 if (!alu || (alu->op != nir_op_fmul && alu->op != nir_op_ffma) ||
2671 alu->exact ||
2672 !gather_fmul_tess_coord(iter->instr, alu, vertex_index,
2673 &tess_coord_swizzle, &tess_coord_used,
2674 &load_tess_coord))
2675 return false;
2676
2677 /* The multiplication must only be used by ffma. */
2678 if (alu->op == nir_op_fmul) {
2679 nir_alu_instr *ffma = get_single_use_as_alu(&alu->def);
2680 if (!ffma || ffma->op != nir_op_ffma)
2681 return false;
2682
2683 if (num_fmuls == 1)
2684 return false;
2685
2686 fmul = alu;
2687 num_fmuls++;
2688 } else {
2689 if (num_ffmas == 2)
2690 return false;
2691
2692 ffmas[num_ffmas++] = alu;
2693 }
2694 }
2695
2696 if (num_fmuls != 1 || num_ffmas != 2)
2697 return false;
2698
2699 assert(tess_coord_used == 0x7);
2700
2701 /* We have found that fmul has only 1 use and it's ffma, and there are 2
2702 * ffmas. Fail if neither ffma is using fmul.
2703 */
2704 if (ffmas[0]->src[2].src.ssa != &fmul->def &&
2705 ffmas[1]->src[2].src.ssa != &fmul->def)
2706 return false;
2707
2708 /* If one ffma is using the other ffma, it's guaranteed to be src[2]. */
2709 for (unsigned i = 0; i < 2; i++) {
2710 if (get_single_use_as_alu(&ffmas[i]->def) == ffmas[!i]) {
2711 switch (tess_coord_swizzle) {
2712 case 0x210:
2713 slot->consumer.tes_interp_load = ffmas[!i];
2714 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
2715 slot->consumer.tes_load_tess_coord = load_tess_coord;
2716 return true;
2717
2718 case 0x102:
2719 slot->consumer.tes_interp_load = ffmas[!i];
2720 slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
2721 slot->consumer.tes_load_tess_coord = load_tess_coord;
2722 return true;
2723
2724 default:
2725 return false;
2726 }
2727 }
2728 }
2729
2730 return false;
2731 }
2732
2733 static void
2734 find_open_coded_tes_input_interpolation(struct linkage_info *linkage)
2735 {
2736 if (linkage->consumer_stage != MESA_SHADER_TESS_EVAL)
2737 return;
2738
2739 unsigned i;
2740 BITSET_FOREACH_SET(i, linkage->flat32_mask, NUM_SCALAR_SLOTS) {
2741 if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
2742 vec4_slot(i) <= VARYING_SLOT_PATCH31)
2743 continue;
2744 if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
2745 continue;
2746 if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
2747 continue;
2748 }
2749
2750 BITSET_FOREACH_SET(i, linkage->flat16_mask, NUM_SCALAR_SLOTS) {
2751 if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
2752 vec4_slot(i) <= VARYING_SLOT_PATCH31)
2753 continue;
2754 if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
2755 continue;
2756 if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
2757 continue;
2758 }
2759 }
2760
2761 /******************************************************************
2762 * BACKWARD INTER-SHADER CODE MOTION
2763 ******************************************************************/
2764
2765 #define NEED_UPDATE_MOVABLE_FLAGS(instr) \
2766 (!((instr)->pass_flags & (FLAG_MOVABLE | FLAG_UNMOVABLE)))
2767
2768 #define GET_SRC_INTERP(alu, i) \
2769 ((alu)->src[i].src.ssa->parent_instr->pass_flags & FLAG_INTERP_MASK)
2770
2771 static bool
2772 can_move_alu_across_interp(struct linkage_info *linkage, nir_alu_instr *alu)
2773 {
2774 /* Exact ALUs can't be moved across interpolation. */
2775 if (alu->exact)
2776 return false;
2777
2778 /* Interpolation converts Infs to NaNs. If we turn a result of an ALU
2779 * instruction into a new interpolated input, it converts Infs to NaNs for
2780 * that instruction, while removing the Infs to NaNs conversion for sourced
2781 * interpolated values. We can't do that if Infs and NaNs must be preserved.
2782 */
2783 if (preserve_infs_nans(linkage->consumer_builder.shader, alu->def.bit_size))
2784 return false;
2785
2786 switch (alu->op) {
2787 /* Always legal if the sources are interpolated identically because:
2788 * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
2789 * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
2790 */
2791 case nir_op_fadd:
2792 case nir_op_fsub:
2793 /* This is the same as multiplying by -1, which is always legal, see fmul.
2794 */
2795 case nir_op_fneg:
2796 case nir_op_mov:
2797 return true;
2798
2799 /* At least one side of the multiplication must be convergent because this
2800 * is the only equation with multiplication that is true:
2801 * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
2802 */
2803 case nir_op_fmul:
2804 case nir_op_fmulz:
2805 case nir_op_ffma:
2806 case nir_op_ffmaz:
2807 return GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT ||
2808 GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
2809
2810 case nir_op_fdiv:
2811 /* The right side must be convergent, which then follows the fmul rule.
2812 */
2813 return GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
2814
2815 case nir_op_flrp:
2816 /* Using the same rule as fmul. */
2817 return (GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT &&
2818 GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT) ||
2819 GET_SRC_INTERP(alu, 2) == FLAG_INTERP_CONVERGENT;
2820
2821 default:
2822 /* Moving other ALU instructions across interpolation is illegal. */
2823 return false;
2824 }
2825 }
2826
2827 /* Determine whether an instruction is movable from the consumer to
2828 * the producer. Also determine which interpolation modes each ALU instruction
2829 * should use if its value was promoted to a new input.
2830 */
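/* For example, an fadd whose sources are two interpolated input loads with
 * the same interpolation mode is movable and keeps that mode, while an ALU
 * instruction mixing a flat source with an interpolated one gets conflicting
 * interpolation flags and is marked unmovable.
 */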
2831 static void
2832 update_movable_flags(struct linkage_info *linkage, nir_instr *instr)
2833 {
2834 /* This function shouldn't be called more than once for each instruction
2835 * to minimize recursive calling.
2836 */
2837 assert(NEED_UPDATE_MOVABLE_FLAGS(instr));
2838
2839 switch (instr->type) {
2840 case nir_instr_type_undef:
2841 case nir_instr_type_load_const:
2842 /* Treat constants as convergent, which means compatible with both flat
2843 * and non-flat inputs.
2844 */
2845 instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
2846 return;
2847
2848 case nir_instr_type_alu: {
2849 nir_alu_instr *alu = nir_instr_as_alu(instr);
2850 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2851 unsigned alu_interp;
2852
2853 /* These are shader-dependent and thus unmovable. */
2854 if (nir_op_is_derivative(alu->op)) {
2855 instr->pass_flags |= FLAG_UNMOVABLE;
2856 return;
2857 }
2858
2859 /* Make vector ops unmovable. They are technically movable but more
2860 * complicated, and NIR should be scalarized for this pass anyway.
2861 * The only remaining vector ops should be vecN for intrinsic sources.
2862 */
2863 if (alu->def.num_components > 1) {
2864 instr->pass_flags |= FLAG_UNMOVABLE;
2865 return;
2866 }
2867
2868 alu_interp = FLAG_INTERP_CONVERGENT;
2869
2870 for (unsigned i = 0; i < num_srcs; i++) {
2871 nir_instr *src_instr = alu->src[i].src.ssa->parent_instr;
2872
2873 if (NEED_UPDATE_MOVABLE_FLAGS(src_instr))
2874 update_movable_flags(linkage, src_instr);
2875
2876 if (src_instr->pass_flags & FLAG_UNMOVABLE) {
2877 instr->pass_flags |= FLAG_UNMOVABLE;
2878 return;
2879 }
2880
2881 /* Determine which interpolation mode this ALU instruction should
2882 * use if it was promoted to a new input.
2883 */
2884 unsigned src_interp = src_instr->pass_flags & FLAG_INTERP_MASK;
2885
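/* FLAG_INTERP_CONVERGENT acts as a neutral element in this merge: it is
 * compatible with any interpolation mode, whereas two different
 * non-convergent modes conflict and make the ALU instruction unmovable.
 */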
2886 if (alu_interp == src_interp ||
2887 src_interp == FLAG_INTERP_CONVERGENT) {
2888 /* Nothing to do. */
2889 } else if (alu_interp == FLAG_INTERP_CONVERGENT) {
2890 alu_interp = src_interp;
2891 } else {
2892 assert(alu_interp != FLAG_INTERP_CONVERGENT &&
2893 src_interp != FLAG_INTERP_CONVERGENT &&
2894 alu_interp != src_interp);
2895 /* The ALU instruction sources conflicting interpolation flags.
2896 * It can never become a new input.
2897 */
2898 instr->pass_flags |= FLAG_UNMOVABLE;
2899 return;
2900 }
2901 }
2902
2903 /* Check if we can move the ALU instruction across an interpolated
2904 * load into the previous shader.
2905 */
2906 if (alu_interp > FLAG_INTERP_FLAT &&
2907 !can_move_alu_across_interp(linkage, alu)) {
2908 instr->pass_flags |= FLAG_UNMOVABLE;
2909 return;
2910 }
2911
2912 instr->pass_flags |= FLAG_MOVABLE | alu_interp;
2913 return;
2914 }
2915
2916 case nir_instr_type_intrinsic: {
2917 /* Movable input loads already have FLAG_MOVABLE on them.
2918 * Unmovable input loads skipped by initialization get UNMOVABLE here.
2919 * (e.g. colors, texcoords)
2920 *
2921 * The only other movable intrinsic is load_deref for uniforms and UBOs.
2922 * Other intrinsics are not movable.
2923 */
2924 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2925
2926 if (intr->intrinsic == nir_intrinsic_load_deref) {
2927 nir_instr *deref = intr->src[0].ssa->parent_instr;
2928
2929 if (NEED_UPDATE_MOVABLE_FLAGS(deref))
2930 update_movable_flags(linkage, deref);
2931
2932 if (deref->pass_flags & FLAG_MOVABLE) {
2933 /* Treat uniforms as convergent, which means compatible with both
2934 * flat and non-flat inputs.
2935 */
2936 instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
2937 return;
2938 }
2939 }
2940
2941 instr->pass_flags |= FLAG_UNMOVABLE;
2942 return;
2943 }
2944
2945 case nir_instr_type_deref:
2946 if (can_move_deref_between_shaders(linkage, instr))
2947 instr->pass_flags |= FLAG_MOVABLE;
2948 else
2949 instr->pass_flags |= FLAG_UNMOVABLE;
2950 return;
2951
2952 default:
2953 instr->pass_flags |= FLAG_UNMOVABLE;
2954 return;
2955 }
2956 }
2957
2958 /* Gather the input loads used by the post-dominator using DFS. */
2959 static void
2960 gather_used_input_loads(nir_instr *instr,
2961 nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS],
2962 unsigned *num_loads)
2963 {
2964 switch (instr->type) {
2965 case nir_instr_type_undef:
2966 case nir_instr_type_load_const:
2967 return;
2968
2969 case nir_instr_type_alu: {
2970 nir_alu_instr *alu = nir_instr_as_alu(instr);
2971 unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2972
2973 for (unsigned i = 0; i < num_srcs; i++) {
2974 gather_used_input_loads(alu->src[i].src.ssa->parent_instr,
2975 loads, num_loads);
2976 }
2977 return;
2978 }
2979
2980 case nir_instr_type_intrinsic: {
2981 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2982
2983 switch (intr->intrinsic) {
2984 case nir_intrinsic_load_deref:
2985 case nir_intrinsic_load_tess_coord:
2986 return;
2987
2988 case nir_intrinsic_load_input:
2989 case nir_intrinsic_load_per_vertex_input:
2990 case nir_intrinsic_load_interpolated_input:
2991 if (!(intr->instr.pass_flags & FLAG_GATHER_LOADS_VISITED)) {
2992 assert(*num_loads < NUM_SCALAR_SLOTS*8);
2993 loads[(*num_loads)++] = intr;
2994 intr->instr.pass_flags |= FLAG_GATHER_LOADS_VISITED;
2995 }
2996 return;
2997
2998 default:
2999 printf("%u\n", intr->intrinsic);
3000 unreachable("unexpected intrinsic");
3001 }
3002 }
3003
3004 default:
3005 unreachable("unexpected instr type");
3006 }
3007 }
3008
3009 /* Move a post-dominator, which is an ALU opcode, into the previous shader,
3010 * and replace the post-dominator with a new input load.
3011 */
3012 static bool
3013 try_move_postdominator(struct linkage_info *linkage,
3014 struct nir_use_dominance_state *postdom_state,
3015 nir_alu_instr *postdom,
3016 nir_def *load_def,
3017 nir_intrinsic_instr *first_load,
3018 nir_opt_varyings_progress *progress)
3019 {
3020 #define PRINT 0
3021 #if PRINT
3022 printf("Trying to move post-dom: ");
3023 nir_print_instr(&postdom->instr, stdout);
3024 puts("");
3025 #endif
3026
3027 /* Gather the input loads used by the post-dominator using DFS. */
3028 nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS*8];
3029 unsigned num_loads = 0;
3030 gather_used_input_loads(&postdom->instr, loads, &num_loads);
3031
3032 /* Clear the flag set by gather_used_input_loads. */
3033 for (unsigned i = 0; i < num_loads; i++)
3034 loads[i]->instr.pass_flags &= ~FLAG_GATHER_LOADS_VISITED;
3035
3036 /* For all the loads, the previous shader must have the corresponding
3037 * output stores in the same basic block because we are going to replace
3038 * them with 1 store. Only TCS and GS can have stores of different outputs
3039 * in different blocks.
3040 */
3041 nir_block *block = NULL;
3042
3043 for (unsigned i = 0; i < num_loads; i++) {
3044 unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3045 struct scalar_slot *slot = &linkage->slot[slot_index];
3046
3047 assert(list_is_singular(&slot->producer.stores));
3048 nir_intrinsic_instr *store =
3049 list_first_entry(&slot->producer.stores, struct list_node,
3050 head)->instr;
3051
3052 if (!block) {
3053 block = store->instr.block;
3054 continue;
3055 }
3056 if (block != store->instr.block)
3057 return false;
3058 }
3059
3060 assert(block);
3061
3062 #if PRINT
3063 printf("Post-dom accepted: ");
3064 nir_print_instr(&postdom->instr, stdout);
3065 puts("\n");
3066 #endif
3067
3068 /* Determine the scalar slot index of the new varying. It will reuse
3069 * the slot of the load we started from because the load will be
3070 * removed.
3071 */
3072 unsigned final_slot = intr_get_scalar_16bit_slot(first_load);
3073
3074 /* Replace the post-dominator in the consumer with a new input load.
3075 * Since we are reusing the same slot as the first load and it has
3076 * the right interpolation qualifiers, use it as the new load by using
3077 * it in place of the post-dominator.
3078 *
3079 * Boolean post-dominators are upcast in the producer and then downcast
3080 * in the consumer.
3081 */
3082 unsigned slot_index = final_slot;
3083 struct scalar_slot *slot = &linkage->slot[slot_index];
3084 nir_builder *b = &linkage->consumer_builder;
3085 b->cursor = nir_after_instr(load_def->parent_instr);
3086 unsigned alu_interp = postdom->instr.pass_flags & FLAG_INTERP_MASK;
3087 nir_def *new_input, *new_tes_loads[3];
3088 BITSET_WORD *mask;
3089
3090 /* Convergent instruction results that are not interpolatable (integer or
3091 * FP64) should not be moved: compaction may relocate convergent varyings
3092 * to interpolated vec4 slots because the definition of convergent varyings
3093 * implies that they can be interpolated (which doesn't work with integer
3094 * and FP64 values).
3095 *
3096 * So if the result type is not 16- or 32-bit float and the driver doesn't
3097 * support convergent flat loads from interpolated vec4 slots, don't move
3098 * the instruction.
3099 */
3100 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3101 alu_interp == FLAG_INTERP_CONVERGENT &&
3102 !linkage->can_mix_convergent_flat_with_interpolated &&
3103 ((postdom->def.bit_size != 16 && postdom->def.bit_size != 32) ||
3104 !(nir_op_infos[postdom->op].output_type & nir_type_float)))
3105 return false;
3106
3107 /* NIR can't do 1-bit inputs. Convert them to a bigger size. */
3108 assert(postdom->def.bit_size & (1 | 16 | 32));
3109 unsigned new_bit_size = postdom->def.bit_size;
3110
3111 if (new_bit_size == 1) {
3112 assert(alu_interp == FLAG_INTERP_CONVERGENT ||
3113 alu_interp == FLAG_INTERP_FLAT);
3114 /* TODO: We could use 16 bits instead, but that currently fails on AMD.
3115 */
3116 new_bit_size = 32;
3117 }
3118
3119 bool rewrite_convergent_to_flat =
3120 alu_interp == FLAG_INTERP_CONVERGENT &&
3121 linkage->can_mix_convergent_flat_with_interpolated;
3122
3123 /* Create the new input load. This creates a new load (or a series of
3124 * loads in case of open-coded TES interpolation) that's identical to
3125 * the original load(s).
3126 */
3127 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3128 alu_interp != FLAG_INTERP_FLAT && !rewrite_convergent_to_flat) {
3129 nir_def *baryc = NULL;
3130
3131 /* Determine the barycentric coordinates. */
3132 switch (alu_interp) {
3133 case FLAG_INTERP_PERSP_PIXEL:
3134 case FLAG_INTERP_LINEAR_PIXEL:
3135 baryc = nir_load_barycentric_pixel(b, 32);
3136 break;
3137 case FLAG_INTERP_PERSP_CENTROID:
3138 case FLAG_INTERP_LINEAR_CENTROID:
3139 baryc = nir_load_barycentric_centroid(b, 32);
3140 break;
3141 case FLAG_INTERP_PERSP_SAMPLE:
3142 case FLAG_INTERP_LINEAR_SAMPLE:
3143 baryc = nir_load_barycentric_sample(b, 32);
3144 break;
3145 default:
3146 baryc = first_load->src[0].ssa;
3147 break;
3148 }
3149
3150 if (baryc != first_load->src[0].ssa) {
3151 nir_intrinsic_instr *baryc_i =
3152 nir_instr_as_intrinsic(baryc->parent_instr);
3153
3154 if (alu_interp == FLAG_INTERP_LINEAR_PIXEL ||
3155 alu_interp == FLAG_INTERP_LINEAR_CENTROID ||
3156 alu_interp == FLAG_INTERP_LINEAR_SAMPLE)
3157 nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_NOPERSPECTIVE);
3158 else
3159 nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_SMOOTH);
3160 }
3161
3162 new_input = nir_load_interpolated_input(
3163 b, 1, new_bit_size, baryc, nir_imm_int(b, 0),
3164 .base = nir_intrinsic_base(first_load),
3165 .component = nir_intrinsic_component(first_load),
3166 .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3167 new_bit_size,
3168 .io_semantics = nir_intrinsic_io_semantics(first_load));
3169
3170 if (alu_interp == FLAG_INTERP_CONVERGENT) {
3171 mask = new_bit_size == 16 ? linkage->convergent16_mask
3172 : linkage->convergent32_mask;
3173 } else {
3174 mask = new_bit_size == 16 ? linkage->interp_fp16_mask
3175 : linkage->interp_fp32_mask;
3176 }
3177 } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3178 alu_interp > FLAG_INTERP_FLAT) {
3179 nir_def *zero = nir_imm_int(b, 0);
3180
3181 for (unsigned i = 0; i < 3; i++) {
3182 new_tes_loads[i] =
3183 nir_load_per_vertex_input(b, 1, new_bit_size,
3184 i ? nir_imm_int(b, i) : zero, zero,
3185 .base = nir_intrinsic_base(first_load),
3186 .component = nir_intrinsic_component(first_load),
3187 .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3188 new_bit_size,
3189 .io_semantics = nir_intrinsic_io_semantics(first_load));
3190 }
3191
3192 int remap_uvw[3] = {0, 1, 2};
3193 int remap_wuv[3] = {2, 0, 1};
3194 int *remap;
3195
3196 switch (alu_interp) {
3197 case FLAG_INTERP_TES_TRIANGLE_UVW:
3198 remap = remap_uvw;
3199 break;
3200 case FLAG_INTERP_TES_TRIANGLE_WUV:
3201 remap = remap_wuv;
3202 break;
3203 default:
3204 unreachable("invalid TES interpolation mode");
3205 }
3206
3207 nir_def *tesscoord = slot->consumer.tes_load_tess_coord;
3208 nir_def *defs[3];
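/* Open-coded barycentric interpolation across the 3 vertices:
 * defs[2] = new_tes_loads[0] * tc[remap[0]]
 *         + new_tes_loads[1] * tc[remap[1]]
 *         + new_tes_loads[2] * tc[remap[2]]
 * (tc = tesscoord), built as one fmul followed by two ffma.
 */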
3209
3210 for (unsigned i = 0; i < 3; i++) {
3211 if (i == 0) {
3212 defs[i] = nir_fmul(b, new_tes_loads[i],
3213 nir_channel(b, tesscoord, remap[i]));
3214 } else {
3215 defs[i] = nir_ffma(b, new_tes_loads[i],
3216 nir_channel(b, tesscoord, remap[i]),
3217 defs[i - 1]);
3218 }
3219 }
3220 new_input = defs[2];
3221
3222 mask = new_bit_size == 16 ? linkage->flat16_mask
3223 : linkage->flat32_mask;
3224 } else {
3225 /* We have to rewrite convergent to flat here and not during compaction
3226 * because compaction adds code to convert Infs to NaNs for
3227 * "load_interpolated_input -> load_input" replacements, which corrupts
3228 * integer data.
3229 */
3230 assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT ||
3231 alu_interp == FLAG_INTERP_FLAT || rewrite_convergent_to_flat);
3232
3233 new_input =
3234 nir_load_input(b, 1, new_bit_size, nir_imm_int(b, 0),
3235 .base = nir_intrinsic_base(first_load),
3236 .component = nir_intrinsic_component(first_load),
3237 .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3238 new_bit_size,
3239 .io_semantics = nir_intrinsic_io_semantics(first_load));
3240
3241 mask = new_bit_size == 16 ? linkage->flat16_mask
3242 : linkage->flat32_mask;
3243
3244 if (rewrite_convergent_to_flat) {
3245 mask = new_bit_size == 16 ? linkage->convergent16_mask
3246 : linkage->convergent32_mask;
3247 }
3248 }
3249
3250 assert(!BITSET_TEST(linkage->no_varying32_mask, slot_index));
3251 assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index));
3252
3253 /* Re-set the category of the new scalar input. This will cause
3254 * the compaction to treat it as a different type, so that it will move it
3255 * into the vec4 that has compatible interpolation qualifiers.
3256 *
3257 * This shouldn't be done if none of the masks has this slot set, which
3258 * indicates that compaction of this slot is disallowed.
3259 */
3260 if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) ||
3261 BITSET_TEST(linkage->interp_fp16_mask, slot_index) ||
3262 BITSET_TEST(linkage->flat32_mask, slot_index) ||
3263 BITSET_TEST(linkage->flat16_mask, slot_index) ||
3264 BITSET_TEST(linkage->convergent32_mask, slot_index) ||
3265 BITSET_TEST(linkage->convergent16_mask, slot_index)) {
3266 BITSET_CLEAR(linkage->interp_fp32_mask, slot_index);
3267 BITSET_CLEAR(linkage->interp_fp16_mask, slot_index);
3268 BITSET_CLEAR(linkage->flat16_mask, slot_index);
3269 BITSET_CLEAR(linkage->flat32_mask, slot_index);
3270 BITSET_CLEAR(linkage->convergent16_mask, slot_index);
3271 BITSET_CLEAR(linkage->convergent32_mask, slot_index);
3272 BITSET_SET(mask, slot_index);
3273 }
3274
3275 /* Replace the existing load with the new load in the slot. */
3276 if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3277 alu_interp >= FLAG_INTERP_TES_TRIANGLE_UVW) {
3278 /* For TES, replace all 3 loads. */
3279 unsigned i = 0;
3280 list_for_each_entry(struct list_node, iter, &slot->consumer.loads,
3281 head) {
3282 assert(i < 3);
3283 iter->instr = nir_instr_as_intrinsic(new_tes_loads[i]->parent_instr);
3284 i++;
3285 }
3286
3287 assert(i == 3);
3288 assert(postdom->def.bit_size != 1);
3289
3290 slot->consumer.tes_interp_load =
3291 nir_instr_as_alu(new_input->parent_instr);
3292 } else {
3293 assert(list_is_singular(&slot->consumer.loads));
3294 list_first_entry(&slot->consumer.loads, struct list_node, head)->instr =
3295 nir_instr_as_intrinsic(new_input->parent_instr);
3296
3297 /* The new input has a wider type even if the post-dominator is boolean, so convert it back to 1 bit. */
3298 if (postdom->def.bit_size == 1)
3299 new_input = nir_ine_imm(b, new_input, 0);
3300 }
3301
3302 nir_def_rewrite_uses(&postdom->def, new_input);
3303
3304 /* Clone the post-dominator at the end of the block in the producer
3305 * where the output stores are.
3306 */
3307 b = &linkage->producer_builder;
3308 b->cursor = nir_after_block_before_jump(block);
3309 nir_def *producer_clone = clone_ssa(linkage, b, &postdom->def);
3310
3311 /* Boolean post-dominators are upcast in the producer because we can't
3312 * use 1-bit outputs.
3313 */
3314 if (producer_clone->bit_size == 1)
3315 producer_clone = nir_b2bN(b, producer_clone, new_bit_size);
3316
3317 /* Move the existing store to the end of the block and rewrite it to use
3318 * the post-dominator result.
3319 */
3320 nir_intrinsic_instr *store =
3321 list_first_entry(&linkage->slot[final_slot].producer.stores,
3322 struct list_node, head)->instr;
3323 nir_instr_move(b->cursor, &store->instr);
3324 if (nir_src_bit_size(store->src[0]) != producer_clone->bit_size)
3325 nir_intrinsic_set_src_type(store, nir_alu_type_get_base_type(nir_intrinsic_src_type(store)) |
3326 producer_clone->bit_size);
3327 nir_src_rewrite(&store->src[0], producer_clone);
3328
3329 /* Remove all loads and stores that we are replacing from the producer
3330 * and consumer.
3331 */
3332 for (unsigned i = 0; i < num_loads; i++) {
3333 unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3334
3335 if (slot_index == final_slot) {
3336 /* Keep the load and store that we reused. */
3337 continue;
3338 }
3339
3340 /* Remove loads and stores that are dead after the code motion. Only
3341 * those loads that are post-dominated by the post-dominator are dead.
3342 */
3343 struct scalar_slot *slot = &linkage->slot[slot_index];
3344 nir_instr *load;
3345
3346 if (slot->consumer.tes_interp_load) {
3347 load = &slot->consumer.tes_interp_load->instr;
3348
3349 /* With interpolated TES loads, we get here 3 times, once for each
3350 * per-vertex load. Skip this if we've been here before.
3351 */
3352 if (list_is_empty(&slot->producer.stores)) {
3353 assert(list_is_empty(&slot->consumer.loads));
3354 continue;
3355 }
3356 } else {
3357 assert(list_is_singular(&slot->consumer.loads));
3358 load = &list_first_entry(&slot->consumer.loads,
3359 struct list_node, head)->instr->instr;
3360 }
3361
3362 if (nir_instr_dominates_use(postdom_state, &postdom->instr, load)) {
3363 list_inithead(&slot->consumer.loads);
3364
3365 /* Remove stores. (transform feedback is allowed here, just not
3366 * in final_slot)
3367 */
3368 remove_all_stores_and_clear_slot(linkage, slot_index, progress);
3369 }
3370 }
3371
3372 *progress |= nir_progress_producer | nir_progress_consumer;
3373 return true;
3374 }
3375
3376 static bool
3377 backward_inter_shader_code_motion(struct linkage_info *linkage,
3378 nir_opt_varyings_progress *progress)
3379 {
3380 /* These producers are not supported. The description at the beginning
3381 * suggests a possible workaround.
3382 */
3383 if (linkage->producer_stage == MESA_SHADER_GEOMETRY ||
3384 linkage->producer_stage == MESA_SHADER_MESH ||
3385 linkage->producer_stage == MESA_SHADER_TASK)
3386 return false;
3387
3388 /* Clear pass_flags. */
3389 nir_shader_clear_pass_flags(linkage->consumer_builder.shader);
3390
3391 /* Gather inputs that can be moved into the previous shader. These are only
3392 * checked for the basic constraints for movability.
3393 */
3394 struct {
3395 nir_def *def;
3396 nir_intrinsic_instr *first_load;
3397 } movable_loads[NUM_SCALAR_SLOTS];
3398 unsigned num_movable_loads = 0;
3399 unsigned i;
3400
3401 BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
3402 if (!can_optimize_varying(linkage,
3403 vec4_slot(i)).inter_shader_code_motion)
3404 continue;
3405
3406 struct scalar_slot *slot = &linkage->slot[i];
3407
3408 assert(!list_is_empty(&slot->producer.stores));
3409 assert(!is_interpolated_texcoord(linkage, i));
3410 assert(!is_interpolated_color(linkage, i));
3411
3412 /* Disallow producer loads. */
3413 if (!list_is_empty(&slot->producer.loads))
3414 continue;
3415
3416 /* There should be only 1 store per output. */
3417 if (!list_is_singular(&slot->producer.stores))
3418 continue;
3419
3420 nir_def *load_def = NULL;
3421 nir_intrinsic_instr *load =
3422 list_first_entry(&slot->consumer.loads, struct list_node,
3423 head)->instr;
3424
3425 nir_intrinsic_instr *store =
3426 list_first_entry(&slot->producer.stores, struct list_node,
3427 head)->instr;
3428
3429 /* Set interpolation flags.
3430 * Handle interpolated TES loads first because they are special.
3431 */
3432 if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3433 slot->consumer.tes_interp_load) {
3434 if (linkage->producer_stage == MESA_SHADER_VERTEX) {
3435 /* VS -> TES has no constraints on VS stores. */
3436 load_def = &slot->consumer.tes_interp_load->def;
3437 load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3438 slot->consumer.tes_interp_mode;
3439 } else {
3440 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3441 assert(store->intrinsic == nir_intrinsic_store_per_vertex_output);
3442
3443 /* The vertex index of the store must be InvocationID. */
3444 if (is_sysval(store->src[1].ssa->parent_instr,
3445 SYSTEM_VALUE_INVOCATION_ID)) {
3446 load_def = &slot->consumer.tes_interp_load->def;
3447 load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3448 slot->consumer.tes_interp_mode;
3449 } else {
3450 continue;
3451 }
3452 }
3453 } else {
3454 /* Allow only 1 load per input. CSE should be run before this. */
3455 if (!list_is_singular(&slot->consumer.loads))
3456 continue;
3457
3458 /* This can only be TCS -> TES, which is handled above and rejected
3459 * otherwise.
3460 */
3461 if (store->intrinsic == nir_intrinsic_store_per_vertex_output) {
3462 assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3463 continue;
3464 }
3465
3466 /* TODO: handle load_per_vertex_input for TCS and GS.
3467 * TES can also occur here if tes_interp_load is NULL.
3468 */
3469 if (load->intrinsic == nir_intrinsic_load_per_vertex_input)
3470 continue;
3471
3472 load_def = &load->def;
3473
3474 switch (load->intrinsic) {
3475 case nir_intrinsic_load_interpolated_input: {
3476 assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
3477 nir_intrinsic_instr *baryc =
3478 nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
3479 nir_intrinsic_op op = baryc->intrinsic;
3480 enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc);
3481 bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
3482 bool convergent = BITSET_TEST(linkage->convergent32_mask, i) ||
3483 BITSET_TEST(linkage->convergent16_mask, i);
3484
3485 assert(interp == INTERP_MODE_NONE ||
3486 interp == INTERP_MODE_SMOOTH ||
3487 interp == INTERP_MODE_NOPERSPECTIVE);
3488
3489 if (convergent) {
3490 load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3491 } else if (op == nir_intrinsic_load_barycentric_pixel) {
3492 load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_PIXEL
3493 : FLAG_INTERP_PERSP_PIXEL;
3494 } else if (op == nir_intrinsic_load_barycentric_centroid) {
3495 load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_CENTROID
3496 : FLAG_INTERP_PERSP_CENTROID;
3497 } else if (op == nir_intrinsic_load_barycentric_sample) {
3498 load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_SAMPLE
3499 : FLAG_INTERP_PERSP_SAMPLE;
3500 } else {
3501 /* Optimizing at_offset and at_sample would be possible but
3502 * maybe not worth it if they are not convergent. Convergent
3503 * inputs can trivially be switched to different barycentric
3504 * coordinates or to flat.
3505 */
3506 continue;
3507 }
3508 break;
3509 }
3510 case nir_intrinsic_load_input:
3511 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
3512 if (BITSET_TEST(linkage->convergent32_mask, i) ||
3513 BITSET_TEST(linkage->convergent16_mask, i))
3514 load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3515 else
3516 load->instr.pass_flags |= FLAG_INTERP_FLAT;
3517 } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
3518 assert(vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3519 vec4_slot(i) <= VARYING_SLOT_PATCH31);
3520 /* Patch inputs are always convergent. */
3521 load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3522 } else {
3523 /* It's not a fragment shader. We still need to set this. */
3524 load->instr.pass_flags |= FLAG_INTERP_FLAT;
3525 }
3526 break;
3527 case nir_intrinsic_load_per_primitive_input:
3528 case nir_intrinsic_load_input_vertex:
3529 /* Inter-shader code motion is unimplemented for these. */
3530 continue;
3531 default:
3532 unreachable("unexpected load intrinsic");
3533 }
3534 }
3535
3536 load_def->parent_instr->pass_flags |= FLAG_MOVABLE;
3537
3538 /* Disallow transform feedback. The load is "movable" for the purpose of
3539 * finding a movable post-dominator, but we can't rewrite the store
3540 * because we need to keep it for xfb, so the post-dominator search
3541 * will have to start from a different load (only that varying will have
3542 * its value rewritten).
3543 */
3544 if (BITSET_TEST(linkage->xfb_mask, i))
3545 continue;
3546
3547 assert(num_movable_loads < ARRAY_SIZE(movable_loads));
3548 movable_loads[num_movable_loads].def = load_def;
3549 movable_loads[num_movable_loads].first_load = load;
3550 num_movable_loads++;
3551 }
3552
3553 if (!num_movable_loads)
3554 return false;
3555
3556 /* Inter-shader code motion turns ALU results into outputs, but not all
3557 * bit sizes are supported by outputs.
3558 *
3559 * The 1-bit type is allowed because the pass always promotes 1-bit
3560 * outputs to 16 or 32 bits, whichever is supported.
3561 *
3562 * TODO: We could support replacing 2 32-bit inputs with one 64-bit
3563 * post-dominator by supporting 64 bits here, but the likelihood of that
3564 * occurring seems low.
3565 */
3566 unsigned supported_io_types = 32 | 1;
3567
3568 if (linkage->producer_builder.shader->options->io_options &
3569 linkage->consumer_builder.shader->options->io_options &
3570 nir_io_16bit_input_output_support)
3571 supported_io_types |= 16;
3572
3573 struct nir_use_dominance_state *postdom_state =
3574 nir_calc_use_dominance_impl(linkage->consumer_builder.impl, true);
3575
3576 for (unsigned i = 0; i < num_movable_loads; i++) {
3577 nir_def *load_def = movable_loads[i].def;
3578 nir_instr *iter = load_def->parent_instr;
3579 nir_instr *movable_postdom = NULL;
3580
3581 /* Find the farthest post-dominator that is movable. */
3582 while (iter) {
3583 iter = nir_get_immediate_use_dominator(postdom_state, iter);
3584 if (iter) {
3585 if (NEED_UPDATE_MOVABLE_FLAGS(iter))
3586 update_movable_flags(linkage, iter);
3587
3588 if (iter->pass_flags & FLAG_UNMOVABLE)
3589 break;
3590
3591 /* This can only be an ALU instruction. */
3592 nir_alu_instr *alu = nir_instr_as_alu(iter);
3593
3594 /* Skip unsupported bit sizes and keep searching. */
3595 if (!(alu->def.bit_size & supported_io_types))
3596 continue;
3597
3598 /* Skip comparison opcodes that directly source the first load
3599 * and a constant because any 1-bit values would have to be
3600 * converted to 32 bits in the producer and then converted back
3601 * to 1 bit using nir_op_ine in the consumer, achieving nothing.
3602 */
3603 if (alu->def.bit_size == 1 &&
3604 ((nir_op_infos[alu->op].num_inputs == 1 &&
3605 alu->src[0].src.ssa == load_def) ||
3606 (nir_op_infos[alu->op].num_inputs == 2 &&
3607 ((alu->src[0].src.ssa == load_def &&
3608 alu->src[1].src.ssa->parent_instr->type ==
3609 nir_instr_type_load_const) ||
3610 (alu->src[0].src.ssa->parent_instr->type ==
3611 nir_instr_type_load_const &&
3612 alu->src[1].src.ssa == load_def)))))
3613 continue;
3614
3615 movable_postdom = iter;
3616 }
3617 }
3618
3619 /* Try to move the post-dominator unless it's already been processed. */
3620 if (movable_postdom &&
3621 !(movable_postdom->pass_flags & FLAG_POST_DOMINATOR_PROCESSED)) {
3622 if (try_move_postdominator(linkage, postdom_state,
3623 nir_instr_as_alu(movable_postdom),
3624 load_def, movable_loads[i].first_load,
3625 progress)) {
3626 /* Moving only one postdominator can change the IR enough that
3627 * we should start from scratch.
3628 */
3629 ralloc_free(postdom_state);
3630 return true;
3631 }
3632
3633 movable_postdom->pass_flags |= FLAG_POST_DOMINATOR_PROCESSED;
3634 }
3635 }
3636
3637 ralloc_free(postdom_state);
3638 return false;
3639 }
3640
3641 /******************************************************************
3642 * COMPACTION
3643 ******************************************************************/
3644
3645 /* Relocate a slot to a new index. Used by compaction. new_index is
3646 * the component index at 16-bit granularity, so the size of vec4 is 8
3647 * in that representation.
3648 */
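/* For illustration (derived from the decoding below, not a helper that
 * exists in this pass): a scalar slot index packs the vec4 slot, the
 * 32-bit component, and the 16-bit half as
 *
 *    index = slot * 8 + component * 2 + high_16bits;
 *
 * so vec4_slot(index) == index / 8, the component is (index % 8) / 2, and
 * index % 2 selects the high 16-bit half.
 */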
3649 static void
3650 relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot,
3651 unsigned i, unsigned new_index, enum fs_vec4_type fs_vec4_type,
3652 bool convergent, nir_opt_varyings_progress *progress)
3653 {
3654 assert(!list_is_empty(&slot->producer.stores));
3655
3656 list_for_each_entry(struct list_node, iter, &slot->producer.stores, head) {
3657 assert(!nir_intrinsic_io_semantics(iter->instr).no_varying ||
3658 has_xfb(iter->instr) ||
3659 linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3660 assert(!is_active_sysval_output(linkage, i, iter->instr));
3661 }
3662
3663 /* Relocate the slot in all loads and stores. */
3664 struct list_head *instruction_lists[3] = {
3665 &slot->producer.stores,
3666 &slot->producer.loads,
3667 &slot->consumer.loads,
3668 };
3669
3670 for (unsigned i = 0; i < ARRAY_SIZE(instruction_lists); i++) {
3671 list_for_each_entry(struct list_node, iter, instruction_lists[i], head) {
3672 nir_intrinsic_instr *intr = iter->instr;
3673
3674 gl_varying_slot new_semantic = vec4_slot(new_index);
3675 unsigned new_component = (new_index % 8) / 2;
3676 bool new_high_16bits = new_index % 2;
3677
3678 /* We also need to relocate xfb info because it's always relative
3679 * to component 0. This just moves it into the correct xfb slot.
3680 */
3681 if (has_xfb(intr)) {
3682 unsigned old_component = nir_intrinsic_component(intr);
3683 static const nir_io_xfb clear_xfb;
3684 nir_io_xfb xfb;
3685 bool new_is_odd = new_component % 2 == 1;
3686
3687 memset(&xfb, 0, sizeof(xfb));
3688
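/* nir_io_xfb holds the xfb info for components 0-1 and nir_io_xfb2 for
 * components 2-3, so read the entry of the old component from the right
 * attribute and write it at the position of the new component.
 */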
3689 if (old_component >= 2) {
3690 xfb.out[new_is_odd] = nir_intrinsic_io_xfb2(intr).out[old_component - 2];
3691 nir_intrinsic_set_io_xfb2(intr, clear_xfb);
3692 } else {
3693 xfb.out[new_is_odd] = nir_intrinsic_io_xfb(intr).out[old_component];
3694 nir_intrinsic_set_io_xfb(intr, clear_xfb);
3695 }
3696
3697 if (new_component >= 2)
3698 nir_intrinsic_set_io_xfb2(intr, xfb);
3699 else
3700 nir_intrinsic_set_io_xfb(intr, xfb);
3701 }
3702
3703 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3704
3705 /* When relocating a back color store, don't change it to a front
3706 * color as that would be incorrect. Keep it as back color and only
3707 * relocate it between BFC0 and BFC1.
3708 */
3709 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3710 (sem.location == VARYING_SLOT_BFC0 ||
3711 sem.location == VARYING_SLOT_BFC1)) {
3712 assert(new_semantic == VARYING_SLOT_COL0 ||
3713 new_semantic == VARYING_SLOT_COL1);
3714 new_semantic = VARYING_SLOT_BFC0 +
3715 (new_semantic - VARYING_SLOT_COL0);
3716 }
3717
3718 #if PRINT_RELOCATE_SLOT
3719 unsigned bit_size =
3720 (intr->intrinsic == nir_intrinsic_load_input ||
3721 intr->intrinsic == nir_intrinsic_load_input_vertex ||
3722 intr->intrinsic == nir_intrinsic_load_interpolated_input)
3723 ? intr->def.bit_size : intr->src[0].ssa->bit_size;
3724
3725 assert(bit_size == 16 || bit_size == 32);
3726
3727 fprintf(stderr, "--- relocating: %s.%c%s%s -> %s.%c%s%s FS_VEC4_TYPE_%s\n",
3728 gl_varying_slot_name_for_stage(sem.location, linkage->producer_stage) + 13,
3729 "xyzw"[nir_intrinsic_component(intr) % 4],
3730 (bit_size == 16 && !sem.high_16bits) ? ".lo" : "",
3731 (bit_size == 16 && sem.high_16bits) ? ".hi" : "",
3732 gl_varying_slot_name_for_stage(new_semantic, linkage->producer_stage) + 13,
3733 "xyzw"[new_component % 4],
3734 (bit_size == 16 && !new_high_16bits) ? ".lo" : "",
3735 (bit_size == 16 && new_high_16bits) ? ".hi" : "",
3736 fs_vec4_type_strings[fs_vec4_type]);
3737 #endif /* PRINT_RELOCATE_SLOT */
3738
3739 sem.location = new_semantic;
3740 sem.high_16bits = new_high_16bits;
3741
3742 /* This is never indirectly indexed. Simplify num_slots. */
3743 sem.num_slots = 1;
3744
3745 nir_intrinsic_set_io_semantics(intr, sem);
3746 nir_intrinsic_set_component(intr, new_component);
3747
3748 if (fs_vec4_type == FS_VEC4_TYPE_PER_PRIMITIVE) {
3749 assert(intr->intrinsic == nir_intrinsic_store_per_primitive_output ||
3750 intr->intrinsic == nir_intrinsic_load_per_primitive_output ||
3751 intr->intrinsic == nir_intrinsic_load_per_primitive_input);
3752 } else {
3753 assert(intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
3754 intr->intrinsic != nir_intrinsic_load_per_primitive_output &&
3755 intr->intrinsic != nir_intrinsic_load_per_primitive_input);
3756 }
3757
3758 /* This path is used when promoting convergent interpolated
3759 * inputs to flat. Replace load_interpolated_input with load_input.
3760 */
3761 if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
3762 (fs_vec4_type == FS_VEC4_TYPE_FLAT ||
3763 /* Promote all convergent loads to flat if the driver supports it. */
3764 (convergent &&
3765 linkage->can_mix_convergent_flat_with_interpolated))) {
3766 assert(instruction_lists[i] == &slot->consumer.loads);
3767 nir_builder *b = &linkage->consumer_builder;
3768
3769 b->cursor = nir_before_instr(&intr->instr);
3770 nir_def *load =
3771 nir_load_input(b, 1, intr->def.bit_size,
3772 nir_get_io_offset_src(intr)->ssa,
3773 .io_semantics = sem,
3774 .component = new_component,
3775 .dest_type = nir_intrinsic_dest_type(intr));
3776
3777 nir_def_rewrite_uses(&intr->def, load);
3778 iter->instr = nir_instr_as_intrinsic(load->parent_instr);
3779 nir_instr_remove(&intr->instr);
3780 *progress |= nir_progress_consumer;
3781
3782 /* Interpolation converts Infs to NaNs. If we change it to flat,
3783 * we need to convert Infs to NaNs manually in the producer to
3784 * preserve that.
3785 */
3786 if (preserve_nans(linkage->consumer_builder.shader,
3787 load->bit_size)) {
3788 list_for_each_entry(struct list_node, iter,
3789 &slot->producer.stores, head) {
3790 nir_intrinsic_instr *store = iter->instr;
3791
3792 nir_builder *b = &linkage->producer_builder;
3793 b->cursor = nir_before_instr(&store->instr);
3794 nir_def *repl =
3795 build_convert_inf_to_nan(b, store->src[0].ssa);
3796 nir_src_rewrite(&store->src[0], repl);
3797 }
3798 }
3799 }
3800 }
3801 }
3802 }
3803
3804 /**
3805 * A helper function for compact_varyings(). Assign new slot indices for
3806 * existing slots of a certain vec4 type (FLAT, FP16, or FP32). Skip already-
3807 * assigned scalar slots (determined by assigned_mask) and don't assign to
3808 * vec4 slots that have an incompatible vec4 type (determined by
3809 * assigned_fs_vec4_type). This works with both 32-bit and 16-bit types.
3810 * slot_size is the component size in the units of 16 bits (2 means 32 bits).
3811 *
3812 * The number of slots to assign can optionally be limited by
3813 * max_assigned_slots.
3814 *
3815 * Return how many 16-bit slots are left unused in the last vec4 (up to 8
3816 * slots).
3817 */
3818 static unsigned
3819 fs_assign_slots(struct linkage_info *linkage,
3820 BITSET_WORD *assigned_mask,
3821 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
3822 BITSET_WORD *input_mask,
3823 enum fs_vec4_type fs_vec4_type,
3824 unsigned slot_size,
3825 unsigned max_assigned_slots,
3826 bool convergent,
3827 bool assign_colors,
3828 unsigned color_channel_rotate,
3829 nir_opt_varyings_progress *progress)
3830 {
3831 unsigned i, slot_index, max_slot;
3832 unsigned num_assigned_slots = 0;
3833
3834 if (assign_colors) {
3835 slot_index = VARYING_SLOT_COL0 * 8; /* starting slot */
3836 max_slot = VARYING_SLOT_COL1 * 8 + 8;
3837 } else {
3838 slot_index = VARYING_SLOT_VAR0 * 8; /* starting slot */
3839 max_slot = VARYING_SLOT_MAX;
3840 }
3841
3842 /* Assign new slot indices for scalar slots. */
3843 BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
3844 if (is_interpolated_color(linkage, i) != assign_colors)
3845 continue;
3846
3847 /* Skip indirectly-indexed scalar slots and slots incompatible
3848 * with the FS vec4 type.
3849 */
3850 while ((fs_vec4_type != FS_VEC4_TYPE_NONE &&
3851 assigned_fs_vec4_type[vec4_slot(slot_index)] !=
3852 FS_VEC4_TYPE_NONE &&
3853 assigned_fs_vec4_type[vec4_slot(slot_index)] !=
3854 fs_vec4_type) ||
3855 BITSET_TEST32(linkage->indirect_mask, slot_index) ||
3856 BITSET_TEST(assigned_mask, slot_index)) {
3857 /* If the FS vec4 type is incompatible, move to the next vec4. */
3858 if (fs_vec4_type != FS_VEC4_TYPE_NONE &&
3859 assigned_fs_vec4_type[vec4_slot(slot_index)] !=
3860 FS_VEC4_TYPE_NONE &&
3861 assigned_fs_vec4_type[vec4_slot(slot_index)] != fs_vec4_type) {
3862 slot_index = align(slot_index + slot_size, 8); /* move to next vec4 */
3863 continue;
3864 }
3865
3866 /* Copy the FS vec4 type if indexed indirectly, and move to
3867 * the next slot.
3868 */
3869 if (BITSET_TEST32(linkage->indirect_mask, slot_index)) {
3870 if (assigned_fs_vec4_type) {
3871 assigned_fs_vec4_type[vec4_slot(slot_index)] =
3872 linkage->fs_vec4_type[vec4_slot(slot_index)];
3873 }
3874 assert(slot_index % 2 == 0);
3875 slot_index += 2; /* increment by 32 bits */
3876 continue;
3877 }
3878
3879 /* This slot is already assigned (assigned_mask is set). Move to
3880 * the next one.
3881 */
3882 slot_index += slot_size;
3883 }
3884
3885 /* Assign color channels starting at the color_channel_rotate
3886 * component. Cases:
3887 * color_channel_rotate = 0: xyzw
3888 * color_channel_rotate = 1: yzwx
3889 * color_channel_rotate = 2: zwxy
3890 * color_channel_rotate = 3: wxyz
3891 *
3892 * This has no effect on behavior per se, but some drivers merge VARn
3893 * and COLn into one output if each defines different components.
3894 * For example, if we store VAR0.xy and COL0.z, a driver can merge them
3895 * by mapping the same output to 2 different inputs (VAR0 and COL0) if
3896 * color-specific behavior is per component, but it can't merge VAR0.xy
3897 * and COL0.x because they both define x.
3898 */
3899 unsigned new_slot_index = slot_index;
3900 if (assign_colors && color_channel_rotate) {
3901 new_slot_index = (vec4_slot(new_slot_index)) * 8 +
3902 (new_slot_index + color_channel_rotate * 2) % 8;
3903 }
3904
3905 /* Relocate the slot. */
3906 assert(slot_index < max_slot * 8);
3907 relocate_slot(linkage, &linkage->slot[i], i, new_slot_index,
3908 fs_vec4_type, convergent, progress);
3909
3910 for (unsigned i = 0; i < slot_size; ++i)
3911 BITSET_SET(assigned_mask, slot_index + i);
3912
3913 if (assigned_fs_vec4_type)
3914 assigned_fs_vec4_type[vec4_slot(slot_index)] = fs_vec4_type;
3915 slot_index += slot_size; /* move to the next slot */
3916 num_assigned_slots += slot_size;
3917
3918 /* Remove the slot from the input (unassigned) mask. */
3919 BITSET_CLEAR(input_mask, i);
3920
3921 /* The number of slots to assign can optionally be limited. */
3922 assert(num_assigned_slots <= max_assigned_slots);
3923 if (num_assigned_slots == max_assigned_slots)
3924 break;
3925 }
3926
3927 assert(slot_index <= max_slot * 8);
3928 /* Return how many 16-bit slots are left unused in the last vec4. */
3929 return (NUM_SCALAR_SLOTS - slot_index) % 8;
3930 }
3931
3932 /**
3933 * This is called once for 32-bit inputs and once for 16-bit inputs.
3934 * It assigns new slot indices to all scalar slots specified in the masks.
3935 *
3936 * \param linkage Linkage info
3937 * \param assigned_mask Which scalar (16-bit) slots are already taken.
3938 * \param assigned_fs_vec4_type Which vec4 slots have an assigned qualifier
3939 * and can only be filled with compatible slots.
3940 * \param interp_mask The list of interp slots to assign locations for.
3941 * \param flat_mask The list of flat slots to assign locations for.
3942 * \param convergent_mask The list of slots that have convergent output
3943 * stores.
3944 * \param sized_interp_type One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}.
3945 * \param slot_size 1 for 16 bits, 2 for 32 bits
3946 * \param color_channel_rotate Assign color channels starting with this index,
3947 * e.g. 2 assigns channels in the zwxy order.
3948 * \param assign_colors Whether to assign only color varyings or only
3949 * non-color varyings.
3950 */
3951 static void
3952 fs_assign_slot_groups(struct linkage_info *linkage,
3953 BITSET_WORD *assigned_mask,
3954 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
3955 BITSET_WORD *interp_mask,
3956 BITSET_WORD *flat_mask,
3957 BITSET_WORD *convergent_mask,
3958 BITSET_WORD *color_interp_mask,
3959 enum fs_vec4_type sized_interp_type,
3960 unsigned slot_size,
3961 bool assign_colors,
3962 unsigned color_channel_rotate,
3963 nir_opt_varyings_progress *progress)
3964 {
3965 /* Put interpolated slots first. */
3966 unsigned unused_interp_slots =
3967 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
3968 interp_mask, sized_interp_type,
3969 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
3970 color_channel_rotate, progress);
3971
3972 unsigned unused_color_interp_slots = 0;
3973 if (color_interp_mask) {
3974 unused_color_interp_slots =
3975 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
3976 color_interp_mask, FS_VEC4_TYPE_INTERP_COLOR,
3977 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
3978 color_channel_rotate, progress);
3979 }
3980
3981 /* Put flat slots next.
3982 * Note that only flat vec4 slots can have both 32-bit and 16-bit types
3983 * packed in the same vec4. 32-bit flat inputs are packed first, followed
3984 * by 16-bit flat inputs.
3985 */
3986 unsigned unused_flat_slots =
3987 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
3988 flat_mask, FS_VEC4_TYPE_FLAT,
3989 slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
3990 color_channel_rotate, progress);
3991
3992 /* Take the inputs with convergent values and assign them as follows.
3993 * Since they can be assigned as both interpolated and flat, we can
3994 * choose. We prefer them to be flat, but if interpolated vec4s have
3995 * unused components, try to fill those before starting a new flat vec4.
3996 *
3997 * First, fill the unused components of flat (if any), then fill
3998 * the unused components of interpolated (if any), and then make
3999 * the remaining convergent inputs flat.
4000 */
4001 if (unused_flat_slots) {
4002 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4003 convergent_mask, FS_VEC4_TYPE_FLAT,
4004 slot_size, unused_flat_slots, true, assign_colors,
4005 color_channel_rotate, progress);
4006 }
4007 if (unused_interp_slots) {
4008 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4009 convergent_mask, sized_interp_type,
4010 slot_size, unused_interp_slots, true, assign_colors,
4011 color_channel_rotate, progress);
4012 }
4013 if (unused_color_interp_slots) {
4014 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4015 convergent_mask, FS_VEC4_TYPE_INTERP_COLOR,
4016 slot_size, unused_color_interp_slots, true, assign_colors,
4017 color_channel_rotate, progress);
4018 }
4019 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4020 convergent_mask, FS_VEC4_TYPE_FLAT,
4021 slot_size, NUM_SCALAR_SLOTS, true, assign_colors,
4022 color_channel_rotate, progress);
4023 }
4024
4025 static void
4026 vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage,
4027 BITSET_WORD *input_mask,
4028 unsigned *slot_index,
4029 unsigned *patch_slot_index,
4030 unsigned slot_size,
4031 nir_opt_varyings_progress *progress)
4032 {
4033 unsigned i;
4034
4035 BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
4036 if (i >= VARYING_SLOT_PATCH0 * 8 && i < VARYING_SLOT_TESS_MAX * 8) {
4037 /* Skip indirectly-indexed scalar slots at 32-bit granularity.
4038 * We have to do it at this granularity because only the low 16-bit
4039 * slot of a 32-bit input is set in the mask, not the high 16-bit slot.
4040 */
4041 while (BITSET_TEST32(linkage->indirect_mask, *patch_slot_index))
4042 *patch_slot_index = align(*patch_slot_index + 1, 2);
4043
4044 assert(*patch_slot_index < VARYING_SLOT_TESS_MAX * 8);
4045 relocate_slot(linkage, &linkage->slot[i], i, *patch_slot_index,
4046 FS_VEC4_TYPE_NONE, false, progress);
4047 *patch_slot_index += slot_size; /* increment by 16 or 32 bits */
4048 } else {
4049 /* If the driver wants to use POS and we've already used it, move
4050 * to VARn.
4051 */
4052 if (*slot_index < VARYING_SLOT_VAR0 &&
4053 *slot_index >= VARYING_SLOT_POS + 8)
4054 *slot_index = VARYING_SLOT_VAR0 * 8;
4055
4056 /* Skip indirectly-indexed scalar slots at 32-bit granularity. */
4057 while (BITSET_TEST32(linkage->indirect_mask, *slot_index))
4058 *slot_index = align(*slot_index + 1, 2);
4059
4060 assert(*slot_index < VARYING_SLOT_MAX * 8);
4061 relocate_slot(linkage, &linkage->slot[i], i, *slot_index,
4062 FS_VEC4_TYPE_NONE, false, progress);
4063 *slot_index += slot_size; /* increment by 16 or 32 bits */
4064 }
4065 }
4066 }
4067
4068 /**
4069 * Compaction means scalarizing and then packing scalar components into full
4070 * vec4s, so that we minimize the number of unused components in vec4 slots.
4071 *
4072 * Compaction is as simple as moving a scalar input from one scalar slot
4073 * to another. Indirectly-indexed slots are not touched, so the compaction
4074 * has to compact around them. Unused 32-bit components of indirectly-indexed
4075 * slots are still filled, so no space is wasted there, but if indirectly-
4076 * indexed 16-bit components have the other 16-bit half unused, that half is
4077 * wasted.
4078 */
4079 static void
4080 compact_varyings(struct linkage_info *linkage,
4081 nir_opt_varyings_progress *progress)
4082 {
4083 if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
4084 /* These arrays are used to track which scalar slots we've already
4085 * assigned. We can fill unused components of indirectly-indexed slots,
4086 * but only if the vec4 slot type (FLAT, FP16, or FP32) is the same.
4087 * Assign vec4 slot type separately, skipping over already assigned
4088 * scalar slots.
4089 */
4090 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS] = {0};
4091 BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS);
4092 BITSET_ZERO(assigned_mask);
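/* Assignment order, where each later group fills leftover components of
 * earlier vec4s when the vec4 type allows it: 32-bit interpolated, flat,
 * and convergent slots first, then their 16-bit counterparts, then
 * explicit and strict-explicit interpolation, per-primitive, transform-
 * feedback-only slots, and finally colors, which are compacted only among
 * themselves.
 */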
4093
4094 fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4095 linkage->interp_fp32_mask, linkage->flat32_mask,
4096 linkage->convergent32_mask, NULL,
4097 FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress);
4098
4099 /* Now do the same thing, but for 16-bit inputs. */
4100 fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4101 linkage->interp_fp16_mask, linkage->flat16_mask,
4102 linkage->convergent16_mask, NULL,
4103 FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress);
4104
4105 /* Assign INTERP_MODE_EXPLICIT. Both FP32 and FP16 can occupy the same
4106 * slot because the vertex data is passed to FS as-is.
4107 */
4108 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4109 linkage->interp_explicit32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4110 2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4111
4112 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4113 linkage->interp_explicit16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4114 1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4115
4116 /* Same for strict vertex ordering. */
4117 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4118 linkage->interp_explicit_strict32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4119 2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4120
4121 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4122 linkage->interp_explicit_strict16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4123 1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4124
4125 /* Same for per-primitive. */
4126 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4127 linkage->per_primitive32_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4128 2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4129
4130 fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4131 linkage->per_primitive16_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4132 1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4133
4134 /* Put transform-feedback-only outputs last. */
4135 fs_assign_slots(linkage, assigned_mask, NULL,
4136 linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4137 NUM_SCALAR_SLOTS, false, false, 0, progress);
4138
4139 fs_assign_slots(linkage, assigned_mask, NULL,
4140 linkage->xfb16_only_mask, FS_VEC4_TYPE_NONE, 1,
4141 NUM_SCALAR_SLOTS, false, false, 0, progress);
4142
4143 /* Color varyings are only compacted among themselves. */
4144 /* Determine whether the shader contains any color varyings. */
4145 unsigned col0 = VARYING_SLOT_COL0 * 8;
4146 bool has_colors =
4147 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_mask, col0, 16,
4148 0) ||
4149 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->convergent32_mask, col0, 16,
4150 0) ||
4151 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_mask, col0, 16, 0) ||
4152 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->flat32_mask, col0, 16, 0) ||
4153 !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0, 16, 0);
4154
4155 if (has_colors) {
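/* Start color channels right after the last 32-bit component used by
 * non-color varyings: BITSET_LAST_BIT counts 16-bit slots, hence the
 * rounded-up division by 2, wrapped to the vec4 size.
 */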
4156 unsigned color_channel_rotate =
4157 DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4;
4158
4159 fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4160 linkage->interp_fp32_mask, linkage->flat32_mask,
4161 linkage->convergent32_mask, linkage->color32_mask,
4162 FS_VEC4_TYPE_INTERP_FP32, 2, true,
4163 color_channel_rotate, progress);
4164
4165 /* Put transform-feedback-only outputs last. */
4166 fs_assign_slots(linkage, assigned_mask, NULL,
4167 linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4168 NUM_SCALAR_SLOTS, false, true, color_channel_rotate,
4169 progress);
4170 }
4171 } else {
4172 /* The consumer is a TCS, TES, or GS.
4173 *
4174 * "use_pos" says whether the driver prefers that compaction with non-FS
4175 * consumers puts varyings into POS first before using any VARn.
4176 */
4177 bool use_pos = !(linkage->producer_builder.shader->options->io_options &
4178 nir_io_dont_use_pos_for_non_fs_varyings);
4179 unsigned slot_index = (use_pos ? VARYING_SLOT_POS
4180 : VARYING_SLOT_VAR0) * 8;
4181 unsigned patch_slot_index = VARYING_SLOT_PATCH0 * 8;
4182
4183 /* Compact 32-bit inputs. */
4184 vs_tcs_tes_gs_assign_slots(linkage, linkage->flat32_mask, &slot_index,
4185 &patch_slot_index, 2, progress);
4186
4187 /* Compact 16-bit inputs, allowing them to share vec4 slots with 32-bit
4188 * inputs.
4189 */
4190 vs_tcs_tes_gs_assign_slots(linkage, linkage->flat16_mask, &slot_index,
4191 &patch_slot_index, 1, progress);
4192
4193 /* Put no-varying slots last. These are TCS outputs read by TCS but not
4194 * TES.
4195 */
4196 vs_tcs_tes_gs_assign_slots(linkage, linkage->no_varying32_mask, &slot_index,
4197 &patch_slot_index, 2, progress);
4198 vs_tcs_tes_gs_assign_slots(linkage, linkage->no_varying16_mask, &slot_index,
4199 &patch_slot_index, 1, progress);
4200
4201 assert(slot_index <= VARYING_SLOT_MAX * 8);
4202 assert(patch_slot_index <= VARYING_SLOT_TESS_MAX * 8);
4203 }
4204 }
4205
4206 /******************************************************************
4207 * PUTTING IT ALL TOGETHER
4208 ******************************************************************/
4209
4210 static void
init_linkage(nir_shader * producer,nir_shader * consumer,bool spirv,unsigned max_uniform_components,unsigned max_ubos_per_stage,struct linkage_info * linkage)4211 init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv,
4212 unsigned max_uniform_components, unsigned max_ubos_per_stage,
4213 struct linkage_info *linkage)
4214 {
4215 *linkage = (struct linkage_info){
4216 .spirv = spirv,
4217 .can_mix_convergent_flat_with_interpolated =
4218 consumer->info.stage == MESA_SHADER_FRAGMENT &&
4219 consumer->options->io_options &
4220 nir_io_mix_convergent_flat_with_interpolated,
4221 .producer_stage = producer->info.stage,
4222 .consumer_stage = consumer->info.stage,
4223 .producer_builder =
4224 nir_builder_create(nir_shader_get_entrypoint(producer)),
4225 .consumer_builder =
4226 nir_builder_create(nir_shader_get_entrypoint(consumer)),
4227
4228 .max_varying_expression_cost =
4229 producer->options->varying_expression_max_cost ?
4230 producer->options->varying_expression_max_cost(producer, consumer) : 0,
4231
4232 .linear_mem_ctx = linear_context(ralloc_context(NULL)),
4233 };
4234
4235 for (unsigned i = 0; i < ARRAY_SIZE(linkage->slot); i++) {
4236 list_inithead(&linkage->slot[i].producer.loads);
4237 list_inithead(&linkage->slot[i].producer.stores);
4238 list_inithead(&linkage->slot[i].consumer.loads);
4239 }
4240
4241 /* Preparation. */
4242 nir_shader_intrinsics_pass(consumer, gather_inputs, 0, linkage);
4243 nir_shader_intrinsics_pass(producer, gather_outputs, 0, linkage);
4244 tidy_up_indirect_varyings(linkage);
4245 determine_uniform_movability(linkage, max_uniform_components);
4246 determine_ubo_movability(linkage, max_ubos_per_stage);
4247 }
4248
4249 static void
4250 free_linkage(struct linkage_info *linkage)
4251 {
4252 ralloc_free(ralloc_parent_of_linear_context(linkage->linear_mem_ctx));
4253 }
4254
4255 static void
4256 print_shader_linkage(nir_shader *producer, nir_shader *consumer)
4257 {
4258 struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
4259
4260 init_linkage(producer, consumer, false, 0, 0, linkage);
4261 print_linkage(linkage);
4262 free_linkage(linkage);
4263 FREE(linkage);
4264 }
4265
4266 /**
4267 * Run lots of optimizations on varyings. See the description at the beginning
4268 * of this file.
4269 */
4270 nir_opt_varyings_progress
4271 nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
4272 unsigned max_uniform_components, unsigned max_ubos_per_stage)
4273 {
4274 /* Task -> Mesh I/O uses payload variables and not varying slots,
4275 * so this pass can't do anything about it.
4276 */
4277 if (producer->info.stage == MESA_SHADER_TASK)
4278 return 0;
4279
4280 nir_opt_varyings_progress progress = 0;
4281 struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
4282 if (linkage == NULL)
4283 return 0;
4284
4285 /* Producers before a fragment shader must have up-to-date vertex
4286 * divergence information.
4287 */
4288 if (consumer->info.stage == MESA_SHADER_FRAGMENT) {
4289 /* Required by the divergence analysis. */
4290 NIR_PASS(_, producer, nir_convert_to_lcssa, true, true);
4291 nir_vertex_divergence_analysis(producer);
4292 }
4293
4294 init_linkage(producer, consumer, spirv, max_uniform_components,
4295 max_ubos_per_stage, linkage);
4296
4297 /* Part 1: Run optimizations that only remove varyings (though they can
4298 * move instructions between shaders).
4299 */
4300 remove_dead_varyings(linkage, &progress);
4301 propagate_uniform_expressions(linkage, &progress);
4302
4303 /* Part 2: Deduplicate outputs. */
4304 deduplicate_outputs(linkage, &progress);
4305
4306 /* Run CSE on the consumer after output deduplication because duplicated
4307 * loads can prevent finding the post-dominator for inter-shader code
4308 * motion.
4309 */
4310 NIR_PASS(_, consumer, nir_opt_cse);
4311
4312 /* Re-gather linkage info after CSE. */
4313 free_linkage(linkage);
4314 init_linkage(producer, consumer, spirv, max_uniform_components,
4315 max_ubos_per_stage, linkage);
4316 /* This must be done again to clean up bitmasks in linkage. */
4317 remove_dead_varyings(linkage, &progress);
4318
4319 /* This must be done after deduplication and before inter-shader code
4320 * motion.
4321 */
4322 tidy_up_convergent_varyings(linkage);
4323 find_open_coded_tes_input_interpolation(linkage);
4324
4325 /* Part 3: Run optimizations that completely change varyings. */
4326 #if PRINT
4327 int i = 0;
4328 puts("Before:");
4329 nir_print_shader(linkage->producer_builder.shader, stdout);
4330 nir_print_shader(linkage->consumer_builder.shader, stdout);
4331 print_linkage(linkage);
4332 puts("");
4333 #endif
4334
4335 while (backward_inter_shader_code_motion(linkage, &progress)) {
4336 #if PRINT
4337 i++;
4338 printf("Finished: %i\n", i);
4339 nir_print_shader(linkage->producer_builder.shader, stdout);
4340 nir_print_shader(linkage->consumer_builder.shader, stdout);
4341 print_linkage(linkage);
4342 puts("");
4343 #endif
4344 }
4345
4346 /* Part 4: Do compaction. */
4347 compact_varyings(linkage, &progress);
4348
4349 nir_metadata_preserve(linkage->producer_builder.impl,
4350 progress & nir_progress_producer ?
4351 (nir_metadata_control_flow) :
4352 nir_metadata_all);
4353 nir_metadata_preserve(linkage->consumer_builder.impl,
4354 progress & nir_progress_consumer ?
4355 (nir_metadata_control_flow) :
4356 nir_metadata_all);
4357 free_linkage(linkage);
4358 FREE(linkage);
4359
4360 if (progress & nir_progress_producer)
4361 nir_validate_shader(producer, "nir_opt_varyings");
4362 if (progress & nir_progress_consumer)
4363 nir_validate_shader(consumer, "nir_opt_varyings");
4364
4365 return progress;
4366 }
4367