1 /*
2  * Copyright © 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* Introduction
8  * ============
9  *
10  * This pass optimizes varyings between 2 shaders, which means dead input/
11  * output removal, constant and uniform load propagation, deduplication,
12  * compaction, and inter-shader code motion. This is used during the shader
13  * linking process.
14  *
15  *
16  * Notes on behavior
17  * =================
18  *
19  * The pass operates on scalar varyings using 32-bit and 16-bit types. Vector
20  * varyings are not allowed.
21  *
22  * Indirectly-indexed varying slots (not vertices) are not optimized or
23  * compacted, but unused slots of indirectly-indexed varyings are still filled
24  * with directly-indexed varyings during compaction. Indirectly-indexed
25  * varyings are still removed if they are unused by the other shader.
26  *
27  * Indirect vertex indexing doesn't prevent optimizations, but compromises
28  * are made depending on how the vertices are accessed. Indirectly-indexed
29  * vertices are common in TCS, TES, and GS, so there is a desire to optimize
30  * them as much as possible. More on that in various sections below.
31  *
32  * Transform feedback doesn't prevent most optimizations such as constant
33  * propagation and compaction. Shaders can be left with output stores that set
34  * the no_varying flag, meaning the output is not consumed by the next shader;
35  * in that case the optimizations did their job and the output is now only
36  * consumed by transform feedback.
37  *
38  * All legacy varying slots are optimized when it's allowed.
39  *
40  *
41  * Convergence property of shader outputs
42  * ======================================
43  *
44  * If an output stores an SSA value that is convergent, all stores of that
45  * output appear in unconditional blocks or in conditional blocks with
46  * a convergent entry condition, and the shader is not GS, then all vertices
47  * of that output have the same value, so the output can be promoted to flat
48  * because all interpolation modes lead to the same result as flat. Such
49  * outputs are opportunistically compacted with both flat and non-flat
50  * varyings based on whichever has unused slots in their vec4s. This pass
51  * refers to such inputs, outputs, and varyings as "convergent" (meaning
52  * all vertices are always equal).
53  *
54  * By default, flat varyings are the only ones that are never considered
55  * convergent because we want the flexibility to pack convergent varyings
56  * with both flat and non-flat varyings, and since flat varyings can contain
57  * integers and doubles, we can never interpolate them as FP32 or FP16.
58  * Optimizations start with separate interpolated, flat, and convergent
59  * groups of varyings, and they choose whether to promote convergent
60  * varyings to interpolated or flat, or to leave that decision to the end,
61  * when compaction happens.
62  *
63  * The above default behavior doesn't apply when the hw supports convergent
64  * flat loads with interpolated vec4 slots (controlled by a NIR option).
65  *
66  * TES patch inputs are always convergent because they are uniform within
67  * a primitive.
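 *
 * For illustration, a hypothetical GLSL-like sketch of a convergent output:
 * the stored value is computed only from uniforms and is written in
 * unconditional code, so every vertex of a primitive gets the same value and
 * the varying can be promoted to flat regardless of its interpolation
 * qualifier:
 * ```
 *    out vec2 v_offset;               // declared "smooth"
 *
 *    void main() {
 *       v_offset = u_scale.xy * 2.0;  // convergent: equal for all vertices
 *    }
 * ```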
68  *
69  *
70  * Optimization steps
71  * ==================
72  *
73  * 1. Determine which varying slots can be optimized and how.
74  *
75  *    * When a varying is said to be "optimized" in the following text, it
76  *      means all optimizations are performed, such as removal, constant
77  *      propagation, and deduplication.
78  *    * All VARn, PATCHn, and FOGC varyings are always optimized and
79  *      compacted.
80  *    * PRIMITIVE_ID is treated as VARn in (GS, FS).
81  *    * TEXn are removed if they are dead (except TEXn inputs, which can't be
82  *      removed due to being affected by the coord replace state). TEXn also
83  *      can't be optimized or compacted due to being affected by the coord
84  *      replace state. TEXn not consumed by FS are treated as VARn.
85  *    * COLn and BFCn only propagate constants if they are between 0 and 1
86  *      because of the clamp vertex color state, and they are only
87  *      deduplicated and compacted among themselves because they are affected
88  *      by the flat shade, provoking vertex, two-side color selection, and
89  *      clamp vertex color states. COLn and BFCn not consumed by FS are
90  *      treated as VARn.
91  *    * All system value outputs like POS, PSIZ, CLIP_DISTn, etc. can't be
92  *      removed, but they are demoted to sysval-only outputs by setting
93  *      the "no_varying" flag (i.e. they can be removed as varyings), so
94  *      drivers should look at the "no_varying" flag. If an output is not
95  *      a sysval output in a specific stage, it's treated as VARn. (such as
96  *      POS in TCS)
97  *    * TESS_LEVEL_* inputs in TES can't be touched if TCS is missing.
98  *
99  * 2. Remove unused inputs and outputs
100  *
101  *    * Outputs not used in the next shader are removed.
102  *    * Inputs not initialized by the previous shader are replaced with undef
103  *      except:
104  *      * LAYER and VIEWPORT are replaced with 0 in FS.
105  *      * TEXn.xy is untouched because the coord replace state can set it, and
106  *        TEXn.zw is replaced by (0, 1), which is equal to the coord replace
107  *        value.
108  *    * Output loads that have no output stores anywhere in the shader are
109  *      replaced with undef. (for TCS, though it works with any shader)
110  *    * Output stores with transform feedback are preserved, but get
111  *      the "no_varying" flag, meaning they are not consumed by the next
112  *      shader stage. Later, transform-feedback-only varyings are compacted
113  *      (relocated) such that they are always last.
114  *    * TCS outputs that are read by TCS, but not used by TES get
115  *      the "no_varying" flag to indicate that they are only read by TCS and
116  *      not consumed by TES. Later, such TCS outputs are compacted (relocated)
117  *      such that they are always last to keep all outputs consumed by TES
118  *      consecutive without holes.
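 *
 *    For example (hypothetical GLSL-like sketch of the first two rules
 *    above):
 *    ```
 *       // Producer (VS)               // Consumer (FS)
 *       out_a = compute_a();           x = in_b;   // in_b is never written
 *
 *       // After this step, the out_a store is removed because the FS never
 *       // reads it, and the in_b load is replaced with undef (subject to
 *       // the FS exceptions listed above).
 *    ```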
119  *
120  * 3. Constant, uniform, UBO load, and uniform expression propagation
121  *
122  *    * Define "uniform expressions" as ALU expressions only sourcing
123  *      constants, uniforms, and UBO loads.
124  *    * Constants, uniforms, UBO loads, and uniform expressions stored
125  *      in outputs are moved into the next shader, and the outputs are removed.
126  *    * The same propagation is done from output stores to output loads.
127  *      (for TCS, though it works with any shader)
128  *    * If there are multiple stores to the same output, all such stores
129  *      should store the same constant, uniform, UBO load, or uniform
130  *      expression for the expression to be propagated. If an output has
131  *      multiple vertices, all vertices should store the same expression.
132  *    * nir->options has callbacks that are used to estimate the cost of
133  *      uniform expressions that drivers can set to control the complexity of
134  *      uniform expressions that are propagated. This is to ensure that
135  *      we don't increase the GPU overhead measurably by moving code across
136  *      pipeline stages that amplify GPU work.
137  *    * Special cases:
138  *      * Constant COLn and BFCn are propagated only if the constants are
139  *        in the [0, 1] range because of the clamp vertex color state.
140  *        If both COLn and BFCn are written, they must write the same
141  *        constant. If BFCn is written but not COLn, the constant is
142  *        propagated from BFCn to COLn.
143  *      * TEX.xy is untouched because of the coord replace state.
144  *        If TEX.zw is (0, 1), only those constants are propagated because
145  *        they match the coord replace values.
146  *      * CLIP_DISTn, LAYER and VIEWPORT are always propagated.
147  *        Eliminated output stores get the "no_varying" flag if they are also
148  *        xfb stores or write sysval outputs.
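 *
 *    For example (hypothetical sketch of constant propagation):
 *    ```
 *       // Producer before:            // Consumer before:
 *       out0 = 0.5;                    x = in0 * y;
 *
 *       // Producer after:             // Consumer after:
 *       // (the out0 store is gone)    x = 0.5 * y;
 *    ```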
149  *
150  * 4. Remove duplicated output components
151  *
152  *    * By comparing SSA defs.
153  *    * If there are multiple stores to the same output, all such stores
154  *      should store the same SSA as all stores of another output for
155  *      the output to be considered duplicated. If an output has multiple
156  *      vertices, all vertices should store the same SSA.
157  *    * Deduplication can only be done between outputs of the same category.
158  *      Those are: interpolated, patch, flat, interpolated color, flat color,
159  *                 and conditionally interpolated color based on the flat
160  *                 shade state
161  *    * Everything is deduplicated except TEXn due to the coord replace state.
162  *    * Eliminated output stores get the "no_varying" flag if they are also
163  *      xfb stores or write sysval outputs.
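 *
 *    For example (hypothetical sketch of deduplication):
 *    ```
 *       // Producer:                   // Consumer:
 *       out0 = value;                  a = in0;
 *       out1 = value;  // removed      b = in1;  // rewritten to use in0
 *    ```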
164  *
165  * 5. Backward inter-shader code motion
166  *
167  *    "Backward" refers to moving code in the direction opposite to shader
168  *    execution, i.e. moving code from the consumer to the producer.
169  *
170  *    Fragment shader example:
171  *    ```
172  *       result = input0 * uniform + input1 * constant + UBO.variable;
173  *    ```
174  *
175  *    The computation of "result" in the above example can be moved into
176  *    the previous shader and both inputs can be replaced with a new input
177  *    holding the value of "result", thus making the shader smaller and
178  *    possibly reducing the number of inputs, uniforms, and UBOs by 1.
179  *
180  *    Such code motion can be performed for any expression sourcing only
181  *    inputs, constants, and uniforms except for fragment shaders, which can
182  *    also do it but with the following limitations:
183  *    * Only the following transformations, and any composition of them
184  *      (such as lerp), can be performed with interpolated inputs; all of
185  *      them can be proven mathematically:
186  *      * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
187  *      * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
188  *      * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
189  *        * all of these transformations are considered "inexact" in NIR
190  *        * interp interpolates an input according to the barycentric
191  *          coordinates (i, j), which are different for perspective,
192  *          noperspective, center, centroid, sample, at_offset, and at_sample
193  *          modes.
194  *        * convergent_expr is any expression sourcing only constants,
195  *          uniforms, and convergent inputs. The only requirement on
196  *          convergent_expr is that it doesn't vary between vertices of
197  *          the same primitive, but it can vary between primitives.
198  *    * If inputs are flat or convergent, there are no limitations on
199  *      expressions that can be moved.
200  *    * Interpolated and flat inputs can't mix in the same expression, but
201  *      convergent inputs can mix with both.
202  *    * The interpolation qualifier of the new input is inherited from
203  *      the removed non-convergent inputs that should all have the same (i, j).
204  *      If there are no non-convergent inputs, then the new input is declared
205  *      as flat (for simplicity; we can't choose the barycentric coordinates
206  *      at random because AMD doesn't like when there are multiple sets of
207  *      barycentric coordinates in the same shader unnecessarily).
208  *    * Inf values break code motion across interpolation. See the section
209  *      discussing how we handle it near the end.
210  *
211  *    The above rules also apply to open-coded TES input interpolation, which
212  *    is handled the same as FS input interpolation. The only differences are:
213  *    * Open-coded TES input interpolation must match one of the allowed
214  *      equations. Different interpolation equations are treated the same as
215  *      different interpolation qualifiers in FS.
216  *    * Patch varyings are always treated as convergent.
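 *
 *    For illustration, a hypothetical open-coded TES interpolation that has
 *    the form of one of the allowed equations (triangle barycentric lerp):
 *    ```
 *       result = in0 * gl_TessCoord.x + in1 * gl_TessCoord.y +
 *                in2 * gl_TessCoord.z;
 *    ```
 *    where in0..in2 are per-vertex loads from the same slot. Whether this
 *    exact form is matched depends on the equations the pass recognizes.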
217  *
218  *    Prerequisites:
219  *    * We need a post-dominator tree that is constructed from a graph where
220  *      vertices are instructions and directed edges going into them are
221  *      the values of their source operands. This is different from how NIR
222  *      dominance works, which represents all instructions within a basic
223  *      block as a linear chain of vertices in the graph.
224  *      In our graph, all loads without source operands and all constants are
225  *      entry nodes in the graph, and all stores and discards are exit nodes
226  *      in the graph. Each shader can have multiple disjoint graphs where
227  *      the Lowest Common Ancestor of 2 instructions doesn't exist.
228  *    * Given the above definition, the instruction whose result is the best
229  *      candidate for a new input is the farthest instruction that
230  *      post-dominates one or more inputs and is movable between shaders.
231  *
232  *    Algorithm Idea Part 1: Search
233  *    * Pick any input load that is hypothetically movable and call it
234  *      the iterator.
235  *    * Get the immediate post-dominator of the iterator, and if it's movable,
236  *      replace the iterator with it.
237  *    * Repeat the previous step until the obtained immediate post-dominator
238  *      is not movable.
239  *    * The iterator now contains the farthest post-dominator that is movable.
240  *    * Gather all input loads that the post-dominator consumes.
241  *    * For each of those input loads, all matching output stores must be
242  *      in the same block (because they will be replaced by a single store).
243  *
244  *    Algorithm Idea Part 2: Code Motion
245  *    * Clone the post-dominator in the producer except input loads, which
246  *      should be replaced by stored output values. Uniform and UBO loads,
247  *      if any, should be cloned too.
248  *    * Remove the original output stores.
249  *    * Replace the post-dominator from the consumer with a new input load.
250  *    * The step above makes the post-dominated input load that we picked
251  *      at the beginning dead, but other input loads used by the post-
252  *      dominator might still have other uses (shown in the example below).
253  *
254  *    Example SSA-use graph - initial shader and the result:
255  *    ```
256  *          input0 input1             input0 input1
257  *              \   / \                  |      \
258  *    constant   alu  ...    ======>     |     ...
259  *           \   /
260  *            alu
261  *      (post-dominator)
262  *    ```
263  *
264  *    Description:
265  *       On the right, the algorithm moved the constant and both ALU opcodes
266  *       into the previous shader and input0 now contains the value of
267  *       the post-dominator. input1 stays the same because it still has one
268  *       use left. If input1 hadn't had the other use, it would have been
269  *       removed.
270  *
271  *    If the algorithm moves any code, the algorithm is repeated until there
272  *    is no code that it can move.
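 *
 *    A minimal sketch of the search loop from Part 1 (hypothetical helper
 *    names, not the actual implementation):
 *    ```
 *       nir_instr *iter = movable_input_load;
 *
 *       // Walk towards the farthest post-dominator that is still movable.
 *       for (;;) {
 *          nir_instr *ipdom = get_immediate_post_dominator(iter);
 *          if (!ipdom || !is_movable_between_shaders(ipdom))
 *             break;
 *          iter = ipdom;
 *       }
 *       // "iter" is the candidate for a new input; gather the input loads
 *       // it consumes and check that their output stores share one block.
 *    ```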
273  *
274  *    Which shader pairs are supported:
275  *    * (VS, FS), (TES, FS): yes, fully
276  *      * Limitation: If Infs must be preserved, no code is moved across
277  *                    interpolation, so only flat varyings are optimized.
278  *    * (VS, TCS), (VS, GS), (TES, GS): no, but possible -- TODO
279  *      * Current behavior:
280  *        * Per-vertex inputs are rejected.
281  *      * Possible solution:
282  *        * All input loads used by an accepted post-dominator must use
283  *          the same vertex index. The post-dominator must use all loads with
284  *          that vertex index.
285  *        * If a post-dominator is found for an input load from a specific
286  *          slot, all other input loads from that slot must also have
287  *          an accepted post-dominator, and all such post-dominators should
288  *          be identical expressions.
289  *    * (TCS, TES), (VS, TES): yes, with limitations
290  *      * Limitations:
291  *        * Only 1 store and 1 load per slot allowed.
292  *        * No output loads allowed.
293  *        * All stores used by an accepted post-dominator must be in
294  *          the same block.
295  *        * TCS barriers don't matter because there are no output loads.
296  *        * Patch varyings are handled trivially with the above constraints.
297  *        * Per-vertex outputs should only be indexed by gl_InvocationID.
298  *        * An interpolated TES load is any ALU instruction that computes
299  *          the result of linear interpolation of per-vertex inputs from
300  *          the same slot using gl_TessCoord. If such an ALU instruction is
301  *          found, it must be the only one, and all per-vertex input loads
302  *          from that slot must feed into it. The interpolation equation must
303  *          be equal to one of the allowed equations. Then the same rules as
304  *          for interpolated FS inputs are used, treating different
305  *          interpolation equations just like different interpolation
306  *          qualifiers.
307  *        * Patch inputs are treated as convergent, which means they are
308  *          allowed to be in the same movable expression as interpolated TES
309  *          inputs, and the same rules as for convergent FS inputs apply.
310  *    * (GS, FS), (MS, FS): no
311  *      * Workaround: Add a passthrough VS between GS/MS and FS, run
312  *                    the pass on the (VS, FS) pair to move code out of FS,
313  *                    and inline that VS at the end of your hw-specific
314  *                    GS/MS if it's possible.
315  *    * (TS, MS): no
316  *
317  *    The disadvantage of using the post-dominator tree is that it's a tree,
318  *    which means there is only 1 post-dominator of each input. This example
319  *    shows a case that could be optimized by replacing 3 inputs with 2 inputs,
320  *    reducing the number of inputs by 1, but the immediate post-dominator of
321  *    all input loads is NULL:
322  *    ```
323  *        temp0 = input0 + input1 + input2;
324  *        temp1 = input0 + input1 * const1 + input2 * const2;
325  *    ```
326  *
327  *    If there is a graph algorithm that returns the best solution to
328  *    the above case (which is temp0 and temp1 to replace all 3 inputs), let
329  *    us know.
330  *
331  * 6. Forward inter-shader code motion
332  *
333  *    TODO: Not implemented. The text below is a draft of the description.
334  *
335  *    "Forward" refers to moving code in the direction that shaders are
336  *    executed, i.e. moving code from the producer to the consumer.
337  *
338  *    Vertex shader example:
339  *    ```
340  *       output0 = value + 1;
341  *       output1 = value * 2;
342  *    ```
343  *
344  *    Both outputs can be replaced by 1 output storing "value", and both ALU
345  *    operations can be moved into the next shader.
346  *
347  *    The same dominance algorithm as in the previous optimization is used,
348  *    except that:
349  *    * Instead of inputs, we use outputs.
350  *    * Instead of a post-dominator tree, we use a dominator tree of the exact
351  *      same graph.
352  *
353  *    The algorithm idea is: For each pair of output stores, find their
354  *    Lowest Common Ancestor in the dominator tree, and that's a candidate
355  *    for a new output. All movable loads like load_const should be removed
356  *    from the graph, otherwise the LCA wouldn't exist.
357  *
358  *    The limitations on instructions that can be moved between shaders across
359  *    interpolated loads are exactly the same as the previous optimization.
360  *
361  *    nir->options has callbacks that are used to estimate the cost of
362  *    expressions that drivers can set to control the complexity of
363  *    expressions that can be moved to later shaders. This is to ensure that
364  *    we don't increase the GPU overhead measurably by moving code across
365  *    pipeline stages that amplify GPU work.
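 *
 *    A sketch of the candidate search for this draft (hypothetical helper
 *    names; none of this is implemented):
 *    ```
 *       for (each pair of output stores (store_a, store_b)) {
 *          nir_instr *lca = lowest_common_ancestor(dom_tree, store_a, store_b);
 *          if (lca && is_movable_between_shaders(lca))
 *             add_candidate_output(lca);
 *       }
 *    ```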
366  *
367  * 7. Compaction to vec4 slots (AKA packing)
368  *
369  *    First, varyings are divided into these groups, and each group is
370  *    compacted separately with some exceptions listed below:
371  *
372  *    Non-FS groups (patch and non-patch are packed separately):
373  *    * 32-bit flat
374  *    * 16-bit flat
375  *    * 32-bit no-varying (TCS outputs read by TCS but not TES)
376  *    * 16-bit no-varying (TCS outputs read by TCS but not TES)
377  *
378  *    FS groups:
379  *    * 32-bit interpolated (always FP32)
380  *    * 32-bit flat
381  *    * 32-bit convergent (always FP32)
382  *    * 16-bit interpolated (always FP16)
383  *    * 16-bit flat
384  *    * 16-bit convergent (always FP16)
385  *    * 32-bit transform feedback only
386  *    * 16-bit transform feedback only
387  *
388  *    Then, all scalar varyings are relocated into new slots, starting from
389  *    VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit
390  *    increments. Rules:
391  *    * Both 32-bit and 16-bit flat varyings are packed in the same vec4.
392  *    * Convergent varyings can be packed with interpolated varyings of
393  *      the same type or flat. The group to pack with is chosen based on
394  *      whichever has unused scalar slots because we want to reduce the total
395  *      number of vec4s. After filling all unused scalar slots, the remaining
396  *      convergent varyings are packed as flat.
397  *    * Transform-feedback-only slots and no-varying slots are packed last,
398  *      so that they are consecutive and not intermixed with varyings consumed
399  *      by the next shader stage, and 32-bit and 16-bit slots are packed in
400  *      the same vec4. This allows reducing memory for outputs by ignoring
401  *      the trailing outputs that the next shader stage doesn't read.
402  *
403  *    In the end, we should end up with these groups for FS:
404  *    * 32-bit interpolated (always FP32) on separate vec4s
405  *    * 16-bit interpolated (always FP16) on separate vec4s
406  *    * 32-bit flat and 16-bit flat, mixed in the same vec4
407  *    * 32-bit and 16-bit transform feedback only, sharing vec4s with flat
408  *
409  *    Colors are compacted the same but separately because they can't be mixed
410  *    with VARn. Colors are divided into 3 FS groups. They are:
411  *    * 32-bit maybe-interpolated (affected by the flat-shade state)
412  *    * 32-bit interpolated (not affected by the flat-shade state)
413  *    * 32-bit flat (not affected by the flat-shade state)
414  *
415  *    To facilitate driver-specific output merging, color channels are
416  *    assigned in a rotated order depending on which VARn channel is the
417  *    first unused one. For example, if the first unused VARn channel is VAR0.z,
418  *    color channels are allocated in this order:
419  *       COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
420  *    The reason is that some drivers merge outputs if each output sets
421  *    different components, for example 2 outputs defining VAR0.xy and COL0.z.
422  *    If drivers do interpolation in the fragment shader and color
423  *    interpolation can differ for each component, VAR0.xy and COL0.z can be
424  *    stored in the same output storage slot, and the consumer can load VAR0
425  *    and COL0 from the same slot.
426  *
427  *    If COLn, BFCn, and TEXn are transform-feedback-only, they are moved to
428  *    VARn. PRIMITIVE_ID in (GS, FS) and FOGC in (xx, FS) are always moved to
429  *    VARn for better packing.
430  *
431  *
432  * Issue: Interpolation converts Infs to NaNs
433  * ==========================================
434  *
435  * Interpolation converts Infs to NaNs, i.e. interp(Inf, i, j) = NaN, which
436  * impacts and limits backward inter-shader code motion, uniform expression
437  * propagation, and compaction.
438  *
439  * When we decide not to interpolate a varying, we need to convert Infs to
440  * NaNs manually. Infs can be converted to NaNs like this: x*0 + x
441  * (suggested by Ian Romanick, the multiplication must be "exact")
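 *
 * With the NIR builder, this conversion looks as follows (see
 * build_convert_inf_to_nan() later in this file):
 * ```
 *    nir_def *fma = nir_ffma_imm1(b, x, 0, x);          // x*0 + x
 *    nir_instr_as_alu(fma->parent_instr)->exact = true;
 * ```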
442  *
443  * Changes to optimizations:
444  * - When we propagate a uniform expression and NaNs must be preserved,
445  *   convert Infs in the result to NaNs using "x*0 + x" in the consumer.
446  * - When we change interpolation to flat for convergent varyings and NaNs
447  *   must be preserved, apply "x*0 + x" to the stored output value
448  *   in the producer.
449  * - There is no solution for backward inter-shader code motion with
450  *   interpolation if Infs must be preserved. As an alternative, we can allow
451  *   code motion across interpolation only for specific shader hashes in
452  *   can_move_alu_across_interp. We can use shader-db to automatically produce
453  *   a list of shader hashes that benefit from this optimization.
454  *
455  *
456  * Usage
457  * =====
458  *
459  * Requirements:
460  * - ALUs should be scalarized
461  * - Dot products and other vector opcodes should be lowered (recommended)
462  * - Input loads and output stores should be scalarized
463  * - 64-bit varyings should be lowered to 32 bits
464  * - nir_vertex_divergence_analysis must be called on the producer if
465  *   the consumer is a fragment shader
466  *
467  * It's recommended to first run this on all shader pairs in order from the
468  * first shader to the last (to propagate constants etc.). If optimizing the
469  * (S1, S2) pair leads to changes in S1, remember the highest such S1. Then
470  * re-run this for all shader pairs in descending order from S1 down to VS.
471  *
472  * NIR optimizations should be performed after every run that changes the IR.
473  *
474  *
475  * Analyzing the optimization potential of linking separate shaders
476  * ================================================================
477  *
478  * We can use this pass in an analysis pass that decides whether a separate
479  * shader has the potential to benefit from full draw-time linking. The way
480  * it would work is that we would create a passthrough shader adjacent to
481  * the separate shader, run this pass on both shaders, and check if the number
482  * of varyings decreased. This way we can decide to perform the draw-time
483  * linking only if we are confident that it would help performance.
484  *
485  * TODO: not implemented, mention the pass that implements it
486  */
487 
488 #include "nir.h"
489 #include "nir_builder.h"
490 #include "util/u_math.h"
491 #include "util/u_memory.h"
492 
493 /* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
494  *
495  * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
496  * Slots (i % 8 == 1,3,5,7) are high bits of 16-bit channels. 32-bit channels
497  * don't set these slots as used in bitmasks.
498  */
499 #define NUM_SCALAR_SLOTS  (NUM_TOTAL_VARYING_SLOTS * 8)
500 
501 /* Fragment shader input slots can be packed with indirectly-indexed vec4
502  * slots if there are unused components, but only if the vec4 slot has
503  * the same interpolation type. There are only 3 types: FLAT, FP32, FP16.
504  */
505 enum fs_vec4_type {
506    FS_VEC4_TYPE_NONE = 0,
507    FS_VEC4_TYPE_FLAT,
508    FS_VEC4_TYPE_INTERP_FP32,
509    FS_VEC4_TYPE_INTERP_FP16,
510    FS_VEC4_TYPE_INTERP_COLOR,
511    FS_VEC4_TYPE_INTERP_EXPLICIT,
512    FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
513    FS_VEC4_TYPE_PER_PRIMITIVE,
514 };
515 
516 #if PRINT_RELOCATE_SLOT
517 static const char *fs_vec4_type_strings[] = {
518    "NONE",
519    "FLAT",
520    "INTERP_FP32",
521    "INTERP_FP16",
522    "INTERP_COLOR",
523    "INTERP_EXPLICIT",
524    "INTERP_EXPLICIT_STRICT",
525    "PER_PRIMITIVE",
526 };
527 #endif // PRINT_RELOCATE_SLOT
528 
529 static unsigned
530 get_scalar_16bit_slot(nir_io_semantics sem, unsigned component)
531 {
532    return sem.location * 8 + component * 2 + sem.high_16bits;
533 }
534 
535 static unsigned
536 intr_get_scalar_16bit_slot(nir_intrinsic_instr *intr)
537 {
538     return get_scalar_16bit_slot(nir_intrinsic_io_semantics(intr),
539                                  nir_intrinsic_component(intr));
540 }
541 
542 static unsigned
543 vec4_slot(unsigned scalar_slot)
544 {
545    return scalar_slot / 8;
546 }
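
/* Worked example (illustrative): a 16-bit load from VARYING_SLOT_VAR1,
 * component z (component index 2), high half (sem.high_16bits == 1) maps to
 *
 *    scalar slot = VARYING_SLOT_VAR1 * 8 + 2 * 2 + 1
 *
 * and vec4_slot() maps that scalar slot back to VARYING_SLOT_VAR1.
 */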
547 
548 struct list_node {
549    struct list_head head;
550    nir_intrinsic_instr *instr;
551 };
552 
553 /* Information about 1 scalar varying slot for both shader stages. */
554 struct scalar_slot {
555    struct {
556       /* Linked list of all store instructions writing into the scalar slot
557        * in the producer.
558        */
559       struct list_head stores;
560 
561       /* Only for TCS: Linked list of all load instructions reading the scalar
562        * slot in the producer.
563        */
564       struct list_head loads;
565 
566       /* If there is only one store instruction or if all store instructions
567        * store the same value in the producer, this is the instruction
568        * computing the stored value. Used by constant and uniform propagation
569        * to the next shader.
570        */
571       nir_instr *value;
572    } producer;
573 
574    struct {
575       /* Linked list of all load instructions loading from the scalar slot
576        * in the consumer.
577        */
578       struct list_head loads;
579 
580       /* The result of TES input interpolation. */
581       nir_alu_instr *tes_interp_load;
582       unsigned tes_interp_mode;  /* FLAG_INTERP_TES_* */
583       nir_def *tes_load_tess_coord;
584    } consumer;
585 
586    /* The number of accessed slots if this slot has indirect indexing. */
587    unsigned num_slots;
588 };
589 
590 struct linkage_info {
591    struct scalar_slot slot[NUM_SCALAR_SLOTS];
592 
593    bool spirv;
594    bool can_move_uniforms;
595    bool can_move_ubos;
596    bool can_mix_convergent_flat_with_interpolated;
597 
598    gl_shader_stage producer_stage;
599    gl_shader_stage consumer_stage;
600    nir_builder producer_builder;
601    nir_builder consumer_builder;
602    unsigned max_varying_expression_cost;
603 
604    /* Memory context for linear_alloc_child (fast allocation). */
605    void *linear_mem_ctx;
606 
607    /* If any component of a vec4 slot is accessed indirectly, this is its
608     * FS vec4 qualifier type, which is either FLAT, FP32, or FP16.
609     * Components with different qualifier types can't be compacted
610     * in the same vec4.
611     */
612    uint8_t fs_vec4_type[NUM_TOTAL_VARYING_SLOTS];
613 
614    /* Mask of all varyings that can be removed. Only a few non-VARn non-PATCHn
615     * varyings can't be removed.
616     */
617    BITSET_DECLARE(removable_mask, NUM_SCALAR_SLOTS);
618 
619    /* Mask of all slots that have transform feedback info. */
620    BITSET_DECLARE(xfb_mask, NUM_SCALAR_SLOTS);
621 
622    /* Mask of all slots that have transform feedback info, but are not used
623     * by the next shader. Separate masks for 32-bit and 16-bit outputs.
624     */
625    BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS);
626    BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS);
627 
628    /* Mask of all TCS->TES slots that are read by TCS, but not TES. */
629    BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS);
630    BITSET_DECLARE(no_varying16_mask, NUM_SCALAR_SLOTS);
631 
632    /* Mask of all slots accessed with indirect indexing. */
633    BITSET_DECLARE(indirect_mask, NUM_SCALAR_SLOTS);
634 
635    /* The following masks only contain slots that can be compacted and
636     * describe the groups in which they should be compacted. Non-fragment
637     * shaders only use the flat bitmasks.
638     *
639     * Some legacy varyings are excluded when they can't be compacted due to
640     * being affected by pipeline states (like coord replace). That only
641     * applies to xx->FS shader pairs. Other shader pairs get all legacy
642     * varyings compacted and relocated to VARn.
643     *
644     * Indirectly-indexed varyings are also excluded because they are not
645     * compacted.
646     */
647    BITSET_DECLARE(interp_fp32_mask, NUM_SCALAR_SLOTS);
648    BITSET_DECLARE(interp_fp16_mask, NUM_SCALAR_SLOTS);
649    BITSET_DECLARE(flat32_mask, NUM_SCALAR_SLOTS);
650    BITSET_DECLARE(flat16_mask, NUM_SCALAR_SLOTS);
651    BITSET_DECLARE(interp_explicit32_mask, NUM_SCALAR_SLOTS);
652    BITSET_DECLARE(interp_explicit16_mask, NUM_SCALAR_SLOTS);
653    BITSET_DECLARE(interp_explicit_strict32_mask, NUM_SCALAR_SLOTS);
654    BITSET_DECLARE(interp_explicit_strict16_mask, NUM_SCALAR_SLOTS);
655    BITSET_DECLARE(per_primitive32_mask, NUM_SCALAR_SLOTS);
656    BITSET_DECLARE(per_primitive16_mask, NUM_SCALAR_SLOTS);
657 
658    /* Color interpolation unqualified (follows the flat-shade state). */
659    BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS);
660 
661    /* Mask of output components that have only one store instruction, or if
662     * they have multiple store instructions, all those instructions store
663     * the same value. If the output has multiple vertices, all vertices store
664     * the same value. This is a useful property for:
665     * - constant and uniform propagation to the next shader
666     * - deduplicating outputs
667     */
668    BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
669 
670    /* Mask of output components that store values that are convergent,
671     * i.e. all values stored into the outputs are equal within a primitive.
672     *
673     * This is different from output_equal_mask, which says that all stores
674     * to the same slot in the same thread are equal, while this says that
675     * each store to the same slot can be different, but it always stores
676     * a convergent value, which means the stored value is equal among all
677     * threads within a primitive.
678     *
679     * The advantage is that these varyings can always be promoted to flat
680     * regardless of the original interpolation mode, and they can always be
681     * compacted with both interpolated and flat varyings.
682     */
683    BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
684    BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
685 };
686 
687 /******************************************************************
688  * HELPERS
689  ******************************************************************/
690 
691 /* Return whether the low or high 16-bit half of the 32-bit slot containing b is set. */
692 #define BITSET_TEST32(m, b) \
693    (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1))
694 
695 static void
696 print_linkage(struct linkage_info *linkage)
697 {
698    printf("Linkage: %s -> %s\n",
699           _mesa_shader_stage_to_abbrev(linkage->producer_stage),
700           _mesa_shader_stage_to_abbrev(linkage->consumer_stage));
701 
702    for (unsigned i = 0; i < NUM_SCALAR_SLOTS; i++) {
703       struct scalar_slot *slot = &linkage->slot[i];
704 
705       if (!slot->num_slots &&
706           list_is_empty(&slot->producer.stores) &&
707           list_is_empty(&slot->producer.loads) &&
708           list_is_empty(&slot->consumer.loads) &&
709           !BITSET_TEST(linkage->removable_mask, i) &&
710           !BITSET_TEST(linkage->indirect_mask, i) &&
711           !BITSET_TEST(linkage->xfb32_only_mask, i) &&
712           !BITSET_TEST(linkage->xfb16_only_mask, i) &&
713           !BITSET_TEST(linkage->no_varying32_mask, i) &&
714           !BITSET_TEST(linkage->no_varying16_mask, i) &&
715           !BITSET_TEST(linkage->interp_fp32_mask, i) &&
716           !BITSET_TEST(linkage->interp_fp16_mask, i) &&
717           !BITSET_TEST(linkage->flat32_mask, i) &&
718           !BITSET_TEST(linkage->flat16_mask, i) &&
719           !BITSET_TEST(linkage->interp_explicit32_mask, i) &&
720           !BITSET_TEST(linkage->interp_explicit16_mask, i) &&
721           !BITSET_TEST(linkage->interp_explicit_strict32_mask, i) &&
722           !BITSET_TEST(linkage->interp_explicit_strict16_mask, i) &&
723           !BITSET_TEST(linkage->per_primitive32_mask, i) &&
724           !BITSET_TEST(linkage->per_primitive16_mask, i) &&
725           !BITSET_TEST(linkage->convergent32_mask, i) &&
726           !BITSET_TEST(linkage->convergent16_mask, i) &&
727           !BITSET_TEST(linkage->output_equal_mask, i))
728          continue;
729 
730       printf("  %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
731              gl_varying_slot_name_for_stage(vec4_slot(i),
732                                             linkage->producer_stage) + 13,
733              "xyzw"[(i / 2) % 4],
734              i % 2 ? "hi" : "lo",
735              slot->num_slots,
736              BITSET_TEST(linkage->removable_mask, i) ? " removable" : "",
737              BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "",
738              BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "",
739              BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "",
740              BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "",
741              BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "",
742              BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "",
743              BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "",
744              BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "",
745              BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "",
746              BITSET_TEST(linkage->interp_explicit32_mask, i) ? " interp_explicit32" : "",
747              BITSET_TEST(linkage->interp_explicit16_mask, i) ? " interp_explicit16" : "",
748              BITSET_TEST(linkage->interp_explicit_strict32_mask, i) ? " interp_explicit_strict32" : "",
749              BITSET_TEST(linkage->interp_explicit_strict16_mask, i) ? " interp_explicit_strict16" : "",
750              BITSET_TEST(linkage->per_primitive32_mask, i) ? " per_primitive32" : "",
751              BITSET_TEST(linkage->per_primitive16_mask, i) ? " per_primitive16" : "",
752              BITSET_TEST(linkage->convergent32_mask, i) ? " convergent32" : "",
753              BITSET_TEST(linkage->convergent16_mask, i) ? " convergent16" : "",
754              BITSET_TEST(linkage->output_equal_mask, i) ? " output_equal" : "",
755              !list_is_empty(&slot->producer.stores) ? " producer_stores" : "",
756              !list_is_empty(&slot->producer.loads) ? " producer_loads" : "",
757              !list_is_empty(&slot->consumer.loads) ? " consumer_loads" : "");
758    }
759 }
760 
761 static void
762 slot_disable_optimizations_and_compaction(struct linkage_info *linkage,
763                                           unsigned i)
764 {
765    BITSET_CLEAR(linkage->output_equal_mask, i);
766    BITSET_CLEAR(linkage->convergent32_mask, i);
767    BITSET_CLEAR(linkage->convergent16_mask, i);
768    BITSET_CLEAR(linkage->interp_fp32_mask, i);
769    BITSET_CLEAR(linkage->interp_fp16_mask, i);
770    BITSET_CLEAR(linkage->flat32_mask, i);
771    BITSET_CLEAR(linkage->flat16_mask, i);
772    BITSET_CLEAR(linkage->interp_explicit32_mask, i);
773    BITSET_CLEAR(linkage->interp_explicit16_mask, i);
774    BITSET_CLEAR(linkage->interp_explicit_strict32_mask, i);
775    BITSET_CLEAR(linkage->interp_explicit_strict16_mask, i);
776    BITSET_CLEAR(linkage->per_primitive32_mask, i);
777    BITSET_CLEAR(linkage->per_primitive16_mask, i);
778    BITSET_CLEAR(linkage->no_varying32_mask, i);
779    BITSET_CLEAR(linkage->no_varying16_mask, i);
780    BITSET_CLEAR(linkage->color32_mask, i);
781 }
782 
783 static void
784 clear_slot_info_after_removal(struct linkage_info *linkage, unsigned i, bool uses_xfb)
785 {
786    slot_disable_optimizations_and_compaction(linkage, i);
787 
788    if (uses_xfb)
789       return;
790 
791    linkage->slot[i].num_slots = 0;
792 
793    BITSET_CLEAR(linkage->indirect_mask, i);
794    BITSET_CLEAR(linkage->removable_mask, i);
795 
796    /* Transform feedback stores can't be removed. */
797    assert(!BITSET_TEST(linkage->xfb32_only_mask, i));
798    assert(!BITSET_TEST(linkage->xfb16_only_mask, i));
799 }
800 
801 static bool
802 has_xfb(nir_intrinsic_instr *intr)
803 {
804    /* This only says whether the intrinsic is ABLE to have xfb info. */
805    if (!nir_intrinsic_has_io_xfb(intr))
806       return false;
807 
808    unsigned comp = nir_intrinsic_component(intr);
809 
810    if (comp >= 2)
811       return nir_intrinsic_io_xfb2(intr).out[comp - 2].num_components > 0;
812    else
813       return nir_intrinsic_io_xfb(intr).out[comp].num_components > 0;
814 }
815 
816 static bool
817 is_interpolated_color(struct linkage_info *linkage, unsigned i)
818 {
819    if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
820       return false;
821 
822    /* BFCn stores are bunched in the COLn slots with COLn, so we should never
823     * get BFCn here.
824     */
825    assert(vec4_slot(i) != VARYING_SLOT_BFC0 &&
826           vec4_slot(i) != VARYING_SLOT_BFC1);
827 
828    return vec4_slot(i) == VARYING_SLOT_COL0 ||
829           vec4_slot(i) == VARYING_SLOT_COL1;
830 }
831 
832 static bool
833 is_interpolated_texcoord(struct linkage_info *linkage, unsigned i)
834 {
835    if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
836       return false;
837 
838    return vec4_slot(i) >= VARYING_SLOT_TEX0 &&
839           vec4_slot(i) <= VARYING_SLOT_TEX7;
840 }
841 
842 static bool
843 color_uses_shade_model(struct linkage_info *linkage, unsigned i)
844 {
845    if (!is_interpolated_color(linkage, i))
846       return false;
847 
848    list_for_each_entry(struct list_node, iter,
849                        &linkage->slot[i].consumer.loads, head) {
850       assert(iter->instr->intrinsic == nir_intrinsic_load_interpolated_input);
851 
852       nir_intrinsic_instr *baryc =
853          nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
854       if (nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE)
855          return true;
856    }
857 
858    return false;
859 }
860 
861 static bool
862 preserve_infs_nans(nir_shader *nir, unsigned bit_size)
863 {
864    unsigned mode = nir->info.float_controls_execution_mode;
865 
866    return nir_is_float_control_inf_preserve(mode, bit_size) ||
867           nir_is_float_control_nan_preserve(mode, bit_size);
868 }
869 
870 static bool
871 preserve_nans(nir_shader *nir, unsigned bit_size)
872 {
873    unsigned mode = nir->info.float_controls_execution_mode;
874 
875    return nir_is_float_control_nan_preserve(mode, bit_size);
876 }
877 
878 static nir_def *
879 build_convert_inf_to_nan(nir_builder *b, nir_def *x)
880 {
881    /* Do x*0 + x. The multiplication by 0 can't be optimized out. */
882    nir_def *fma = nir_ffma_imm1(b, x, 0, x);
883    nir_instr_as_alu(fma->parent_instr)->exact = true;
884    return fma;
885 }
886 
887 /******************************************************************
888  * GATHERING INPUTS & OUTPUTS
889  ******************************************************************/
890 
891 static bool
892 is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
893                         nir_intrinsic_instr *intr)
894 {
895    return nir_slot_is_sysval_output(vec4_slot(slot),
896                                     linkage->consumer_stage) &&
897           !nir_intrinsic_io_semantics(intr).no_sysval_output;
898 }
899 
900 /**
901  * This function acts like a filter. The pass won't touch varyings that
902  * return false here, and the return value is saved in the linkage bitmasks,
903  * so that all subpasses will *automatically* skip such varyings.
904  */
905 static bool
906 can_remove_varying(struct linkage_info *linkage, gl_varying_slot location)
907 {
908    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
909       /* User-defined varyings and fog coordinates can always be removed. */
910       if (location >= VARYING_SLOT_VAR0 ||
911           location == VARYING_SLOT_FOGC)
912          return true;
913 
914       /* Workaround for mesh shader multiview in RADV.
915        * A layer output is inserted by ac_nir_lower_ngg which is called later.
916        * Prevent removing the layer input from FS when producer is MS.
917        */
918       if (linkage->producer_stage == MESA_SHADER_MESH &&
919           location == VARYING_SLOT_LAYER)
920          return false;
921 
922       /* These can be removed as varyings, which means they will be demoted to
923        * sysval-only outputs keeping their culling/rasterization functions
924        * while not passing the values to FS. Drivers should handle
925        * the "no_varying" semantic to benefit from this.
926        *
927        * Note: When removing unset LAYER and VIEWPORT FS inputs, they will
928        *       be replaced by 0 instead of undef.
929        */
930       if (location == VARYING_SLOT_CLIP_DIST0 ||
931           location == VARYING_SLOT_CLIP_DIST1 ||
932           location == VARYING_SLOT_CULL_DIST0 ||
933           location == VARYING_SLOT_CULL_DIST1 ||
934           location == VARYING_SLOT_LAYER ||
935           location == VARYING_SLOT_VIEWPORT)
936          return true;
937 
938       /* COLn inputs can be removed only if both COLn and BFCn are not
939        * written. Both COLn and BFCn outputs can be removed if COLn inputs
940        * aren't read.
941        *
942        * TEXn inputs can never be removed in FS because of the coord replace
943        * state, but TEXn outputs can be removed if they are not read by FS.
944        */
945       if (location == VARYING_SLOT_COL0 ||
946           location == VARYING_SLOT_COL1 ||
947           location == VARYING_SLOT_BFC0 ||
948           location == VARYING_SLOT_BFC1 ||
949           (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7))
950          return true;
951 
952       /* "GS -> FS" can remove the primitive ID if not written or not read. */
953       if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
954            linkage->producer_stage == MESA_SHADER_MESH) &&
955           location == VARYING_SLOT_PRIMITIVE_ID)
956          return true;
957 
958       /* No other varyings can be removed. */
959       return false;
960    } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
961       /* Only VS->TES shouldn't remove TESS_LEVEL_* inputs because the values
962        * come from glPatchParameterfv.
963        *
964        * For TCS->TES, TESS_LEVEL_* outputs can be removed as varyings, which
965        * means they will be demoted to sysval-only outputs, so that drivers
966        * know that TES doesn't read them.
967        */
968       if (linkage->producer_stage == MESA_SHADER_VERTEX &&
969           (location == VARYING_SLOT_TESS_LEVEL_INNER ||
970            location == VARYING_SLOT_TESS_LEVEL_OUTER))
971          return false;
972 
973       return true;
974    }
975 
976    /* All other varyings can be removed. */
977    return true;
978 }
979 
980 struct opt_options {
981    bool propagate_uniform_expr:1;
982    bool deduplicate:1;
983    bool inter_shader_code_motion:1;
984    bool compact:1;
985    bool disable_all:1;
986 };
987 
988 /**
989  * Return which optimizations are allowed.
990  */
991 static struct opt_options
992 can_optimize_varying(struct linkage_info *linkage, gl_varying_slot location)
993 {
994    struct opt_options options_var = {
995       .propagate_uniform_expr = true,
996       .deduplicate = true,
997       .inter_shader_code_motion = true,
998       .compact = true,
999    };
1000    struct opt_options options_color = {
1001       .propagate_uniform_expr = true, /* only constants in [0, 1] */
1002       .deduplicate = true,
1003       .compact = true,
1004    };
1005    struct opt_options options_tex = {
1006       .propagate_uniform_expr = true, /* only TEX.zw if equal to (0, 1) */
1007    };
1008    struct opt_options options_sysval_output = {
1009       .propagate_uniform_expr = true,
1010       .deduplicate = true,
1011    };
1012    struct opt_options options_tess_levels = {
1013       .propagate_uniform_expr = true,
1014       .deduplicate = true,
1015    };
1016    struct opt_options options_disable_all = {
1017       .disable_all = true,
1018    };
1019 
1020    assert(can_remove_varying(linkage, location));
1021 
1022    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1023       /* xx -> FS */
1024       /* User-defined varyings and fog coordinates can always be optimized. */
1025       if (location >= VARYING_SLOT_VAR0 ||
1026           location == VARYING_SLOT_FOGC)
1027          return options_var;
1028 
1029       /* The primitive ID can always be optimized in GS -> FS and MS -> FS. */
1030       if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
1031            linkage->producer_stage == MESA_SHADER_MESH) &&
1032           location == VARYING_SLOT_PRIMITIVE_ID)
1033          return options_var;
1034 
1035       /* Colors can only do constant propagation if COLn and BFCn store the
1036        * same constant and the constant is between 0 and 1 (because clamp
1037        * vertex color state is unknown). Uniform propagation isn't possible
1038        * because of the clamping.
1039        *
1040        * Color components can only be deduplicated and compacted among
1041        * themselves if they have the same interpolation qualifier, and can't
1042        * be mixed with other varyings.
1043        */
1044       if (location == VARYING_SLOT_COL0 ||
1045           location == VARYING_SLOT_COL1 ||
1046           location == VARYING_SLOT_BFC0 ||
1047           location == VARYING_SLOT_BFC1)
1048          return options_color;
1049 
1050       /* TEXn.zw can only be constant-propagated if the value is (0, 1)
1051        * because it matches the coord replace values.
1052        */
1053       if (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)
1054          return options_tex;
1055 
1056       /* LAYER, VIEWPORT, CLIP_DISTn, and CULL_DISTn can only propagate
1057        * uniform expressions and be compacted (moved to VARn while keeping
1058        * the sysval outputs where they are).
1059        */
1060       if (location == VARYING_SLOT_LAYER ||
1061           location == VARYING_SLOT_VIEWPORT ||
1062           location == VARYING_SLOT_CLIP_DIST0 ||
1063           location == VARYING_SLOT_CLIP_DIST1 ||
1064           location == VARYING_SLOT_CULL_DIST0 ||
1065           location == VARYING_SLOT_CULL_DIST1)
1066          return options_sysval_output;
1067 
1068       /* Everything else can't be read by the consumer, such as POS, PSIZ,
1069        * CLIP_VERTEX, EDGE, PRIMITIVE_SHADING_RATE, etc.
1070        */
1071       return options_disable_all;
1072    }
1073 
1074    if (linkage->producer_stage == MESA_SHADER_TESS_CTRL) {
1075       /* TESS_LEVEL_* can only propagate uniform expressions.
1076        * Compaction is disabled because AMD doesn't want the varying to be
1077        * moved to PATCHn while keeping the sysval output where it is.
1078        */
1079       if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1080           location == VARYING_SLOT_TESS_LEVEL_OUTER)
1081          return options_tess_levels;
1082    }
1083 
1084    /* All other shader pairs, which are (VS, TCS), (TCS, TES), (VS, TES),
1085     * (TES, GS), and (VS, GS) can compact and optimize all varyings.
1086     */
1087    return options_var;
1088 }
1089 
1090 static bool
1091 gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1092 {
1093    struct linkage_info *linkage = (struct linkage_info *)cb_data;
1094 
1095    if (intr->intrinsic != nir_intrinsic_load_input &&
1096        intr->intrinsic != nir_intrinsic_load_per_vertex_input &&
1097        intr->intrinsic != nir_intrinsic_load_per_primitive_input &&
1098        intr->intrinsic != nir_intrinsic_load_interpolated_input &&
1099        intr->intrinsic != nir_intrinsic_load_input_vertex)
1100       return false;
1101 
1102    /* nir_lower_io_to_scalar is required before this */
1103    assert(intr->def.num_components == 1);
1104    /* Non-zero constant offsets should have been folded by
1105     * nir_io_add_const_offset_to_base.
1106     */
1107    nir_src offset = *nir_get_io_offset_src(intr);
1108    assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1109 
1110    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1111 
1112    if (!can_remove_varying(linkage, sem.location))
1113       return false;
1114 
1115    /* Insert the load into the list of loads for this scalar slot. */
1116    unsigned slot = intr_get_scalar_16bit_slot(intr);
1117    struct scalar_slot *in = &linkage->slot[slot];
1118    struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1119                                                sizeof(struct list_node));
1120    node->instr = intr;
1121    list_addtail(&node->head, &in->consumer.loads);
1122    in->num_slots = MAX2(in->num_slots, sem.num_slots);
1123 
1124    BITSET_SET(linkage->removable_mask, slot);
1125 
1126    enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
1127 
1128    /* Determine the type of the input for compaction. Other inputs
1129     * can be compacted with indirectly-indexed vec4 slots if they
1130     * have unused components, but only if they are of the same type.
1131     */
1132    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1133       switch (intr->intrinsic) {
1134       case nir_intrinsic_load_input:
1135          fs_vec4_type = FS_VEC4_TYPE_FLAT;
1136          break;
1137       case nir_intrinsic_load_per_primitive_input:
1138          fs_vec4_type = FS_VEC4_TYPE_PER_PRIMITIVE;
1139          break;
1140       case nir_intrinsic_load_input_vertex:
1141          if (sem.interp_explicit_strict)
1142             fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT;
1143          else
1144             fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT;
1145          break;
1146       case nir_intrinsic_load_interpolated_input:
1147          if (color_uses_shade_model(linkage, slot))
1148             fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR;
1149          else if (intr->def.bit_size == 32)
1150             fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32;
1151          else if (intr->def.bit_size == 16)
1152             fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16;
1153          else
1154             unreachable("invalid load_interpolated_input type");
1155          break;
1156       default:
1157          unreachable("unexpected input load intrinsic");
1158       }
1159 
1160       linkage->fs_vec4_type[sem.location] = fs_vec4_type;
1161    }
1162 
1163    /* Indirect indexing. */
1164    if (!nir_src_is_const(offset)) {
1165       /* Only the indirectly-indexed component is marked as indirect. */
1166       for (unsigned i = 0; i < sem.num_slots; i++)
1167          BITSET_SET(linkage->indirect_mask, slot + i * 8);
1168 
1169       /* Set the same vec4 type as the first element in all slots. */
1170       if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1171          for (unsigned i = 1; i < sem.num_slots; i++)
1172             linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1173       }
1174       return false;
1175    }
1176 
1177    if (!can_optimize_varying(linkage, sem.location).compact)
1178       return false;
1179 
1180    /* Record inputs that can be compacted. */
1181    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1182       switch (intr->intrinsic) {
1183       case nir_intrinsic_load_input:
1184          if (intr->def.bit_size == 32)
1185             BITSET_SET(linkage->flat32_mask, slot);
1186          else if (intr->def.bit_size == 16)
1187             BITSET_SET(linkage->flat16_mask, slot);
1188          else
1189             unreachable("invalid load_input type");
1190          break;
1191       case nir_intrinsic_load_per_primitive_input:
1192          if (intr->def.bit_size == 32)
1193             BITSET_SET(linkage->per_primitive32_mask, slot);
1194          else if (intr->def.bit_size == 16)
1195             BITSET_SET(linkage->per_primitive16_mask, slot);
1196          else
1197             unreachable("invalid load_per_primitive_input type");
1198          break;
1199       case nir_intrinsic_load_input_vertex:
1200          if (sem.interp_explicit_strict) {
1201             if (intr->def.bit_size == 32)
1202                BITSET_SET(linkage->interp_explicit_strict32_mask, slot);
1203             else if (intr->def.bit_size == 16)
1204                BITSET_SET(linkage->interp_explicit_strict16_mask, slot);
1205             else
1206                unreachable("invalid load_input_vertex type");
1207          } else {
1208             if (intr->def.bit_size == 32)
1209                BITSET_SET(linkage->interp_explicit32_mask, slot);
1210             else if (intr->def.bit_size == 16)
1211                BITSET_SET(linkage->interp_explicit16_mask, slot);
1212             else
1213                unreachable("invalid load_input_vertex type");
1214          }
1215          break;
1216       case nir_intrinsic_load_interpolated_input:
1217          if (color_uses_shade_model(linkage, slot))
1218             BITSET_SET(linkage->color32_mask, slot);
1219          else if (intr->def.bit_size == 32)
1220             BITSET_SET(linkage->interp_fp32_mask, slot);
1221          else if (intr->def.bit_size == 16)
1222             BITSET_SET(linkage->interp_fp16_mask, slot);
1223          else
1224             unreachable("invalid load_interpolated_input type");
1225          break;
1226       default:
1227          unreachable("unexpected input load intrinsic");
1228       }
1229    } else {
1230       if (intr->def.bit_size == 32)
1231          BITSET_SET(linkage->flat32_mask, slot);
1232       else if (intr->def.bit_size == 16)
1233          BITSET_SET(linkage->flat16_mask, slot);
1234       else
1235          unreachable("invalid load_input type");
1236    }
1237    return false;
1238 }
1239 
1240 static bool
1241 gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1242 {
1243    struct linkage_info *linkage = (struct linkage_info *)cb_data;
1244 
1245    if (intr->intrinsic != nir_intrinsic_store_output &&
1246        intr->intrinsic != nir_intrinsic_load_output &&
1247        intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
1248        intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
1249        intr->intrinsic != nir_intrinsic_load_per_vertex_output &&
1250        intr->intrinsic != nir_intrinsic_load_per_primitive_output)
1251       return false;
1252 
1253    bool is_store =
1254       intr->intrinsic == nir_intrinsic_store_output ||
1255       intr->intrinsic == nir_intrinsic_store_per_vertex_output ||
1256       intr->intrinsic == nir_intrinsic_store_per_primitive_output;
1257 
1258    if (is_store) {
1259       /* nir_lower_io_to_scalar is required before this */
1260       assert(intr->src[0].ssa->num_components == 1);
1261       /* nir_opt_undef is required before this. */
1262       assert(intr->src[0].ssa->parent_instr->type !=
1263             nir_instr_type_undef);
1264    } else {
1265       /* nir_lower_io_to_scalar is required before this */
1266       assert(intr->def.num_components == 1);
1267       /* Output loads are only allowed in TCS. */
1268       assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1269    }
1270 
1271    /* Non-zero constant offsets should have been folded by
1272     * nir_io_add_const_offset_to_base.
1273     */
1274    nir_src offset = *nir_get_io_offset_src(intr);
1275    assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1276 
1277    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1278 
1279    if (!can_remove_varying(linkage, sem.location))
1280       return false;
1281 
1282    /* For "xx -> FS", treat BFCn stores as COLn to make dead varying
1283     * elimination do the right thing automatically. The rules are:
1284     * - COLn inputs can be removed only if both COLn and BFCn are not
1285     *   written.
1286     * - Both COLn and BFCn outputs can be removed if COLn inputs
1287     *   aren't read.
1288     */
1289    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1290       if (sem.location == VARYING_SLOT_BFC0)
1291          sem.location = VARYING_SLOT_COL0;
1292       else if (sem.location == VARYING_SLOT_BFC1)
1293          sem.location = VARYING_SLOT_COL1;
1294    }
1295 
1296    /* Insert the instruction into the list of stores or loads for this
1297     * scalar slot.
1298     */
1299    unsigned slot =
1300       get_scalar_16bit_slot(sem, nir_intrinsic_component(intr));
1301 
1302    struct scalar_slot *out = &linkage->slot[slot];
1303    struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1304                                                sizeof(struct list_node));
1305    node->instr = intr;
1306    out->num_slots = MAX2(out->num_slots, sem.num_slots);
1307 
1308    if (is_store) {
1309       list_addtail(&node->head, &out->producer.stores);
1310 
1311       if (has_xfb(intr)) {
1312          BITSET_SET(linkage->xfb_mask, slot);
1313 
1314          if (sem.no_varying &&
1315              !is_active_sysval_output(linkage, slot, intr)) {
1316             if (intr->src[0].ssa->bit_size == 32)
1317                BITSET_SET(linkage->xfb32_only_mask, slot);
1318             else if (intr->src[0].ssa->bit_size == 16)
1319                BITSET_SET(linkage->xfb16_only_mask, slot);
1320             else
1321                unreachable("invalid store_output type");
1322          }
1323       }
1324    } else {
1325       list_addtail(&node->head, &out->producer.loads);
1326    }
1327 
1328    BITSET_SET(linkage->removable_mask, slot);
1329 
1330    /* Indirect indexing. */
1331    if (!nir_src_is_const(offset)) {
1332       /* Only the indirectly-indexed component is marked as indirect. */
1333       for (unsigned i = 0; i < sem.num_slots; i++)
1334          BITSET_SET(linkage->indirect_mask, slot + i * 8);
1335 
1336       /* Set the same vec4 type as the first element in all slots. */
1337       if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1338          enum fs_vec4_type fs_vec4_type =
1339             linkage->fs_vec4_type[sem.location];
1340 
1341          for (unsigned i = 1; i < sem.num_slots; i++)
1342             linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1343       }
1344       return false;
1345    }
1346 
1347    if (can_optimize_varying(linkage, sem.location).disable_all)
1348       return false;
1349 
1350    if (is_store) {
1351       nir_def *value = intr->src[0].ssa;
1352 
1353       const bool constant = value->parent_instr->type == nir_instr_type_load_const;
1354 
1355       /* If the store instruction is executed in a divergent block, the value
1356        * that's stored in the output becomes divergent.
1357        *
1358        * Mesh shaders get special treatment because we can't follow their topology,
1359        * so we only propagate constants.
1360        * TODO: revisit this when workgroup divergence analysis is merged.
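            *
            * For example, an output that is only written inside a non-uniform
            * branch (a divergent block) is treated as divergent here even if
            * the stored value itself is convergent.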
1361        */
1362       const bool divergent = value->divergent ||
1363                              intr->instr.block->divergent ||
1364                              (!constant && linkage->producer_stage == MESA_SHADER_MESH);
1365 
1366       if (!out->producer.value) {
1367          /* This is the first store to this output. */
1368          BITSET_SET(linkage->output_equal_mask, slot);
1369          out->producer.value = value->parent_instr;
1370 
1371          /* Set whether the value is convergent. Such varyings can be
1372           * promoted to flat regardless of their original interpolation
1373           * mode.
1374           */
1375          if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && !divergent) {
1376             if (value->bit_size == 32)
1377                BITSET_SET(linkage->convergent32_mask, slot);
1378             else if (value->bit_size == 16)
1379                BITSET_SET(linkage->convergent16_mask, slot);
1380             else
1381                unreachable("invalid store_output type");
1382          }
1383       } else {
1384          /* There are multiple stores to the same output. If they store
1385           * different values, clear the mask.
1386           */
1387          if (out->producer.value != value->parent_instr)
1388             BITSET_CLEAR(linkage->output_equal_mask, slot);
1389 
1390          /* Update divergence information. */
1391          if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && divergent) {
1392             if (value->bit_size == 32)
1393                BITSET_CLEAR(linkage->convergent32_mask, slot);
1394             else if (value->bit_size == 16)
1395                BITSET_CLEAR(linkage->convergent16_mask, slot);
1396             else
1397                unreachable("invalid store_output type");
1398          }
1399       }
1400    } else {
1401       /* Only TCS output loads can get here.
1402        *
1403        * We need to record output loads as flat32 or flat16, otherwise
1404        * compaction will think that the slot is free and will put some
1405        * other output in its place.
1406        */
1407       assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1408 
1409       if (!can_optimize_varying(linkage, sem.location).compact)
1410          return false;
1411 
1412       if (intr->def.bit_size == 32)
1413          BITSET_SET(linkage->flat32_mask, slot);
1414       else if (intr->def.bit_size == 16)
1415          BITSET_SET(linkage->flat16_mask, slot);
1416       else
1417          unreachable("invalid load_output type");
1418    }
1419    return false;
1420 }
1421 
1422 /******************************************************************
1423  * TIDYING UP INDIRECT VARYINGS (BEFORE DEAD VARYINGS REMOVAL)
1424  ******************************************************************/
1425 
1426 static void
1427 tidy_up_indirect_varyings(struct linkage_info *linkage)
1428 {
1429    unsigned i;
1430 
1431    /* Indirectly-indexed slots can also be accessed directly and thus set
1432     * various optimization bitmasks, so clear those bitmasks to make sure
1433     * such slots are neither optimized nor compacted.
1434     */
1435    BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1436       slot_disable_optimizations_and_compaction(linkage, i);
1437    }
1438 
1439    /* If some slots have both direct and indirect accesses, move instructions
1440     * of such slots to the slot representing the first array element, so that
1441     * we can remove all loads/stores of dead indirectly-indexed varyings
1442     * by only looking at the first element.
1443     */
1444    BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1445       struct scalar_slot *first = &linkage->slot[i];
1446 
1447       /* Skip if this is not the first array element. The first element
1448        * always sets num_slots to at least 2.
1449        */
1450       if (first->num_slots <= 1)
1451          continue;
1452 
1453       /* Move instructions from other elements of the indirectly-accessed
1454        * array to the first element (by merging the linked lists).
1455        */
1456       for (unsigned elem = 1; elem < first->num_slots; elem++) {
1457          /* The component slots are at 16-bit granularity, so we need to
1458           * increment by 8 to get the same component in the next vec4 slot.
1459           */
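               *
               * Sketch of the scalar slot numbering assumed here (it matches
               * get_scalar_16bit_slot()): each vec4 location covers 8 16-bit
               * scalar slots, i.e.
               *
               *    scalar_slot = location * 8 + component * 2 + high_half
               *
               * so adding 8 selects the same component and half in the next
               * location, i.e. the next element of the indirectly-indexed
               * array.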
1460          struct scalar_slot *other = &linkage->slot[i + elem * 8];
1461 
1462          list_splicetail(&other->producer.stores, &first->producer.stores);
1463          list_splicetail(&other->producer.loads, &first->producer.loads);
1464          list_splicetail(&other->consumer.loads, &first->consumer.loads);
1465          list_inithead(&other->producer.stores);
1466          list_inithead(&other->producer.loads);
1467          list_inithead(&other->consumer.loads);
1468       }
1469    }
1470 }
1471 
1472 /******************************************************************
1473  * TIDYING UP CONVERGENT VARYINGS
1474  ******************************************************************/
1475 
1476 /**
1477  * Reorganize the FS bitmasks: as gathered, the interpolated and flat masks
1478  * can intersect the convergent masks. Make them disjoint so that the masks
1479  * of interpolated, flat, and convergent varyings never overlap.
1480  */
1481 static void
1482 tidy_up_convergent_varyings(struct linkage_info *linkage)
1483 {
1484    if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
1485       return;
1486 
1487    unsigned i;
1488    /* Whether to promote convergent interpolated slots to flat if it
1489     * doesn't lead to worse compaction.
1490     */
1491    bool optimize_convergent_slots = true; /* only turn off for debugging */
1492 
1493    if (optimize_convergent_slots) {
1494       /* If a slot is flat and convergent and the driver can't load as flat
1495        * from interpolated vec4 slots, keep the flat bit and remove
1496        * the convergent bit. If the driver can load as flat from interpolated
1497        * vec4 slots, keep the convergent bit.
1498        *
1499        * If a slot is interpolated and convergent, remove the interpolated
1500        * bit and keep the convergent bit, which means that it's interpolated,
1501        * but can be promoted to flat.
1502        *
1503        * Since the geometry shader is the only shader that can store values
1504        * in multiple vertices before FS, it's required that all stores are
1505        * equal to be considered convergent (output_equal_mask), otherwise
1506        * the promotion to flat would be incorrect.
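            *
            * A sketch of the decisions made below for each convergent slot:
            *
            *    not loaded as interp/flat/color in FS      -> drop convergent
            *    flat while mixing with interp is impossible,
            *    or GS producer with non-identical stores   -> keep original
            *    otherwise                                   -> keep convergent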
1507        */
1508       BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) {
1509          if (!BITSET_TEST(linkage->interp_fp32_mask, i) &&
1510              !BITSET_TEST(linkage->flat32_mask, i) &&
1511              !BITSET_TEST(linkage->color32_mask, i)) {
1512             /* Clear the flag - not used by FS. */
1513             BITSET_CLEAR(linkage->convergent32_mask, i);
1514          } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1515                      BITSET_TEST(linkage->flat32_mask, i)) ||
1516                     (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1517                      !BITSET_TEST(linkage->output_equal_mask, i))) {
1518             /* Keep the original qualifier. */
1519             BITSET_CLEAR(linkage->convergent32_mask, i);
1520          } else {
1521             /* Keep it convergent. */
1522             BITSET_CLEAR(linkage->interp_fp32_mask, i);
1523             BITSET_CLEAR(linkage->color32_mask, i);
1524             BITSET_CLEAR(linkage->flat32_mask, i);
1525          }
1526       }
1527       BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) {
1528          if (!BITSET_TEST(linkage->interp_fp16_mask, i) &&
1529              !BITSET_TEST(linkage->flat16_mask, i)) {
1530             /* Clear the flag - not used by FS. */
1531             BITSET_CLEAR(linkage->convergent16_mask, i);
1532          } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1533                      BITSET_TEST(linkage->flat16_mask, i)) ||
1534                     (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1535                      !BITSET_TEST(linkage->output_equal_mask, i))) {
1536             /* Keep the original qualifier. */
1537             BITSET_CLEAR(linkage->convergent16_mask, i);
1538          } else {
1539             /* Keep it convergent. */
1540             BITSET_CLEAR(linkage->interp_fp16_mask, i);
1541             BITSET_CLEAR(linkage->flat16_mask, i);
1542          }
1543       }
1544    } else {
1545       /* Don't do anything with convergent slots. */
1546       BITSET_ZERO(linkage->convergent32_mask);
1547       BITSET_ZERO(linkage->convergent16_mask);
1548    }
1549 }
1550 
1551 /******************************************************************
1552  * DETERMINING UNIFORM AND UBO MOVABILITY BASED ON DRIVER LIMITS
1553  ******************************************************************/
1554 
1555 static bool
1556 is_variable_present(nir_shader *nir, nir_variable *var,
1557                     nir_variable_mode mode, bool spirv)
1558 {
1559    nir_foreach_variable_with_modes(it, nir, mode) {
1560       if ((spirv && it->data.binding == var->data.binding) ||
1561           (!spirv && !strcmp(it->name, var->name)))
1562          return true;
1563    }
1564    return false;
1565 }
1566 
1567 /* TODO: this should be a helper in common code */
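     /* Example (illustrative): "uniform mat4 m[2]" has an array-of-arrays
      * size of 2 and 4 matrix columns, i.e. 8 vec4 slots, which is returned
      * as 32 scalar components; dual-slot (64-bit) types double the vec4
      * count first.
      */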
1568 static unsigned
1569 get_uniform_components(const struct glsl_type *type)
1570 {
1571    unsigned size = glsl_get_aoa_size(type);
1572    size = MAX2(size, 1);
1573    size *= glsl_get_matrix_columns(glsl_without_array(type));
1574 
1575    if (glsl_type_is_dual_slot(glsl_without_array(type)))
1576       size *= 2;
1577 
1578    /* Convert from vec4 to scalar. */
1579    return size * 4;
1580 }
1581 
1582 static unsigned
1583 get_ubo_slots(const nir_variable *var)
1584 {
1585    if (glsl_type_is_interface(glsl_without_array(var->type))) {
1586       unsigned slots = glsl_get_aoa_size(var->type);
1587       return MAX2(slots, 1);
1588    }
1589 
1590    return 1;
1591 }
1592 
1593 /**
1594  * Count uniforms and see if the combined uniform component count is over
1595  * the limit. If it is, don't move any uniforms. It's sufficient if drivers
1596  * declare a very high limit.
1597  */
1598 static void
1599 determine_uniform_movability(struct linkage_info *linkage,
1600                              unsigned max_uniform_components)
1601 {
1602    nir_shader *producer = linkage->producer_builder.shader;
1603    nir_shader *consumer = linkage->consumer_builder.shader;
1604    unsigned num_producer_uniforms = 0;
1605    unsigned num_consumer_uniforms = 0;
1606    unsigned num_shared_uniforms = 0;
1607 
1608    nir_foreach_variable_with_modes(var, producer, nir_var_uniform) {
1609       if (is_variable_present(consumer, var, nir_var_uniform, linkage->spirv))
1610          num_shared_uniforms += get_uniform_components(var->type);
1611       else
1612          num_producer_uniforms += get_uniform_components(var->type);
1613    }
1614 
1615    nir_foreach_variable_with_modes(var, consumer, nir_var_uniform) {
1616       if (!is_variable_present(producer, var, nir_var_uniform, linkage->spirv))
1617          num_consumer_uniforms += get_uniform_components(var->type);
1618    }
1619 
1620    linkage->can_move_uniforms =
1621       num_producer_uniforms + num_consumer_uniforms + num_shared_uniforms <=
1622       max_uniform_components;
1623 }
1624 
1625 /**
1626  * Count UBOs and see if the combined UBO count is over the limit. If it is,
1627  * don't move any UBOs. It's sufficient if drivers declare a very high limit.
1628  */
1629 static void
1630 determine_ubo_movability(struct linkage_info *linkage,
1631                          unsigned max_ubos_per_stage)
1632 {
1633    nir_shader *producer = linkage->producer_builder.shader;
1634    nir_shader *consumer = linkage->consumer_builder.shader;
1635    unsigned num_producer_ubos = 0;
1636    unsigned num_consumer_ubos = 0;
1637    unsigned num_shared_ubos = 0;
1638 
1639    nir_foreach_variable_with_modes(var, producer, nir_var_mem_ubo) {
1640       if (is_variable_present(consumer, var, nir_var_mem_ubo, linkage->spirv))
1641          num_shared_ubos += get_ubo_slots(var);
1642       else
1643          num_producer_ubos += get_ubo_slots(var);
1644    }
1645 
1646    nir_foreach_variable_with_modes(var, consumer, nir_var_mem_ubo) {
1647       if (!is_variable_present(producer, var, nir_var_mem_ubo,
1648                                linkage->spirv))
1649          num_consumer_ubos += get_ubo_slots(var);
1650    }
1651 
1652    linkage->can_move_ubos =
1653       num_producer_ubos + num_consumer_ubos + num_shared_ubos <=
1654       max_ubos_per_stage;
1655 }
1656 
1657 /******************************************************************
1658  * DEAD VARYINGS REMOVAL
1659  ******************************************************************/
1660 
1661 static void
1662 remove_all_stores(struct linkage_info *linkage, unsigned i,
1663                   bool *uses_xfb, nir_opt_varyings_progress *progress)
1664 {
1665    struct scalar_slot *slot = &linkage->slot[i];
1666 
1667    assert(!list_is_empty(&slot->producer.stores) &&
1668           list_is_empty(&slot->producer.loads) &&
1669           list_is_empty(&slot->consumer.loads));
1670 
1671    /* Remove all stores. */
1672    list_for_each_entry_safe(struct list_node, iter, &slot->producer.stores, head) {
1673       if (nir_remove_varying(iter->instr, linkage->consumer_stage)) {
1674          list_del(&iter->head);
1675          *progress |= nir_progress_producer;
1676       } else {
1677          if (has_xfb(iter->instr)) {
1678             *uses_xfb = true;
1679 
1680             if (!is_active_sysval_output(linkage, i, iter->instr)) {
1681                if (iter->instr->src[0].ssa->bit_size == 32)
1682                   BITSET_SET(linkage->xfb32_only_mask, i);
1683                else if (iter->instr->src[0].ssa->bit_size == 16)
1684                   BITSET_SET(linkage->xfb16_only_mask, i);
1685                else
1686                   unreachable("invalid store_output type");
1687             }
1688          }
1689       }
1690    }
1691 }
1692 
1693 static void
1694 remove_dead_varyings(struct linkage_info *linkage,
1695                      nir_opt_varyings_progress *progress)
1696 {
1697    unsigned i;
1698 
1699    /* Remove dead inputs and outputs. */
1700    BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
1701       struct scalar_slot *slot = &linkage->slot[i];
1702 
1703       /* Only indirectly-accessed slots can have no loads and stores because
1704        * tidy_up_indirect_varyings() moved them to the first array element.
1705        */
1706       assert(!list_is_empty(&slot->producer.stores) ||
1707              !list_is_empty(&slot->producer.loads) ||
1708              !list_is_empty(&slot->consumer.loads) ||
1709              BITSET_TEST(linkage->indirect_mask, i));
1710 
1711       /* Nothing to do if there are no loads and stores. */
1712       if (list_is_empty(&slot->producer.stores) &&
1713           list_is_empty(&slot->producer.loads) &&
1714           list_is_empty(&slot->consumer.loads))
1715          continue;
1716 
1717       /* If there are producer loads (e.g. TCS) but no consumer loads
1718        * (e.g. TES), set the "no_varying" flag to indicate that the outputs
1719        * are not consumed by the next shader stage (e.g. TES).
1720        */
1721       if (!list_is_empty(&slot->producer.stores) &&
1722           !list_is_empty(&slot->producer.loads) &&
1723           list_is_empty(&slot->consumer.loads)) {
1724          for (unsigned list_index = 0; list_index < 2; list_index++) {
1725             struct list_head *list = list_index ? &slot->producer.stores :
1726                                                   &slot->producer.loads;
1727 
1728             list_for_each_entry(struct list_node, iter, list, head) {
1729                nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr);
1730                sem.no_varying = 1;
1731                nir_intrinsic_set_io_semantics(iter->instr, sem);
1732             }
1733          }
1734 
1735          /* This tells the compaction to move these varyings to the end. */
1736          if (BITSET_TEST(linkage->flat32_mask, i)) {
1737             assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1738             BITSET_CLEAR(linkage->flat32_mask, i);
1739             BITSET_SET(linkage->no_varying32_mask, i);
1740          }
1741          if (BITSET_TEST(linkage->flat16_mask, i)) {
1742             assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1743             BITSET_CLEAR(linkage->flat16_mask, i);
1744             BITSET_SET(linkage->no_varying16_mask, i);
1745          }
1746          continue;
1747       }
1748 
1749       /* The varyings aren't dead if both loads and stores are present. */
1750       if (!list_is_empty(&slot->producer.stores) &&
1751           (!list_is_empty(&slot->producer.loads) ||
1752            !list_is_empty(&slot->consumer.loads)))
1753          continue;
1754 
1755       bool uses_xfb = false;
1756 
1757       if (list_is_empty(&slot->producer.stores)) {
1758          /* There are no stores. */
1759          assert(!list_is_empty(&slot->producer.loads) ||
1760                 !list_is_empty(&slot->consumer.loads));
1761 
1762          /* TEXn.xy loads can't be removed in FS because of the coord
1763           * replace state, but TEXn outputs can be removed if they are
1764           * not read by FS.
1765           *
1766           * TEXn.zw loads can be eliminated and replaced by (0, 1), which
1767           * is equal to the coord replace value.
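               *
               * With the scalar slot numbering used by this pass, i % 8
               * selects the component: 0 and 2 are TEXn.xy (kept below),
               * 4 is TEXn.z (replaced by 0) and 6 is TEXn.w (replaced by 1).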
1768           */
1769          if (is_interpolated_texcoord(linkage, i)) {
1770             assert(i % 2 == 0); /* high 16-bit slots disallowed */
1771             /* Keep TEXn.xy. */
1772             if (i % 8 < 4)
1773                continue;
1774          }
1775 
1776          /* Replace all loads with undef. Do that for both input loads
1777           * in the consumer stage and output loads in the producer stage
1778           * because we also want to eliminate TCS loads that have no
1779           * corresponding TCS stores.
1780           */
1781          for (unsigned list_index = 0; list_index < 2; list_index++) {
1782             struct list_head *list = list_index ? &slot->producer.loads :
1783                                                   &slot->consumer.loads;
1784             nir_builder *b = list_index ? &linkage->producer_builder :
1785                                           &linkage->consumer_builder;
1786 
1787             list_for_each_entry(struct list_node, iter, list, head) {
1788                nir_intrinsic_instr *loadi = iter->instr;
1789                nir_def *replacement = NULL;
1790 
1791                b->cursor = nir_before_instr(&loadi->instr);
1792 
1793                /* LAYER and VIEWPORT FS inputs should be replaced by 0
1794                 * instead of undef.
1795                 */
1796                gl_varying_slot location = (gl_varying_slot)(vec4_slot(i));
1797 
1798                if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
1799                    (location == VARYING_SLOT_LAYER ||
1800                     location == VARYING_SLOT_VIEWPORT ||
1801                     /* TEXn.z is replaced by 0 (matching coord replace) */
1802                     (is_interpolated_texcoord(linkage, i) && i % 8 == 4)))
1803                   replacement = nir_imm_intN_t(b, 0, loadi->def.bit_size);
1804                else if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
1805                         /* TEXn.w is replaced by 1 (matching coord replace) */
1806                         is_interpolated_texcoord(linkage, i) && i % 8 == 6)
1807                   replacement = nir_imm_floatN_t(b, 1, loadi->def.bit_size);
1808                else
1809                   replacement = nir_undef(b, 1, loadi->def.bit_size);
1810 
1811                nir_def_replace(&loadi->def, replacement);
1812 
1813                *progress |= list_index ? nir_progress_producer :
1814                                          nir_progress_consumer;
1815             }
1816          }
1817 
1818          /* Clear the lists. */
1819          list_inithead(&slot->producer.loads);
1820          list_inithead(&slot->consumer.loads);
1821       } else {
1822          /* There are no loads. */
1823          remove_all_stores(linkage, i, &uses_xfb, progress);
1824       }
1825 
1826       /* Clear bitmasks associated with this varying slot or array. */
1827       for (unsigned elem = 0; elem < slot->num_slots; elem++)
1828          clear_slot_info_after_removal(linkage, i + elem, uses_xfb);
1829    }
1830 }
1831 
1832 /******************************************************************
1833  * SSA CLONING HELPERS
1834  ******************************************************************/
1835 
1836 /* Pass flags for inter-shader code motion. Also used by helpers. */
1837 #define FLAG_ALU_IS_TES_INTERP_LOAD    BITFIELD_BIT(0)
1838 #define FLAG_MOVABLE                   BITFIELD_BIT(1)
1839 #define FLAG_UNMOVABLE                 BITFIELD_BIT(2)
1840 #define FLAG_POST_DOMINATOR_PROCESSED  BITFIELD_BIT(3)
1841 #define FLAG_GATHER_LOADS_VISITED      BITFIELD_BIT(4)
1842 
1843 #define FLAG_INTERP_MASK               BITFIELD_RANGE(5, 3)
1844 #define FLAG_INTERP_CONVERGENT         (0 << 5)
1845 #define FLAG_INTERP_FLAT               (1 << 5)
1846 /* FS-only interpolation modes. */
1847 #define FLAG_INTERP_PERSP_PIXEL        (2 << 5)
1848 #define FLAG_INTERP_PERSP_CENTROID     (3 << 5)
1849 #define FLAG_INTERP_PERSP_SAMPLE       (4 << 5)
1850 #define FLAG_INTERP_LINEAR_PIXEL       (5 << 5)
1851 #define FLAG_INTERP_LINEAR_CENTROID    (6 << 5)
1852 #define FLAG_INTERP_LINEAR_SAMPLE      (7 << 5)
1853 /* TES-only interpolation modes. (these were found in shaders) */
1854 #define FLAG_INTERP_TES_TRIANGLE_UVW   (2 << 5) /* v0*u + v1*v + v2*w */
1855 #define FLAG_INTERP_TES_TRIANGLE_WUV   (3 << 5) /* v0*w + v1*u + v2*v */
1856 /* TODO: Feel free to insert more TES interpolation equations here. */
1857 
1858 static bool
1859 can_move_deref_between_shaders(struct linkage_info *linkage, nir_instr *instr)
1860 {
1861    nir_deref_instr *deref = nir_instr_as_deref(instr);
1862    unsigned allowed_modes =
1863       (linkage->can_move_uniforms ? nir_var_uniform : 0) |
1864       (linkage->can_move_ubos ? nir_var_mem_ubo : 0);
1865 
1866    if (!nir_deref_mode_is_one_of(deref, allowed_modes))
1867       return false;
1868 
1869    /* Indirectly-indexed uniforms and UBOs are not moved into later shaders
1870     * due to performance concerns, and they are not moved into previous shaders
1871     * because it's unimplemented (TODO).
1872     */
1873    if (nir_deref_instr_has_indirect(deref))
1874       return false;
1875 
1876    nir_variable *var = nir_deref_instr_get_variable(deref);
1877 
1878    /* Subroutine uniforms are not moved. Moving them does work (subroutines
1879     * have been inlined at this point and the uniforms themselves would be
1880     * moved correctly), but subroutine functions aren't moved, and the linker
1881     * doesn't like it when a shader contains a subroutine uniform but no
1882     * subroutine functions. This could be fixed in the linker, but for now,
1883     * don't move subroutine uniforms.
1884     */
1885    if (var->name && strstr(var->name, "__subu_") == var->name)
1886       return false;
1887 
1888    return true;
1889 }
1890 
1891 static nir_intrinsic_instr *
1892 find_per_vertex_load_for_tes_interp(nir_instr *instr)
1893 {
1894    switch (instr->type) {
1895    case nir_instr_type_alu: {
1896       nir_alu_instr *alu = nir_instr_as_alu(instr);
1897       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
1898 
1899       for (unsigned i = 0; i < num_srcs; i++) {
1900          nir_instr *src = alu->src[i].src.ssa->parent_instr;
1901          nir_intrinsic_instr *intr = find_per_vertex_load_for_tes_interp(src);
1902 
1903          if (intr)
1904             return intr;
1905       }
1906       return NULL;
1907    }
1908 
1909    case nir_instr_type_intrinsic: {
1910       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1911 
1912       return intr->intrinsic == nir_intrinsic_load_per_vertex_input ?
1913                intr : NULL;
1914    }
1915 
1916    default:
1917       unreachable("unexpected instruction type");
1918    }
1919 }
1920 
1921 static nir_def *
1922 get_stored_value_for_load(struct linkage_info *linkage, nir_instr *instr)
1923 {
1924    nir_intrinsic_instr *intr;
1925 
1926    if (instr->type == nir_instr_type_intrinsic) {
1927       intr = nir_instr_as_intrinsic(instr);
1928    } else {
1929       assert(instr->type == nir_instr_type_alu &&
1930              instr->pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD);
1931       intr = find_per_vertex_load_for_tes_interp(instr);
1932    }
1933 
1934    unsigned slot_index = intr_get_scalar_16bit_slot(intr);
1935    assert(list_is_singular(&linkage->slot[slot_index].producer.stores));
1936 
1937    nir_def *stored_value =
1938       list_first_entry(&linkage->slot[slot_index].producer.stores,
1939                        struct list_node, head)->instr->src[0].ssa;
1940    assert(stored_value->num_components == 1);
1941    return stored_value;
1942 }
1943 
1944 /* Clone the SSA, which can be in a different shader. */
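     /* This is used in two directions: to rematerialize a producer expression
      * at its loads in the consumer (uniform expression propagation), and, for
      * backward inter-shader code motion, to rebuild a consumer expression in
      * the producer, where input loads are replaced by the values stored to
      * the corresponding outputs (see get_stored_value_for_load).
      */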
1945 static nir_def *
1946 clone_ssa(struct linkage_info *linkage, nir_builder *b, nir_def *ssa)
1947 {
1948    switch (ssa->parent_instr->type) {
1949    case nir_instr_type_load_const:
1950       return nir_build_imm(b, ssa->num_components, ssa->bit_size,
1951                            nir_instr_as_load_const(ssa->parent_instr)->value);
1952 
1953    case nir_instr_type_undef:
1954       return nir_undef(b, ssa->num_components, ssa->bit_size);
1955 
1956    case nir_instr_type_alu: {
1957       nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr);
1958 
1959       if (alu->instr.pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD) {
1960          /* We are cloning an interpolated TES load in the producer for
1961           * backward inter-shader code motion.
1962           */
1963          assert(&linkage->producer_builder == b);
1964          return get_stored_value_for_load(linkage, &alu->instr);
1965       }
1966 
1967       nir_def *src[4] = {0};
1968       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
1969       assert(num_srcs <= ARRAY_SIZE(src));
1970 
1971       for (unsigned i = 0; i < num_srcs; i++)
1972          src[i] = clone_ssa(linkage, b, alu->src[i].src.ssa);
1973 
1974       nir_def *clone = nir_build_alu(b, alu->op, src[0], src[1], src[2], src[3]);
1975       nir_alu_instr *alu_clone = nir_instr_as_alu(clone->parent_instr);
1976 
1977       alu_clone->exact = alu->exact;
1978       alu_clone->no_signed_wrap = alu->no_signed_wrap;
1979       alu_clone->no_unsigned_wrap = alu->no_unsigned_wrap;
1980       alu_clone->def.num_components = alu->def.num_components;
1981       alu_clone->def.bit_size = alu->def.bit_size;
1982 
1983       for (unsigned i = 0; i < num_srcs; i++) {
1984          memcpy(alu_clone->src[i].swizzle, alu->src[i].swizzle,
1985                 NIR_MAX_VEC_COMPONENTS);
1986       }
1987 
1988       return clone;
1989    }
1990 
1991    case nir_instr_type_intrinsic: {
1992       /* Clone load_deref of uniform or ubo. It's the only thing that can
1993        * occur here.
1994        */
1995       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr);
1996 
1997       switch (intr->intrinsic) {
1998       case nir_intrinsic_load_deref: {
1999          nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
2000 
2001          assert(deref);
2002          assert(nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo));
2003          /* Indirect uniform indexing is disallowed here. */
2004          assert(!nir_deref_instr_has_indirect(deref));
2005 
2006          /* Get the uniform from the original shader. */
2007          nir_variable *var = nir_deref_instr_get_variable(deref);
2008          assert(!(var->data.mode & nir_var_mem_ubo) || linkage->can_move_ubos);
2009 
2010          /* Declare the uniform in the target shader. If it's the same shader
2011           * (in the case of replacing output loads with a uniform), this has
2012           * no effect.
2013           */
2014          var = nir_clone_uniform_variable(b->shader, var, linkage->spirv);
2015 
2016          /* Re-build the uniform deref load before the load. */
2017          nir_deref_instr *load_uniform_deref =
2018             nir_clone_deref_instr(b, var, deref);
2019 
2020          return nir_load_deref(b, load_uniform_deref);
2021       }
2022 
2023       case nir_intrinsic_load_input:
2024       case nir_intrinsic_load_per_primitive_input:
2025       case nir_intrinsic_load_interpolated_input: {
2026          /* We are cloning load_input in the producer for backward
2027           * inter-shader code motion. Replace the input load with the stored
2028           * output value. That way we can clone any expression using inputs
2029           * from the consumer in the producer.
2030           */
2031          assert(&linkage->producer_builder == b);
2032          return get_stored_value_for_load(linkage, &intr->instr);
2033       }
2034 
2035       default:
2036          unreachable("unexpected intrinsic");
2037       }
2038    }
2039 
2040    default:
2041       unreachable("unexpected instruction type");
2042    }
2043 }
2044 
2045 /******************************************************************
2046  * UNIFORM EXPRESSION PROPAGATION (CONSTANTS, UNIFORMS, UBO LOADS)
2047  ******************************************************************/
2048 
2049 static void
2050 remove_all_stores_and_clear_slot(struct linkage_info *linkage, unsigned slot,
2051                                  nir_opt_varyings_progress *progress)
2052 {
2053    bool uses_xfb = false;
2054    remove_all_stores(linkage, slot, &uses_xfb, progress);
2055    clear_slot_info_after_removal(linkage, slot, uses_xfb);
2056 }
2057 
2058 struct is_uniform_expr_state {
2059    struct linkage_info *linkage;
2060    unsigned cost;
2061 };
2062 
2063 static bool
2064 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state);
2065 
2066 static bool
2067 src_is_uniform_expression(nir_src *src, void *data)
2068 {
2069    return is_uniform_expression(src->ssa->parent_instr,
2070                                 (struct is_uniform_expr_state*)data);
2071 }
2072 
2073 /**
2074  * Return whether instr is a uniform expression that can be moved into
2075  * the next shader.
2076  */
2077 static bool
2078 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state)
2079 {
2080    const nir_shader_compiler_options *options =
2081       state->linkage->producer_builder.shader->options;
2082 
2083    switch (instr->type) {
2084    case nir_instr_type_load_const:
2085    case nir_instr_type_undef:
2086       return true;
2087 
2088    case nir_instr_type_alu:
2089       state->cost += options->varying_estimate_instr_cost ?
2090                         options->varying_estimate_instr_cost(instr) : 1;
2091       return nir_foreach_src(instr, src_is_uniform_expression, state);
2092 
2093    case nir_instr_type_intrinsic:
2094       if (nir_instr_as_intrinsic(instr)->intrinsic ==
2095           nir_intrinsic_load_deref) {
2096          state->cost += options->varying_estimate_instr_cost ?
2097                            options->varying_estimate_instr_cost(instr) : 1;
2098          return nir_foreach_src(instr, src_is_uniform_expression, state);
2099       }
2100       return false;
2101 
2102    case nir_instr_type_deref:
2103       return can_move_deref_between_shaders(state->linkage, instr);
2104 
2105    default:
2106       return false;
2107    }
2108 }
2109 
2110 /**
2111  * Propagate constants, uniforms, UBO loads, and uniform expressions
2112  * in output components to input loads in the next shader and output
2113  * loads in the current stage, and remove the output components.
2114  *
2115  * Uniform expressions are ALU expressions only sourcing constants, uniforms,
2116  * and UBO loads.
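      *
      * A GLSL-level sketch (illustrative, not taken from any real shader):
      *
      *    VS:  out_v = 2.0 * u_scale + 0.5;   // uniform expression
      *    FS:  x = in_v;
      *
      * becomes
      *
      *    VS:  (store removed, slot freed up for compaction)
      *    FS:  x = 2.0 * u_scale + 0.5;       // recomputed in the FS
      *
      * subject to the clamp-vertex-color, coord-replace, and Inf->NaN
      * restrictions handled below.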
2117  */
2118 static void
2119 propagate_uniform_expressions(struct linkage_info *linkage,
2120                               nir_opt_varyings_progress *progress)
2121 {
2122    unsigned i;
2123 
2124    /* Clear pass_flags, which is used by clone_ssa. */
2125    nir_shader_clear_pass_flags(linkage->consumer_builder.shader);
2126 
2127    /* Find uniform expressions. If there are multiple stores, they should all
2128     * store the same value. That's guaranteed by output_equal_mask.
2129     */
2130    BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2131       if (!can_optimize_varying(linkage, vec4_slot(i)).propagate_uniform_expr)
2132          continue;
2133 
2134       struct scalar_slot *slot = &linkage->slot[i];
2135       assert(!list_is_empty(&slot->producer.loads) ||
2136              !list_is_empty(&slot->consumer.loads));
2137 
2138       struct is_uniform_expr_state state = {
2139          .linkage = linkage,
2140          .cost = 0,
2141       };
2142 
2143       if (!is_uniform_expression(slot->producer.value, &state))
2144          continue;
2145 
2146       if (state.cost > linkage->max_varying_expression_cost)
2147          continue;
2148 
2149       /* Colors can be propagated only if they are constants in [0, 1]
2150        * because that's the only case in which the clamp-vertex-color state
2151        * has no effect.
2152        */
2153       if (is_interpolated_color(linkage, i) &&
2154           (slot->producer.value->type != nir_instr_type_load_const ||
2155            nir_instr_as_load_const(slot->producer.value)->value[0].f32 < 0 ||
2156            nir_instr_as_load_const(slot->producer.value)->value[0].f32 > 1))
2157          continue;
2158 
2159       /* TEXn.zw can be propagated only if it's equal to (0, 1) because it's
2160        * the coord replace value.
2161        */
2162       if (is_interpolated_texcoord(linkage, i)) {
2163          assert(i % 2 == 0); /* high 16-bit slots disallowed */
2164 
2165          if (i % 8 == 0 || /* TEXn.x */
2166              i % 8 == 2 || /* TEXn.y */
2167              slot->producer.value->type != nir_instr_type_load_const)
2168             continue;
2169 
2170          float value =
2171             nir_instr_as_load_const(slot->producer.value)->value[0].f32;
2172 
2173          /* This ignores signed zeros, but those are destroyed by
2174           * interpolation, so it doesn't matter.
2175           */
2176          if ((i % 8 == 4 && value != 0) ||
2177              (i % 8 == 6 && value != 1))
2178             continue;
2179       }
2180 
2181       /* Replace all loads. Do that for both input and output loads. */
2182       for (unsigned list_index = 0; list_index < 2; list_index++) {
2183          struct list_head *load = list_index ? &slot->producer.loads :
2184                                                &slot->consumer.loads;
2185          nir_builder *b = list_index ? &linkage->producer_builder :
2186                                        &linkage->consumer_builder;
2187 
2188          list_for_each_entry(struct list_node, node, load, head) {
2189             nir_intrinsic_instr *loadi = node->instr;
2190             b->cursor = nir_before_instr(&loadi->instr);
2191 
2192             /* Copy the uniform expression before the load. */
2193             nir_def *clone = clone_ssa(linkage, b,
2194                                        nir_instr_def(slot->producer.value));
2195 
2196             /* Interpolation converts Infs to NaNs. If we skip it, we need to
2197              * convert Infs to NaNs manually.
2198              */
2199             if (loadi->intrinsic == nir_intrinsic_load_interpolated_input &&
2200                 preserve_nans(b->shader, clone->bit_size))
2201                clone = build_convert_inf_to_nan(b, clone);
2202 
2203             /* Replace the original load. */
2204             nir_def_replace(&loadi->def, clone);
2205             *progress |= list_index ? nir_progress_producer :
2206                                       nir_progress_consumer;
2207          }
2208       }
2209 
2210       /* Clear the lists. */
2211       list_inithead(&slot->producer.loads);
2212       list_inithead(&slot->consumer.loads);
2213 
2214       /* Remove all stores now that loads have been replaced. */
2215       remove_all_stores_and_clear_slot(linkage, i, progress);
2216    }
2217 }
2218 
2219 /******************************************************************
2220  * OUTPUT DEDUPLICATION
2221  ******************************************************************/
2222 
2223 /* We can only deduplicate outputs that have the same qualifier, and color
2224  * components must be deduplicated separately because they are affected by GL
2225  * states.
2226  *
2227  * QUAL_*_INTERP_ANY means that the interpolation qualifier doesn't matter for
2228  * deduplication as long as it's not flat.
2229  *
2230  * QUAL_COLOR_SHADEMODEL_ANY is the same, but can be switched to flat
2231  * by the flatshade state, so it can't be deduplicated with
2232  * QUAL_COLOR_INTERP_ANY, which is never flat.
2233  */
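      *
      * Example (illustrative): if a VS always writes the same SSA value to
      * two smooth-interpolated outputs, the FS loads of the second input are
      * redirected to the first slot and the duplicate stores are removed.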
2234 enum var_qualifier {
2235    QUAL_PATCH,
2236    QUAL_VAR_FLAT,
2237    QUAL_COLOR_FLAT,
2238    QUAL_EXPLICIT,
2239    QUAL_EXPLICIT_STRICT,
2240    QUAL_PER_PRIMITIVE,
2241    /* When nir_io_has_flexible_input_interpolation_except_flat is set: */
2242    QUAL_VAR_INTERP_ANY,
2243    QUAL_COLOR_INTERP_ANY,
2244    QUAL_COLOR_SHADEMODEL_ANY,
2245    /* When nir_io_has_flexible_input_interpolation_except_flat is unset: */
2246    QUAL_VAR_PERSP_PIXEL,
2247    QUAL_VAR_PERSP_CENTROID,
2248    QUAL_VAR_PERSP_SAMPLE,
2249    QUAL_VAR_LINEAR_PIXEL,
2250    QUAL_VAR_LINEAR_CENTROID,
2251    QUAL_VAR_LINEAR_SAMPLE,
2252    QUAL_COLOR_PERSP_PIXEL,
2253    QUAL_COLOR_PERSP_CENTROID,
2254    QUAL_COLOR_PERSP_SAMPLE,
2255    QUAL_COLOR_LINEAR_PIXEL,
2256    QUAL_COLOR_LINEAR_CENTROID,
2257    QUAL_COLOR_LINEAR_SAMPLE,
2258    QUAL_COLOR_SHADEMODEL_PIXEL,
2259    QUAL_COLOR_SHADEMODEL_CENTROID,
2260    QUAL_COLOR_SHADEMODEL_SAMPLE,
2261    NUM_DEDUP_QUALIFIERS,
2262 
2263    QUAL_SKIP,
2264    QUAL_UNKNOWN,
2265 };
2266 
2267 /* Return the input qualifier if all loads use the same one, else skip.
2268  * This is only used by output deduplication to determine input compatibility.
2269  */
2270 static enum var_qualifier
2271 get_input_qualifier(struct linkage_info *linkage, unsigned i)
2272 {
2273    assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
2274    struct scalar_slot *slot = &linkage->slot[i];
2275    bool is_color = is_interpolated_color(linkage, i);
2276    nir_intrinsic_instr *load =
2277       list_first_entry(&slot->consumer.loads, struct list_node, head)->instr;
2278 
2279    if (load->intrinsic == nir_intrinsic_load_input)
2280       return is_color ? QUAL_COLOR_FLAT : QUAL_VAR_FLAT;
2281 
2282    if (load->intrinsic == nir_intrinsic_load_per_primitive_input)
2283       return QUAL_PER_PRIMITIVE;
2284 
2285    if (load->intrinsic == nir_intrinsic_load_input_vertex) {
2286       return nir_intrinsic_io_semantics(load).interp_explicit_strict ?
2287                QUAL_EXPLICIT_STRICT : QUAL_EXPLICIT;
2288    }
2289 
2290    assert(load->intrinsic == nir_intrinsic_load_interpolated_input);
2291    nir_intrinsic_instr *baryc =
2292       nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
2293 
2294    if (linkage->consumer_builder.shader->options->io_options &
2295        nir_io_has_flexible_input_interpolation_except_flat) {
2296       if (is_color) {
2297          return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ?
2298                    QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY;
2299       } else {
2300          return QUAL_VAR_INTERP_ANY;
2301       }
2302    }
2303 
2304    /* Get the exact interpolation qualifier. */
2305    unsigned pixel_location;
2306    enum var_qualifier qual;
2307 
2308    switch (baryc->intrinsic) {
2309    case nir_intrinsic_load_barycentric_pixel:
2310       pixel_location = 0;
2311       break;
2312    case nir_intrinsic_load_barycentric_centroid:
2313       pixel_location = 1;
2314       break;
2315    case nir_intrinsic_load_barycentric_sample:
2316       pixel_location = 2;
2317       break;
2318    case nir_intrinsic_load_barycentric_at_offset:
2319    case nir_intrinsic_load_barycentric_at_sample:
2320       /* Don't deduplicate outputs that are interpolated at offset/sample. */
2321       return QUAL_SKIP;
2322    default:
2323       unreachable("unexpected barycentric src");
2324    }
2325 
2326    switch (nir_intrinsic_interp_mode(baryc)) {
2327    case INTERP_MODE_NONE:
2328       qual = is_color ? QUAL_COLOR_SHADEMODEL_PIXEL :
2329                         QUAL_VAR_PERSP_PIXEL;
2330       break;
2331    case INTERP_MODE_SMOOTH:
2332       qual = is_color ? QUAL_COLOR_PERSP_PIXEL : QUAL_VAR_PERSP_PIXEL;
2333       break;
2334    case INTERP_MODE_NOPERSPECTIVE:
2335       qual = is_color ? QUAL_COLOR_LINEAR_PIXEL : QUAL_VAR_LINEAR_PIXEL;
2336       break;
2337    default:
2338       unreachable("unexpected interp mode");
2339    }
2340 
2341    /* The ordering of the "qual" enum was carefully chosen to make this
2342     * addition correct.
2343     */
2344    STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID);
2345    STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 2 == QUAL_VAR_PERSP_SAMPLE);
2346    STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 1 == QUAL_VAR_LINEAR_CENTROID);
2347    STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 2 == QUAL_VAR_LINEAR_SAMPLE);
2348    STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 1 == QUAL_COLOR_PERSP_CENTROID);
2349    STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 2 == QUAL_COLOR_PERSP_SAMPLE);
2350    STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 1 == QUAL_COLOR_LINEAR_CENTROID);
2351    STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 2 == QUAL_COLOR_LINEAR_SAMPLE);
2352    STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 1 ==
2353                  QUAL_COLOR_SHADEMODEL_CENTROID);
2354    STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 2 ==
2355                  QUAL_COLOR_SHADEMODEL_SAMPLE);
2356    return qual + pixel_location;
2357 }
2358 
2359 static void
2360 deduplicate_outputs(struct linkage_info *linkage,
2361                     nir_opt_varyings_progress *progress)
2362 {
2363    struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = {NULL};
2364    unsigned i;
2365 
2366    /* Find duplicated outputs. If there are multiple stores, they should all
2367     * store the same value as all stores of some other output. That's
2368     * guaranteed by output_equal_mask.
2369     */
2370    BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2371       if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate)
2372          continue;
2373 
2374       struct scalar_slot *slot = &linkage->slot[i];
2375       enum var_qualifier qualifier;
2376       gl_varying_slot var_slot = vec4_slot(i);
2377 
2378       /* Determine which qualifier this slot has. */
2379       if ((var_slot >= VARYING_SLOT_PATCH0 &&
2380            var_slot <= VARYING_SLOT_PATCH31) ||
2381           var_slot == VARYING_SLOT_TESS_LEVEL_INNER ||
2382           var_slot == VARYING_SLOT_TESS_LEVEL_OUTER)
2383          qualifier = QUAL_PATCH;
2384       else if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
2385          qualifier = QUAL_VAR_FLAT;
2386       else
2387          qualifier = get_input_qualifier(linkage, i);
2388 
2389       if (qualifier == QUAL_SKIP)
2390          continue;
2391 
2392       struct hash_table **table = &tables[qualifier];
2393       if (!*table)
2394          *table = _mesa_pointer_hash_table_create(NULL);
2395 
2396       nir_instr *value = slot->producer.value;
2397 
2398       struct hash_entry *entry = _mesa_hash_table_search(*table, value);
2399       if (!entry) {
2400          _mesa_hash_table_insert(*table, value, (void*)(uintptr_t)i);
2401          continue;
2402       }
2403 
2404       /* We've found a duplicate. Redirect loads and remove stores. */
2405       struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data];
2406       nir_intrinsic_instr *store =
2407          list_first_entry(&found_slot->producer.stores,
2408                           struct list_node, head)->instr;
2409       nir_io_semantics sem = nir_intrinsic_io_semantics(store);
2410       unsigned component = nir_intrinsic_component(store);
2411 
2412       /* Redirect loads. */
2413       for (unsigned list_index = 0; list_index < 2; list_index++) {
2414          struct list_head *src_loads = list_index ? &slot->producer.loads :
2415                                                     &slot->consumer.loads;
2416          struct list_head *dst_loads = list_index ? &found_slot->producer.loads :
2417                                                     &found_slot->consumer.loads;
2418          bool has_progress = !list_is_empty(src_loads);
2419 
2420          list_for_each_entry(struct list_node, iter, src_loads, head) {
2421             nir_intrinsic_instr *loadi = iter->instr;
2422 
2423             nir_intrinsic_set_io_semantics(loadi, sem);
2424             nir_intrinsic_set_component(loadi, component);
2425 
2426             /* We also need to set the base to match the duplicate load, so
2427              * that CSE can eliminate it.
2428              */
2429             if (!list_is_empty(dst_loads)) {
2430                struct list_node *first =
2431                   list_first_entry(dst_loads, struct list_node, head);
2432                nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr));
2433             } else {
2434                /* Use the base of the found store if there are no loads (it can
2435                 * only happen with TCS).
2436                 */
2437                assert(list_index == 0);
2438                nir_intrinsic_set_base(loadi, nir_intrinsic_base(store));
2439             }
2440          }
2441 
2442          if (has_progress) {
2443             /* Move the redirected loads to the found slot, so that compaction
2444              * can find them.
2445              */
2446             list_splicetail(src_loads, dst_loads);
2447             list_inithead(src_loads);
2448 
2449             *progress |= list_index ? nir_progress_producer :
2450                                       nir_progress_consumer;
2451          }
2452       }
2453 
2454       /* Remove all duplicated stores now that loads have been redirected. */
2455       remove_all_stores_and_clear_slot(linkage, i, progress);
2456    }
2457 
2458    for (unsigned i = 0; i < ARRAY_SIZE(tables); i++)
2459       _mesa_hash_table_destroy(tables[i], NULL);
2460 }
2461 
2462 /******************************************************************
2463  * FIND OPEN-CODED TES INPUT INTERPOLATION
2464  ******************************************************************/
2465 
2466 static bool
2467 is_sysval(nir_instr *instr, gl_system_value sysval)
2468 {
2469    if (instr->type == nir_instr_type_intrinsic) {
2470       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2471 
2472       if (intr->intrinsic == nir_intrinsic_from_system_value(sysval))
2473          return true;
2474 
2475       if (intr->intrinsic == nir_intrinsic_load_deref) {
2476           nir_deref_instr *deref =
2477             nir_instr_as_deref(intr->src[0].ssa->parent_instr);
2478 
2479           return nir_deref_mode_is_one_of(deref, nir_var_system_value) &&
2480                  deref->var->data.location == sysval;
2481       }
2482    }
2483 
2484    return false;
2485 }
2486 
2487 static nir_alu_instr *
2488 get_single_use_as_alu(nir_def *def)
2489 {
2490    /* Only 1 use allowed. */
2491    if (!list_is_singular(&def->uses))
2492       return NULL;
2493 
2494    nir_instr *instr =
2495       nir_src_parent_instr(list_first_entry(&def->uses, nir_src, use_link));
2496    if (instr->type != nir_instr_type_alu)
2497       return NULL;
2498 
2499    return nir_instr_as_alu(instr);
2500 }
2501 
2502 static nir_alu_instr *
2503 check_tes_input_load_get_single_use_alu(nir_intrinsic_instr *load,
2504                                         unsigned *vertex_index,
2505                                         unsigned *vertices_used,
2506                                         unsigned max_vertices)
2507 {
2508    if (load->intrinsic != nir_intrinsic_load_per_vertex_input)
2509       return NULL;
2510 
2511    /* Check the vertex index. Each vertex can be loaded only once. */
2512    if (!nir_src_is_const(load->src[0]))
2513       return NULL;
2514 
2515    *vertex_index = nir_src_as_uint(load->src[0]);
2516    if (*vertex_index >= max_vertices ||
2517        *vertices_used & BITFIELD_BIT(*vertex_index))
2518       return NULL;
2519 
2520    *vertices_used |= BITFIELD_BIT(*vertex_index);
2521 
2522    return get_single_use_as_alu(&load->def);
2523 }
2524 
2525 static bool
2526 gather_fmul_tess_coord(nir_intrinsic_instr *load, nir_alu_instr *fmul,
2527                        unsigned vertex_index, unsigned *tess_coord_swizzle,
2528                        unsigned *tess_coord_used, nir_def **load_tess_coord)
2529 {
2530    unsigned other_src = fmul->src[0].src.ssa == &load->def;
2531    nir_instr *other_instr = fmul->src[other_src].src.ssa->parent_instr;
2532 
2533    assert(fmul->src[!other_src].swizzle[0] == 0);
2534 
2535    if (!is_sysval(other_instr, SYSTEM_VALUE_TESS_COORD))
2536       return false;
2537 
2538    unsigned tess_coord_component = fmul->src[other_src].swizzle[0];
2539    /* Each tesscoord component can be used only once. */
2540    if (*tess_coord_used & BITFIELD_BIT(tess_coord_component))
2541       return false;
2542 
2543    *tess_coord_swizzle |= tess_coord_component << (4 * vertex_index);
2544    *tess_coord_used |= BITFIELD_BIT(tess_coord_component);
2545    *load_tess_coord = &nir_instr_as_intrinsic(other_instr)->def;
2546    return true;
2547 }
2548 
2549 /**
2550  * Find interpolation of the form:
2551  *    input[0].slot * TessCoord.a +
2552  *    input[1].slot * TessCoord.b +
2553  *    input[2].slot * TessCoord.c;
2554  *
2555  * a,b,c can be any of x,y,z, but each can occur only once.
2556  */
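/* For illustration, a sketch of the scalarized NIR this matches (SSA names
 * are made up):
 *
 *    %a = load_per_vertex_input(vertex 0, slot)
 *    %b = load_per_vertex_input(vertex 1, slot)
 *    %c = load_per_vertex_input(vertex 2, slot)
 *    %m0 = fmul %a, TessCoord.x
 *    %m1 = fmul %b, TessCoord.y
 *    %m2 = fmul %c, TessCoord.z
 *    %s0 = fadd %m0, %m1
 *    %s1 = fadd %s0, %m2   <-- recognized as the interpolation result
 */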
2557 static bool
2558 find_tes_triangle_interp_3fmul_2fadd(struct linkage_info *linkage, unsigned i)
2559 {
2560    struct scalar_slot *slot = &linkage->slot[i];
2561    unsigned vertices_used = 0;
2562    unsigned tess_coord_used = 0;
2563    unsigned tess_coord_swizzle = 0;
2564    unsigned num_fmuls = 0, num_fadds = 0;
2565    nir_alu_instr *fadds[2];
2566    nir_def *load_tess_coord = NULL;
2567 
2568    /* Find 3 multiplications by TessCoord and their uses, which must be
2569     * fadds.
2570     */
2571    list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2572       unsigned vertex_index;
2573       nir_alu_instr *fmul =
2574          check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2575                                                  &vertices_used, 3);
2576       /* Only a maximum of 3 loads is expected. Also reject exact ops
2577        * because we are going to do an inexact transformation on them.
2578        */
2579       if (!fmul || fmul->op != nir_op_fmul || fmul->exact || num_fmuls == 3 ||
2580           !gather_fmul_tess_coord(iter->instr, fmul, vertex_index,
2581                                   &tess_coord_swizzle, &tess_coord_used,
2582                                   &load_tess_coord))
2583          return false;
2584 
2585       num_fmuls++;
2586 
2587       /* The multiplication must only be used by fadd. Also reject exact ops.
2588        */
2589       nir_alu_instr *fadd = get_single_use_as_alu(&fmul->def);
2590       if (!fadd || fadd->op != nir_op_fadd || fadd->exact)
2591          return false;
2592 
2593       /* The 3 fmuls must only be used by 2 fadds. */
2594       unsigned i;
2595       for (i = 0; i < num_fadds; i++) {
2596          if (fadds[i] == fadd)
2597             break;
2598       }
2599       if (i == num_fadds) {
2600          if (num_fadds == 2)
2601             return false;
2602 
2603          fadds[num_fadds++] = fadd;
2604       }
2605    }
2606 
2607    if (num_fmuls != 3 || num_fadds != 2)
2608       return false;
2609 
2610    assert(tess_coord_used == 0x7);
2611 
2612    /* We have found that the only uses of the 3 fmuls are 2 fadds, which
2613     * implies that at least 2 fmuls are used by the same fadd.
2614     *
2615     * Check that 1 fadd is used by the other fadd, which can only be
2616     * the result of the TessCoord interpolation.
2617     */
2618    for (unsigned i = 0; i < 2; i++) {
2619       if (get_single_use_as_alu(&fadds[i]->def) == fadds[!i]) {
2620          switch (tess_coord_swizzle) {
2621          case 0x210:
2622             slot->consumer.tes_interp_load = fadds[!i];
2623             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
2624             slot->consumer.tes_load_tess_coord = load_tess_coord;
2625             return true;
2626 
2627          case 0x102:
2628             slot->consumer.tes_interp_load = fadds[!i];
2629             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
2630             slot->consumer.tes_load_tess_coord = load_tess_coord;
2631             return true;
2632 
2633          default:
2634             return false;
2635          }
2636       }
2637    }
2638 
2639    return false;
2640 }
2641 
2642 /**
2643  * Find interpolation of the form:
2644  *    fma(input[0].slot, TessCoord.a,
2645  *        fma(input[1].slot, TessCoord.b,
2646  *            input[2].slot * TessCoord.c))
2647  *
2648  * a,b,c can be any of x,y,z, but each can occur only once.
2649  */
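/* For illustration, a sketch of the same pattern after fmul+fadd fusion
 * (SSA names are made up):
 *
 *    %m  = fmul %c, TessCoord.z
 *    %f0 = ffma %b, TessCoord.y, %m
 *    %f1 = ffma %a, TessCoord.x, %f0   <-- recognized as the interpolation result
 */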
2650 static bool
2651 find_tes_triangle_interp_1fmul_2ffma(struct linkage_info *linkage, unsigned i)
2652 {
2653    struct scalar_slot *slot = &linkage->slot[i];
2654    unsigned vertices_used = 0;
2655    unsigned tess_coord_used = 0;
2656    unsigned tess_coord_swizzle = 0;
2657    unsigned num_fmuls = 0, num_ffmas = 0;
2658    nir_alu_instr *ffmas[2], *fmul = NULL;
2659    nir_def *load_tess_coord = NULL;
2660 
2661    list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2662       unsigned vertex_index;
2663       nir_alu_instr *alu =
2664          check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2665                                                  &vertices_used, 3);
2666 
2667       /* Reject exact ops because we are going to do an inexact transformation
2668        * with it.
2669        */
2670       if (!alu || (alu->op != nir_op_fmul && alu->op != nir_op_ffma) ||
2671           alu->exact ||
2672           !gather_fmul_tess_coord(iter->instr, alu, vertex_index,
2673                                   &tess_coord_swizzle, &tess_coord_used,
2674                                   &load_tess_coord))
2675          return false;
2676 
2677       /* The multiplication must only be used by ffma. */
2678       if (alu->op == nir_op_fmul) {
2679          nir_alu_instr *ffma = get_single_use_as_alu(&alu->def);
2680          if (!ffma || ffma->op != nir_op_ffma)
2681             return false;
2682 
2683          if (num_fmuls == 1)
2684             return false;
2685 
2686          fmul = alu;
2687          num_fmuls++;
2688       } else {
2689          if (num_ffmas == 2)
2690             return false;
2691 
2692          ffmas[num_ffmas++] = alu;
2693       }
2694    }
2695 
2696    if (num_fmuls != 1 || num_ffmas != 2)
2697       return false;
2698 
2699    assert(tess_coord_used == 0x7);
2700 
2701    /* We have found that fmul has only 1 use and it's ffma, and there are 2
2702     * ffmas. Fail if neither ffma is using fmul.
2703     */
2704    if (ffmas[0]->src[2].src.ssa != &fmul->def &&
2705        ffmas[1]->src[2].src.ssa != &fmul->def)
2706       return false;
2707 
2708    /* If one ffma is using the other ffma, it's guaranteed to be src[2]. */
2709    for (unsigned i = 0; i < 2; i++) {
2710       if (get_single_use_as_alu(&ffmas[i]->def) == ffmas[!i]) {
2711          switch (tess_coord_swizzle) {
2712          case 0x210:
2713             slot->consumer.tes_interp_load = ffmas[!i];
2714             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
2715             slot->consumer.tes_load_tess_coord = load_tess_coord;
2716             return true;
2717 
2718          case 0x102:
2719             slot->consumer.tes_interp_load = ffmas[!i];
2720             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
2721             slot->consumer.tes_load_tess_coord = load_tess_coord;
2722             return true;
2723 
2724          default:
2725             return false;
2726          }
2727       }
2728    }
2729 
2730    return false;
2731 }
2732 
2733 static void
2734 find_open_coded_tes_input_interpolation(struct linkage_info *linkage)
2735 {
2736    if (linkage->consumer_stage != MESA_SHADER_TESS_EVAL)
2737       return;
2738 
2739    unsigned i;
2740    BITSET_FOREACH_SET(i, linkage->flat32_mask, NUM_SCALAR_SLOTS) {
2741       if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
2742           vec4_slot(i) <= VARYING_SLOT_PATCH31)
2743          continue;
2744       if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
2745          continue;
2746       if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
2747          continue;
2748    }
2749 
2750    BITSET_FOREACH_SET(i, linkage->flat16_mask, NUM_SCALAR_SLOTS) {
2751       if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
2752           vec4_slot(i) <= VARYING_SLOT_PATCH31)
2753          continue;
2754       if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
2755          continue;
2756       if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
2757          continue;
2758    }
2759 }
2760 
2761 /******************************************************************
2762  * BACKWARD INTER-SHADER CODE MOTION
2763  ******************************************************************/
2764 
2765 #define NEED_UPDATE_MOVABLE_FLAGS(instr) \
2766    (!((instr)->pass_flags & (FLAG_MOVABLE | FLAG_UNMOVABLE)))
2767 
2768 #define GET_SRC_INTERP(alu, i) \
2769    ((alu)->src[i].src.ssa->parent_instr->pass_flags & FLAG_INTERP_MASK)
2770 
2771 static bool
2772 can_move_alu_across_interp(struct linkage_info *linkage, nir_alu_instr *alu)
2773 {
2774    /* Exact ALUs can't be moved across interpolation. */
2775    if (alu->exact)
2776       return false;
2777 
2778    /* Interpolation converts Infs to NaNs. If we turn the result of an ALU
2779     * instruction into a new interpolated input, that adds an Infs-to-NaNs
2780     * conversion on the result while removing it from the interpolated values
2781     * the instruction sources. Not allowed if Infs and NaNs must be preserved.
2782     */
2783    if (preserve_infs_nans(linkage->consumer_builder.shader, alu->def.bit_size))
2784       return false;
2785 
2786    switch (alu->op) {
2787    /* Always legal if the sources are interpolated identically because:
2788     *    interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
2789     *    interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
2790     */
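   /* (A sketch of why this holds: interpolation is a weighted sum over the
    * primitive's vertices, interp(x, i, j) = sum_k w_k(i, j) * x_k with
    * sum_k w_k = 1, so it distributes over addition, and a value that is
    * equal at all vertices folds into the sum unchanged.)
    */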
2791    case nir_op_fadd:
2792    case nir_op_fsub:
2793    /* This is the same as multiplying by -1, which is always legal, see fmul.
2794     */
2795    case nir_op_fneg:
2796    case nir_op_mov:
2797       return true;
2798 
2799    /* At least one side of the multiplication must be convergent, because the
2800     * only multiplication identity that holds is:
2801     *    interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
2802     */
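   /* (A sketch of why: with the same weighted-sum view, a factor that is
    * equal at all vertices can be pulled inside the sum,
    * c * sum_k w_k * x_k = sum_k w_k * (c * x_k), whereas the product of
    * two non-convergent interpolated values cannot.)
    */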
2803    case nir_op_fmul:
2804    case nir_op_fmulz:
2805    case nir_op_ffma:
2806    case nir_op_ffmaz:
2807       return GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT ||
2808              GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
2809 
2810    case nir_op_fdiv:
2811       /* The right side must be convergent, which then follows the fmul rule.
2812        */
2813       return GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
2814 
2815    case nir_op_flrp:
2816       /* Using the same rule as fmul. */
2817       return (GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT &&
2818               GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT) ||
2819              GET_SRC_INTERP(alu, 2) == FLAG_INTERP_CONVERGENT;
2820 
2821    default:
2822       /* Moving other ALU instructions across interpolation is illegal. */
2823       return false;
2824    }
2825 }
2826 
2827 /* Determine whether an instruction is movable from the consumer to
2828  * the producer. Also determine which interpolation modes each ALU instruction
2829  * should use if its value was promoted to a new input.
2830  */
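/* For example, a sketch of the merge rules implemented below: an ALU whose
 * sources are one PERSP_PIXEL input and one constant (convergent) is movable
 * as PERSP_PIXEL, while one PERSP_PIXEL source combined with one
 * LINEAR_CENTROID source makes the instruction unmovable because the
 * interpolation modes conflict.
 */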
2831 static void
2832 update_movable_flags(struct linkage_info *linkage, nir_instr *instr)
2833 {
2834    /* This function shouldn't be called more than once for each instruction
2835     * to minimize recursive calling.
2836     */
2837    assert(NEED_UPDATE_MOVABLE_FLAGS(instr));
2838 
2839    switch (instr->type) {
2840    case nir_instr_type_undef:
2841    case nir_instr_type_load_const:
2842       /* Treat constants as convergent, which means compatible with both flat
2843        * and non-flat inputs.
2844        */
2845       instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
2846       return;
2847 
2848    case nir_instr_type_alu: {
2849       nir_alu_instr *alu = nir_instr_as_alu(instr);
2850       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2851       unsigned alu_interp;
2852 
2853       /* Derivatives depend on where they are computed and are thus unmovable. */
2854       if (nir_op_is_derivative(alu->op)) {
2855          instr->pass_flags |= FLAG_UNMOVABLE;
2856          return;
2857       }
2858 
2859       /* Make vector ops unmovable. They are technically movable but more
2860        * complicated, and NIR should be scalarized for this pass anyway.
2861        * The only remaining vector ops should be vecN for intrinsic sources.
2862        */
2863       if (alu->def.num_components > 1) {
2864          instr->pass_flags |= FLAG_UNMOVABLE;
2865          return;
2866       }
2867 
2868       alu_interp = FLAG_INTERP_CONVERGENT;
2869 
2870       for (unsigned i = 0; i < num_srcs; i++) {
2871          nir_instr *src_instr = alu->src[i].src.ssa->parent_instr;
2872 
2873          if (NEED_UPDATE_MOVABLE_FLAGS(src_instr))
2874             update_movable_flags(linkage, src_instr);
2875 
2876          if (src_instr->pass_flags & FLAG_UNMOVABLE) {
2877             instr->pass_flags |= FLAG_UNMOVABLE;
2878             return;
2879          }
2880 
2881          /* Determine which interpolation mode this ALU instruction should
2882           * use if it was promoted to a new input.
2883           */
2884          unsigned src_interp = src_instr->pass_flags & FLAG_INTERP_MASK;
2885 
2886          if (alu_interp == src_interp ||
2887              src_interp == FLAG_INTERP_CONVERGENT) {
2888             /* Nothing to do. */
2889          } else if (alu_interp == FLAG_INTERP_CONVERGENT) {
2890             alu_interp = src_interp;
2891          } else {
2892             assert(alu_interp != FLAG_INTERP_CONVERGENT &&
2893                    src_interp != FLAG_INTERP_CONVERGENT &&
2894                    alu_interp != src_interp);
2895             /* The ALU instruction sources conflicting interpolation flags.
2896              * It can never become a new input.
2897              */
2898             instr->pass_flags |= FLAG_UNMOVABLE;
2899             return;
2900          }
2901       }
2902 
2903       /* Check if we can move the ALU instruction across an interpolated
2904        * load into the previous shader.
2905        */
2906       if (alu_interp > FLAG_INTERP_FLAT &&
2907           !can_move_alu_across_interp(linkage, alu)) {
2908          instr->pass_flags |= FLAG_UNMOVABLE;
2909          return;
2910       }
2911 
2912       instr->pass_flags |= FLAG_MOVABLE | alu_interp;
2913       return;
2914    }
2915 
2916    case nir_instr_type_intrinsic: {
2917       /* Movable input loads already have FLAG_MOVABLE on them.
2918        * Unmovable input loads skipped by initialization get UNMOVABLE here.
2919        * (e.g. colors, texcoords)
2920        *
2921        * The only other movable intrinsic is load_deref for uniforms and UBOs.
2922        * Other intrinsics are not movable.
2923        */
2924       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2925 
2926       if (intr->intrinsic == nir_intrinsic_load_deref) {
2927          nir_instr *deref = intr->src[0].ssa->parent_instr;
2928 
2929          if (NEED_UPDATE_MOVABLE_FLAGS(deref))
2930             update_movable_flags(linkage, deref);
2931 
2932          if (deref->pass_flags & FLAG_MOVABLE) {
2933             /* Treat uniforms as convergent, which means compatible with both
2934              * flat and non-flat inputs.
2935              */
2936             instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
2937             return;
2938          }
2939       }
2940 
2941       instr->pass_flags |= FLAG_UNMOVABLE;
2942       return;
2943    }
2944 
2945    case nir_instr_type_deref:
2946       if (can_move_deref_between_shaders(linkage, instr))
2947          instr->pass_flags |= FLAG_MOVABLE;
2948       else
2949          instr->pass_flags |= FLAG_UNMOVABLE;
2950       return;
2951 
2952    default:
2953       instr->pass_flags |= FLAG_UNMOVABLE;
2954       return;
2955    }
2956 }
2957 
2958 /* Gather the input loads used by the post-dominator using DFS. */
2959 static void
2960 gather_used_input_loads(nir_instr *instr,
2961                         nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS],
2962                         unsigned *num_loads)
2963 {
2964    switch (instr->type) {
2965    case nir_instr_type_undef:
2966    case nir_instr_type_load_const:
2967       return;
2968 
2969    case nir_instr_type_alu: {
2970       nir_alu_instr *alu = nir_instr_as_alu(instr);
2971       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2972 
2973       for (unsigned i = 0; i < num_srcs; i++) {
2974          gather_used_input_loads(alu->src[i].src.ssa->parent_instr,
2975                                  loads, num_loads);
2976       }
2977       return;
2978    }
2979 
2980    case nir_instr_type_intrinsic: {
2981       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2982 
2983       switch (intr->intrinsic) {
2984       case nir_intrinsic_load_deref:
2985       case nir_intrinsic_load_tess_coord:
2986          return;
2987 
2988       case nir_intrinsic_load_input:
2989       case nir_intrinsic_load_per_vertex_input:
2990       case nir_intrinsic_load_interpolated_input:
2991          if (!(intr->instr.pass_flags & FLAG_GATHER_LOADS_VISITED)) {
2992             assert(*num_loads < NUM_SCALAR_SLOTS*8);
2993             loads[(*num_loads)++] = intr;
2994             intr->instr.pass_flags |= FLAG_GATHER_LOADS_VISITED;
2995          }
2996          return;
2997 
2998       default:
2999          printf("%u\n", intr->intrinsic);
3000          unreachable("unexpected intrinsic");
3001       }
3002    }
3003 
3004    default:
3005       unreachable("unexpected instr type");
3006    }
3007 }
3008 
3009 /* Move a post-dominator, which is an ALU opcode, into the previous shader,
3010  * and replace the post-dominator with a new input load.
3011  */
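/* A sketch of the transformation (slot names A and B are illustrative):
 *
 *    before:  producer stores outputs A and B;
 *             consumer computes fadd(load(A), load(B)) and uses the sum
 *
 *    after:   producer stores fadd(A, B) in A's slot;
 *             consumer uses load(A) directly, and B's store and load are
 *             removed if they became dead
 */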
3012 static bool
3013 try_move_postdominator(struct linkage_info *linkage,
3014                        struct nir_use_dominance_state *postdom_state,
3015                        nir_alu_instr *postdom,
3016                        nir_def *load_def,
3017                        nir_intrinsic_instr *first_load,
3018                        nir_opt_varyings_progress *progress)
3019 {
3020 #define PRINT 0
3021 #if PRINT
3022    printf("Trying to move post-dom: ");
3023    nir_print_instr(&postdom->instr, stdout);
3024    puts("");
3025 #endif
3026 
3027    /* Gather the input loads used by the post-dominator using DFS. */
3028    nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS*8];
3029    unsigned num_loads = 0;
3030    gather_used_input_loads(&postdom->instr, loads, &num_loads);
3031 
3032    /* Clear the flag set by gather_used_input_loads. */
3033    for (unsigned i = 0; i < num_loads; i++)
3034       loads[i]->instr.pass_flags &= ~FLAG_GATHER_LOADS_VISITED;
3035 
3036    /* For all the loads, the previous shader must have the corresponding
3037     * output stores in the same basic block because we are going to replace
3038     * them with 1 store. Only TCS and GS can have stores of different outputs
3039     * in different blocks.
3040     */
3041    nir_block *block = NULL;
3042 
3043    for (unsigned i = 0; i < num_loads; i++) {
3044       unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3045       struct scalar_slot *slot = &linkage->slot[slot_index];
3046 
3047       assert(list_is_singular(&slot->producer.stores));
3048       nir_intrinsic_instr *store =
3049          list_first_entry(&slot->producer.stores, struct list_node,
3050                           head)->instr;
3051 
3052       if (!block) {
3053          block = store->instr.block;
3054          continue;
3055       }
3056       if (block != store->instr.block)
3057          return false;
3058    }
3059 
3060    assert(block);
3061 
3062 #if PRINT
3063    printf("Post-dom accepted: ");
3064    nir_print_instr(&postdom->instr, stdout);
3065    puts("\n");
3066 #endif
3067 
3068    /* Determine the scalar slot index of the new varying. It will reuse
3069     * the slot of the load we started from because the load will be
3070     * removed.
3071     */
3072    unsigned final_slot = intr_get_scalar_16bit_slot(first_load);
3073 
3074    /* Replace the post-dominator in the consumer with a new input load.
3075     * Since we are reusing the same slot as the first load and it has
3076     * the right interpolation qualifiers, use it as the new load by using
3077     * it in place of the post-dominator.
3078     *
3079     * Boolean post-dominators are upcast in the producer and then downcast
3080     * in the consumer.
3081     */
3082    unsigned slot_index = final_slot;
3083    struct scalar_slot *slot = &linkage->slot[slot_index];
3084    nir_builder *b = &linkage->consumer_builder;
3085    b->cursor = nir_after_instr(load_def->parent_instr);
3086    unsigned alu_interp = postdom->instr.pass_flags & FLAG_INTERP_MASK;
3087    nir_def *new_input, *new_tes_loads[3];
3088    BITSET_WORD *mask;
3089 
3090    /* Convergent instruction results that are not interpolatable (integer
3091     * or FP64) should not be moved, because compaction can relocate
3092     * convergent varyings to interpolated vec4 slots: being convergent
3093     * implies that they can be interpolated, which doesn't work with
3094     * integer and FP64 values.
3095     *
3096     * So check the result type, and if it's not float and the driver
3097     * doesn't support convergent flat loads from interpolated vec4 slots,
3098     * don't move it.
3099     */
3100    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3101        alu_interp == FLAG_INTERP_CONVERGENT &&
3102        !linkage->can_mix_convergent_flat_with_interpolated &&
3103        ((postdom->def.bit_size != 16 && postdom->def.bit_size != 32) ||
3104         !(nir_op_infos[postdom->op].output_type & nir_type_float)))
3105       return false;
3106 
3107    /* NIR can't do 1-bit inputs. Convert them to a bigger size. */
3108    assert(postdom->def.bit_size & (1 | 16 | 32));
3109    unsigned new_bit_size = postdom->def.bit_size;
3110 
3111    if (new_bit_size == 1) {
3112       assert(alu_interp == FLAG_INTERP_CONVERGENT ||
3113              alu_interp == FLAG_INTERP_FLAT);
3114       /* TODO: We could use 16 bits instead, but that currently fails on AMD.
3115        */
3116       new_bit_size = 32;
3117    }
3118 
3119    bool rewrite_convergent_to_flat =
3120       alu_interp == FLAG_INTERP_CONVERGENT &&
3121       linkage->can_mix_convergent_flat_with_interpolated;
3122 
3123    /* Create the new input load. This creates a new load (or a series of
3124     * loads in case of open-coded TES interpolation) that's identical to
3125     * the original load(s).
3126     */
3127    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3128        alu_interp != FLAG_INTERP_FLAT && !rewrite_convergent_to_flat) {
3129       nir_def *baryc = NULL;
3130 
3131       /* Determine the barycentric coordinates. */
3132       switch (alu_interp) {
3133       case FLAG_INTERP_PERSP_PIXEL:
3134       case FLAG_INTERP_LINEAR_PIXEL:
3135          baryc = nir_load_barycentric_pixel(b, 32);
3136          break;
3137       case FLAG_INTERP_PERSP_CENTROID:
3138       case FLAG_INTERP_LINEAR_CENTROID:
3139          baryc = nir_load_barycentric_centroid(b, 32);
3140          break;
3141       case FLAG_INTERP_PERSP_SAMPLE:
3142       case FLAG_INTERP_LINEAR_SAMPLE:
3143          baryc = nir_load_barycentric_sample(b, 32);
3144          break;
3145       default:
3146          baryc = first_load->src[0].ssa;
3147          break;
3148       }
3149 
3150       if (baryc != first_load->src[0].ssa) {
3151          nir_intrinsic_instr *baryc_i =
3152             nir_instr_as_intrinsic(baryc->parent_instr);
3153 
3154          if (alu_interp == FLAG_INTERP_LINEAR_PIXEL ||
3155             alu_interp == FLAG_INTERP_LINEAR_CENTROID ||
3156             alu_interp == FLAG_INTERP_LINEAR_SAMPLE)
3157             nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_NOPERSPECTIVE);
3158          else
3159             nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_SMOOTH);
3160       }
3161 
3162       new_input = nir_load_interpolated_input(
3163                      b, 1, new_bit_size, baryc, nir_imm_int(b, 0),
3164                      .base = nir_intrinsic_base(first_load),
3165                      .component = nir_intrinsic_component(first_load),
3166                      .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3167                                   new_bit_size,
3168                      .io_semantics = nir_intrinsic_io_semantics(first_load));
3169 
3170       if (alu_interp == FLAG_INTERP_CONVERGENT) {
3171          mask = new_bit_size == 16 ? linkage->convergent16_mask
3172                                    : linkage->convergent32_mask;
3173       } else {
3174          mask = new_bit_size == 16 ? linkage->interp_fp16_mask
3175                                    : linkage->interp_fp32_mask;
3176       }
3177    } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3178               alu_interp > FLAG_INTERP_FLAT) {
3179       nir_def *zero = nir_imm_int(b, 0);
3180 
3181       for (unsigned i = 0; i < 3; i++) {
3182          new_tes_loads[i] =
3183             nir_load_per_vertex_input(b, 1, new_bit_size,
3184                   i ? nir_imm_int(b, i) : zero, zero,
3185                   .base = nir_intrinsic_base(first_load),
3186                   .component = nir_intrinsic_component(first_load),
3187                      .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3188                                   new_bit_size,
3189                   .io_semantics = nir_intrinsic_io_semantics(first_load));
3190       }
3191 
3192       int remap_uvw[3] = {0, 1, 2};
3193       int remap_wuv[3] = {2, 0, 1};
3194       int *remap;
3195 
3196       switch (alu_interp) {
3197       case FLAG_INTERP_TES_TRIANGLE_UVW:
3198          remap = remap_uvw;
3199          break;
3200       case FLAG_INTERP_TES_TRIANGLE_WUV:
3201          remap = remap_wuv;
3202          break;
3203       default:
3204          unreachable("invalid TES interpolation mode");
3205       }
3206 
3207       nir_def *tesscoord = slot->consumer.tes_load_tess_coord;
3208       nir_def *defs[3];
3209 
3210       for (unsigned i = 0; i < 3; i++) {
3211          if (i == 0) {
3212             defs[i] = nir_fmul(b, new_tes_loads[i],
3213                                nir_channel(b, tesscoord, remap[i]));
3214          } else {
3215             defs[i] = nir_ffma(b, new_tes_loads[i],
3216                                nir_channel(b, tesscoord, remap[i]),
3217                                defs[i - 1]);
3218          }
3219       }
3220       new_input = defs[2];
3221 
3222       mask = new_bit_size == 16 ? linkage->flat16_mask
3223                                 : linkage->flat32_mask;
3224    } else {
3225       /* We have to rewrite convergent to flat here and not during compaction
3226        * because compaction adds code to convert Infs to NaNs for
3227        * "load_interpolated_input -> load_input" replacements, which corrupts
3228        * integer data.
3229        */
3230       assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT ||
3231              alu_interp == FLAG_INTERP_FLAT || rewrite_convergent_to_flat);
3232 
3233       new_input =
3234          nir_load_input(b, 1, new_bit_size, nir_imm_int(b, 0),
3235                         .base = nir_intrinsic_base(first_load),
3236                         .component = nir_intrinsic_component(first_load),
3237                         .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3238                                     new_bit_size,
3239                         .io_semantics = nir_intrinsic_io_semantics(first_load));
3240 
3241       mask = new_bit_size == 16 ? linkage->flat16_mask
3242                                 : linkage->flat32_mask;
3243 
3244       if (rewrite_convergent_to_flat) {
3245          mask = new_bit_size == 16 ? linkage->convergent16_mask
3246                                    : linkage->convergent32_mask;
3247       }
3248    }
3249 
3250    assert(!BITSET_TEST(linkage->no_varying32_mask, slot_index));
3251    assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index));
3252 
3253    /* Re-set the category of the new scalar input. This will cause
3254     * the compaction to treat it as a different type, so that it will move it
3255     * into the vec4 that has compatible interpolation qualifiers.
3256     *
3257     * This shouldn't be done if any of the interp masks are not set, which
3258     * indicates that compaction is disallowed.
3259     */
3260    if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) ||
3261        BITSET_TEST(linkage->interp_fp16_mask, slot_index) ||
3262        BITSET_TEST(linkage->flat32_mask, slot_index) ||
3263        BITSET_TEST(linkage->flat16_mask, slot_index) ||
3264        BITSET_TEST(linkage->convergent32_mask, slot_index) ||
3265        BITSET_TEST(linkage->convergent16_mask, slot_index)) {
3266       BITSET_CLEAR(linkage->interp_fp32_mask, slot_index);
3267       BITSET_CLEAR(linkage->interp_fp16_mask, slot_index);
3268       BITSET_CLEAR(linkage->flat16_mask, slot_index);
3269       BITSET_CLEAR(linkage->flat32_mask, slot_index);
3270       BITSET_CLEAR(linkage->convergent16_mask, slot_index);
3271       BITSET_CLEAR(linkage->convergent32_mask, slot_index);
3272       BITSET_SET(mask, slot_index);
3273    }
3274 
3275    /* Replace the existing load with the new load in the slot. */
3276    if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3277        alu_interp >= FLAG_INTERP_TES_TRIANGLE_UVW) {
3278       /* For TES, replace all 3 loads. */
3279       unsigned i = 0;
3280       list_for_each_entry(struct list_node, iter, &slot->consumer.loads,
3281                           head) {
3282          assert(i < 3);
3283          iter->instr = nir_instr_as_intrinsic(new_tes_loads[i]->parent_instr);
3284          i++;
3285       }
3286 
3287       assert(i == 3);
3288       assert(postdom->def.bit_size != 1);
3289 
3290       slot->consumer.tes_interp_load =
3291          nir_instr_as_alu(new_input->parent_instr);
3292    } else {
3293       assert(list_is_singular(&slot->consumer.loads));
3294       list_first_entry(&slot->consumer.loads, struct list_node, head)->instr =
3295          nir_instr_as_intrinsic(new_input->parent_instr);
3296 
3297       /* The input is a bigger type even if the post-dominator is boolean. */
3298       if (postdom->def.bit_size == 1)
3299          new_input = nir_ine_imm(b, new_input, 0);
3300    }
3301 
3302    nir_def_rewrite_uses(&postdom->def, new_input);
3303 
3304    /* Clone the post-dominator at the end of the block in the producer
3305     * where the output stores are.
3306     */
3307    b = &linkage->producer_builder;
3308    b->cursor = nir_after_block_before_jump(block);
3309    nir_def *producer_clone = clone_ssa(linkage, b, &postdom->def);
3310 
3311    /* Boolean post-dominators are upcast in the producer because we can't
3312     * use 1-bit outputs.
3313     */
3314    if (producer_clone->bit_size == 1)
3315       producer_clone = nir_b2bN(b, producer_clone, new_bit_size);
3316 
3317    /* Move the existing store to the end of the block and rewrite it to use
3318     * the post-dominator result.
3319     */
3320    nir_intrinsic_instr *store =
3321       list_first_entry(&linkage->slot[final_slot].producer.stores,
3322                        struct list_node, head)->instr;
3323    nir_instr_move(b->cursor, &store->instr);
3324    if (nir_src_bit_size(store->src[0]) != producer_clone->bit_size)
3325       nir_intrinsic_set_src_type(store, nir_alu_type_get_base_type(nir_intrinsic_src_type(store)) |
3326                                         producer_clone->bit_size);
3327    nir_src_rewrite(&store->src[0], producer_clone);
3328 
3329    /* Remove all loads and stores that we are replacing from the producer
3330     * and consumer.
3331     */
3332    for (unsigned i = 0; i < num_loads; i++) {
3333       unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3334 
3335       if (slot_index == final_slot) {
3336          /* Keep the load and store that we reused. */
3337          continue;
3338       }
3339 
3340       /* Remove loads and stores that are dead after the code motion. Only
3341        * those loads that are post-dominated by the post-dominator are dead.
3342        */
3343       struct scalar_slot *slot = &linkage->slot[slot_index];
3344       nir_instr *load;
3345 
3346       if (slot->consumer.tes_interp_load) {
3347          load = &slot->consumer.tes_interp_load->instr;
3348 
3349          /* With interpolated TES loads, we get here 3 times, once for each
3350           * per-vertex load. Skip this if we've been here before.
3351           */
3352          if (list_is_empty(&slot->producer.stores)) {
3353             assert(list_is_empty(&slot->consumer.loads));
3354             continue;
3355          }
3356       } else {
3357          assert(list_is_singular(&slot->consumer.loads));
3358          load = &list_first_entry(&slot->consumer.loads,
3359                                   struct list_node, head)->instr->instr;
3360       }
3361 
3362       if (nir_instr_dominates_use(postdom_state, &postdom->instr, load)) {
3363          list_inithead(&slot->consumer.loads);
3364 
3365          /* Remove stores. (transform feedback is allowed here, just not
3366           * in final_slot)
3367           */
3368          remove_all_stores_and_clear_slot(linkage, slot_index, progress);
3369       }
3370    }
3371 
3372    *progress |= nir_progress_producer | nir_progress_consumer;
3373    return true;
3374 }
3375 
3376 static bool
3377 backward_inter_shader_code_motion(struct linkage_info *linkage,
3378                                   nir_opt_varyings_progress *progress)
3379 {
3380    /* These producers are not supported. The description at the beginning
3381     * suggests a possible workaround.
3382     */
3383    if (linkage->producer_stage == MESA_SHADER_GEOMETRY ||
3384        linkage->producer_stage == MESA_SHADER_MESH ||
3385        linkage->producer_stage == MESA_SHADER_TASK)
3386       return false;
3387 
3388    /* Clear pass_flags. */
3389    nir_shader_clear_pass_flags(linkage->consumer_builder.shader);
3390 
3391    /* Gather inputs that can be moved into the previous shader. These are only
3392     * checked for the basic constraints for movability.
3393     */
3394    struct {
3395       nir_def *def;
3396       nir_intrinsic_instr *first_load;
3397    } movable_loads[NUM_SCALAR_SLOTS];
3398    unsigned num_movable_loads = 0;
3399    unsigned i;
3400 
3401    BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
3402       if (!can_optimize_varying(linkage,
3403                                 vec4_slot(i)).inter_shader_code_motion)
3404          continue;
3405 
3406       struct scalar_slot *slot = &linkage->slot[i];
3407 
3408       assert(!list_is_empty(&slot->producer.stores));
3409       assert(!is_interpolated_texcoord(linkage, i));
3410       assert(!is_interpolated_color(linkage, i));
3411 
3412       /* Disallow producer loads. */
3413       if (!list_is_empty(&slot->producer.loads))
3414          continue;
3415 
3416       /* There should be only 1 store per output. */
3417       if (!list_is_singular(&slot->producer.stores))
3418          continue;
3419 
3420       nir_def *load_def = NULL;
3421       nir_intrinsic_instr *load =
3422          list_first_entry(&slot->consumer.loads, struct list_node,
3423                           head)->instr;
3424 
3425       nir_intrinsic_instr *store =
3426         list_first_entry(&slot->producer.stores, struct list_node,
3427                          head)->instr;
3428 
3429       /* Set interpolation flags.
3430        * Handle interpolated TES loads first because they are special.
3431        */
3432       if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3433           slot->consumer.tes_interp_load) {
3434          if (linkage->producer_stage == MESA_SHADER_VERTEX) {
3435             /* VS -> TES has no constraints on VS stores. */
3436             load_def = &slot->consumer.tes_interp_load->def;
3437             load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3438                                                   slot->consumer.tes_interp_mode;
3439          } else {
3440             assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3441             assert(store->intrinsic == nir_intrinsic_store_per_vertex_output);
3442 
3443             /* The vertex index of the store must be InvocationID. */
3444             if (is_sysval(store->src[1].ssa->parent_instr,
3445                           SYSTEM_VALUE_INVOCATION_ID)) {
3446                load_def = &slot->consumer.tes_interp_load->def;
3447                load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3448                                                      slot->consumer.tes_interp_mode;
3449             } else {
3450                continue;
3451             }
3452          }
3453       } else {
3454          /* Allow only 1 load per input. CSE should be run before this. */
3455          if (!list_is_singular(&slot->consumer.loads))
3456             continue;
3457 
3458          /* This can only be TCS -> TES, which is handled above and rejected
3459           * otherwise.
3460           */
3461          if (store->intrinsic == nir_intrinsic_store_per_vertex_output) {
3462             assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3463             continue;
3464          }
3465 
3466          /* TODO: handle load_per_vertex_input for TCS and GS.
3467           * TES can also occur here if tes_interp_load is NULL.
3468           */
3469          if (load->intrinsic == nir_intrinsic_load_per_vertex_input)
3470             continue;
3471 
3472          load_def = &load->def;
3473 
3474          switch (load->intrinsic) {
3475          case nir_intrinsic_load_interpolated_input: {
3476             assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
3477             nir_intrinsic_instr *baryc =
3478                nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
3479             nir_intrinsic_op op = baryc->intrinsic;
3480             enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc);
3481             bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
3482             bool convergent = BITSET_TEST(linkage->convergent32_mask, i) ||
3483                               BITSET_TEST(linkage->convergent16_mask, i);
3484 
3485             assert(interp == INTERP_MODE_NONE ||
3486                    interp == INTERP_MODE_SMOOTH ||
3487                    interp == INTERP_MODE_NOPERSPECTIVE);
3488 
3489             if (convergent) {
3490                load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3491             } else if (op == nir_intrinsic_load_barycentric_pixel) {
3492                load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_PIXEL
3493                                                 : FLAG_INTERP_PERSP_PIXEL;
3494             } else if (op == nir_intrinsic_load_barycentric_centroid) {
3495                load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_CENTROID
3496                                                 : FLAG_INTERP_PERSP_CENTROID;
3497             } else if (op == nir_intrinsic_load_barycentric_sample) {
3498                load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_SAMPLE
3499                                                 : FLAG_INTERP_PERSP_SAMPLE;
3500             } else {
3501                /* Optimizing at_offset and at_sample would be possible but
3502                 * maybe not worth it if they are not convergent. Convergent
3503                 * inputs can trivially switch the barycentric coordinates
3504                 * to different ones or flat.
3505                 */
3506                continue;
3507             }
3508             break;
3509          }
3510          case nir_intrinsic_load_input:
3511             if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
3512                if (BITSET_TEST(linkage->convergent32_mask, i) ||
3513                    BITSET_TEST(linkage->convergent16_mask, i))
3514                   load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3515                else
3516                   load->instr.pass_flags |= FLAG_INTERP_FLAT;
3517             } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
3518                assert(vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3519                       vec4_slot(i) <= VARYING_SLOT_PATCH31);
3520                /* Patch inputs are always convergent. */
3521                load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3522             } else {
3523                /* It's not a fragment shader. We still need to set this. */
3524                load->instr.pass_flags |= FLAG_INTERP_FLAT;
3525             }
3526             break;
3527          case nir_intrinsic_load_per_primitive_input:
3528          case nir_intrinsic_load_input_vertex:
3529             /* Inter-shader code motion is unimplemented for these. */
3530             continue;
3531          default:
3532             unreachable("unexpected load intrinsic");
3533          }
3534       }
3535 
3536       load_def->parent_instr->pass_flags |= FLAG_MOVABLE;
3537 
3538       /* Disallow transform feedback. The load is "movable" for the purpose of
3539        * finding a movable post-dominator; we just can't rewrite the store
3540        * because we need to keep it for xfb, so the post-dominator search
3541        * will have to start from a different load (only that varying will have
3542        * its value rewritten).
3543        */
3544       if (BITSET_TEST(linkage->xfb_mask, i))
3545          continue;
3546 
3547       assert(num_movable_loads < ARRAY_SIZE(movable_loads));
3548       movable_loads[num_movable_loads].def = load_def;
3549       movable_loads[num_movable_loads].first_load = load;
3550       num_movable_loads++;
3551    }
3552 
3553    if (!num_movable_loads)
3554       return false;
3555 
3556    /* Inter-shader code motion turns ALU results into outputs, but not all
3557     * bit sizes are supported by outputs.
3558     *
3559     * The 1-bit type is allowed because the pass always promotes 1-bit
3560     * outputs to 16 or 32 bits, whichever is supported.
3561     *
3562     * TODO: We could support replacing 2 32-bit inputs with one 64-bit
3563     * post-dominator by supporting 64 bits here, but the likelihood of that
3564     * occurring seems low.
3565     */
3566    unsigned supported_io_types = 32 | 1;
3567 
3568    if (linkage->producer_builder.shader->options->io_options &
3569        linkage->consumer_builder.shader->options->io_options &
3570        nir_io_16bit_input_output_support)
3571       supported_io_types |= 16;
3572 
3573    struct nir_use_dominance_state *postdom_state =
3574       nir_calc_use_dominance_impl(linkage->consumer_builder.impl, true);
3575 
3576    for (unsigned i = 0; i < num_movable_loads; i++) {
3577       nir_def *load_def = movable_loads[i].def;
3578       nir_instr *iter = load_def->parent_instr;
3579       nir_instr *movable_postdom = NULL;
3580 
3581       /* Find the farthest post-dominator that is movable. */
3582       while (iter) {
3583          iter = nir_get_immediate_use_dominator(postdom_state, iter);
3584          if (iter) {
3585             if (NEED_UPDATE_MOVABLE_FLAGS(iter))
3586                update_movable_flags(linkage, iter);
3587 
3588             if (iter->pass_flags & FLAG_UNMOVABLE)
3589                break;
3590 
3591             /* This can only be an ALU instruction. */
3592             nir_alu_instr *alu = nir_instr_as_alu(iter);
3593 
3594             /* Skip unsupported bit sizes and keep searching. */
3595             if (!(alu->def.bit_size & supported_io_types))
3596                continue;
3597 
3598             /* Skip comparison opcodes that directly source the first load
3599              * and a constant because any 1-bit values would have to be
3600              * converted to 32 bits in the producer and then converted back
3601              * to 1 bit using nir_op_ine in the consumer, achieving nothing.
3602              */
3603             if (alu->def.bit_size == 1 &&
3604                 ((nir_op_infos[alu->op].num_inputs == 1 &&
3605                   alu->src[0].src.ssa == load_def) ||
3606                  (nir_op_infos[alu->op].num_inputs == 2 &&
3607                   ((alu->src[0].src.ssa == load_def &&
3608                     alu->src[1].src.ssa->parent_instr->type ==
3609                     nir_instr_type_load_const) ||
3610                    (alu->src[0].src.ssa->parent_instr->type ==
3611                     nir_instr_type_load_const &&
3612                     alu->src[1].src.ssa == load_def)))))
3613                continue;
3614 
3615             movable_postdom = iter;
3616          }
3617       }
3618 
3619       /* Add the post-dominator to the list unless it's been added already. */
3620       if (movable_postdom &&
3621           !(movable_postdom->pass_flags & FLAG_POST_DOMINATOR_PROCESSED)) {
3622          if (try_move_postdominator(linkage, postdom_state,
3623                                     nir_instr_as_alu(movable_postdom),
3624                                     load_def, movable_loads[i].first_load,
3625                                     progress)) {
3626             /* Moving only one postdominator can change the IR enough that
3627              * we should start from scratch.
3628              */
3629             ralloc_free(postdom_state);
3630             return true;
3631          }
3632 
3633          movable_postdom->pass_flags |= FLAG_POST_DOMINATOR_PROCESSED;
3634       }
3635    }
3636 
3637    ralloc_free(postdom_state);
3638    return false;
3639 }
3640 
3641 /******************************************************************
3642  * COMPACTION
3643  ******************************************************************/
3644 
3645 /* Relocate a slot to a new index. Used by compaction. new_index is
3646  * the component index at 16-bit granularity, so the size of vec4 is 8
3647  * in that representation.
3648  */
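/* For example, following the decoding done below: new_index = VAR1 * 8 + 5
 * means vec4 slot VAR1, component (5 % 8) / 2 = 2 (i.e. .z), and
 * high_16bits = 5 % 2 = 1, i.e. the upper half of VAR1.z.
 */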
3649 static void
3650 relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot,
3651               unsigned i, unsigned new_index, enum fs_vec4_type fs_vec4_type,
3652               bool convergent, nir_opt_varyings_progress *progress)
3653 {
3654    assert(!list_is_empty(&slot->producer.stores));
3655 
3656    list_for_each_entry(struct list_node, iter, &slot->producer.stores, head) {
3657       assert(!nir_intrinsic_io_semantics(iter->instr).no_varying ||
3658              has_xfb(iter->instr) ||
3659              linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3660       assert(!is_active_sysval_output(linkage, i, iter->instr));
3661    }
3662 
3663    /* Relocate the slot in all loads and stores. */
3664    struct list_head *instruction_lists[3] = {
3665       &slot->producer.stores,
3666       &slot->producer.loads,
3667       &slot->consumer.loads,
3668    };
3669 
3670    for (unsigned i = 0; i < ARRAY_SIZE(instruction_lists); i++) {
3671       list_for_each_entry(struct list_node, iter, instruction_lists[i], head) {
3672          nir_intrinsic_instr *intr = iter->instr;
3673 
3674          gl_varying_slot new_semantic = vec4_slot(new_index);
3675          unsigned new_component = (new_index % 8) / 2;
3676          bool new_high_16bits = new_index % 2;
3677 
3678          /* We also need to relocate xfb info because it's always relative
3679           * to component 0. This just moves it into the correct xfb slot.
3680           */
3681          if (has_xfb(intr)) {
3682             unsigned old_component = nir_intrinsic_component(intr);
3683             static const nir_io_xfb clear_xfb;
3684             nir_io_xfb xfb;
3685             bool new_is_odd = new_component % 2 == 1;
3686 
3687             memset(&xfb, 0, sizeof(xfb));
3688 
3689             if (old_component >= 2) {
3690                xfb.out[new_is_odd] = nir_intrinsic_io_xfb2(intr).out[old_component - 2];
3691                nir_intrinsic_set_io_xfb2(intr, clear_xfb);
3692             } else {
3693                xfb.out[new_is_odd] = nir_intrinsic_io_xfb(intr).out[old_component];
3694                nir_intrinsic_set_io_xfb(intr, clear_xfb);
3695             }
3696 
3697             if (new_component >= 2)
3698                nir_intrinsic_set_io_xfb2(intr, xfb);
3699             else
3700                nir_intrinsic_set_io_xfb(intr, xfb);
3701          }
3702 
3703          nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3704 
3705          /* When relocating a back color store, don't change it to a front
3706           * color as that would be incorrect. Keep it as back color and only
3707           * relocate it between BFC0 and BFC1.
3708           */
3709          if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3710              (sem.location == VARYING_SLOT_BFC0 ||
3711               sem.location == VARYING_SLOT_BFC1)) {
3712             assert(new_semantic == VARYING_SLOT_COL0 ||
3713                    new_semantic == VARYING_SLOT_COL1);
3714             new_semantic = VARYING_SLOT_BFC0 +
3715                            (new_semantic - VARYING_SLOT_COL0);
3716          }
3717 
3718 #if PRINT_RELOCATE_SLOT
3719          unsigned bit_size =
3720             (intr->intrinsic == nir_intrinsic_load_input ||
3721              intr->intrinsic == nir_intrinsic_load_input_vertex ||
3722              intr->intrinsic == nir_intrinsic_load_interpolated_input)
3723             ? intr->def.bit_size : intr->src[0].ssa->bit_size;
3724 
3725          assert(bit_size == 16 || bit_size == 32);
3726 
3727          fprintf(stderr, "--- relocating: %s.%c%s%s -> %s.%c%s%s FS_VEC4_TYPE_%s\n",
3728                  gl_varying_slot_name_for_stage(sem.location, linkage->producer_stage) + 13,
3729                  "xyzw"[nir_intrinsic_component(intr) % 4],
3730                  (bit_size == 16 && !sem.high_16bits) ? ".lo" : "",
3731                  (bit_size == 16 && sem.high_16bits) ? ".hi" : "",
3732                  gl_varying_slot_name_for_stage(new_semantic, linkage->producer_stage) + 13,
3733                  "xyzw"[new_component % 4],
3734                  (bit_size == 16 && !new_high_16bits) ? ".lo" : "",
3735                  (bit_size == 16 && new_high_16bits) ? ".hi" : "",
3736                  fs_vec4_type_strings[fs_vec4_type]);
3737 #endif /* PRINT_RELOCATE_SLOT */
3738 
3739          sem.location = new_semantic;
3740          sem.high_16bits = new_high_16bits;
3741 
3742          /* This is never indirectly indexed. Simplify num_slots. */
3743          sem.num_slots = 1;
3744 
3745          nir_intrinsic_set_io_semantics(intr, sem);
3746          nir_intrinsic_set_component(intr, new_component);
3747 
3748          if (fs_vec4_type == FS_VEC4_TYPE_PER_PRIMITIVE) {
3749             assert(intr->intrinsic == nir_intrinsic_store_per_primitive_output ||
3750                    intr->intrinsic == nir_intrinsic_load_per_primitive_output ||
3751                    intr->intrinsic == nir_intrinsic_load_per_primitive_input);
3752          } else {
3753             assert(intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
3754                    intr->intrinsic != nir_intrinsic_load_per_primitive_output &&
3755                    intr->intrinsic != nir_intrinsic_load_per_primitive_input);
3756          }
3757 
3758          /* This path is used when promoting convergent interpolated
3759           * inputs to flat. Replace load_interpolated_input with load_input.
3760           */
3761          if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
3762              (fs_vec4_type == FS_VEC4_TYPE_FLAT ||
3763               /* Promote all convergent loads to flat if the driver supports it. */
3764               (convergent &&
3765                linkage->can_mix_convergent_flat_with_interpolated))) {
3766             assert(instruction_lists[i] == &slot->consumer.loads);
3767             nir_builder *b = &linkage->consumer_builder;
3768 
3769             b->cursor = nir_before_instr(&intr->instr);
3770             nir_def *load =
3771                nir_load_input(b, 1, intr->def.bit_size,
3772                               nir_get_io_offset_src(intr)->ssa,
3773                               .io_semantics = sem,
3774                               .component = new_component,
3775                               .dest_type = nir_intrinsic_dest_type(intr));
3776 
3777             nir_def_rewrite_uses(&intr->def, load);
3778             iter->instr = nir_instr_as_intrinsic(load->parent_instr);
3779             nir_instr_remove(&intr->instr);
3780             *progress |= nir_progress_consumer;
3781 
3782             /* Interpolation converts Infs to NaNs. If we change it to flat,
3783              * we need to convert Infs to NaNs manually in the producer to
3784              * preserve that.
3785              */
3786             if (preserve_nans(linkage->consumer_builder.shader,
3787                               load->bit_size)) {
3788                list_for_each_entry(struct list_node, iter,
3789                                    &slot->producer.stores, head) {
3790                   nir_intrinsic_instr *store = iter->instr;
3791 
3792                   nir_builder *b = &linkage->producer_builder;
3793                   b->cursor = nir_before_instr(&store->instr);
3794                   nir_def *repl =
3795                      build_convert_inf_to_nan(b, store->src[0].ssa);
3796                   nir_src_rewrite(&store->src[0], repl);
3797                }
3798             }
3799          }
3800       }
3801    }
3802 }
3803 
3804 /**
3805  * A helper function for compact_varyings(). Assign new slot indices for
3806  * existing slots of a certain vec4 type (FLAT, FP16, or FP32). Skip already-
3807  * assigned scalar slots (determined by assigned_mask) and don't assign to
3808  * vec4 slots that have an incompatible vec4 type (determined by
3809  * assigned_fs_vec4_type). This works with both 32-bit and 16-bit types.
3810  * slot_size is the component size in units of 16 bits (2 means 32 bits).
3811  *
3812  * The number of slots to assign can optionally be limited by
3813  * max_assigned_slots.
3814  *
3815  * Return how many 16-bit slots are left unused in the last vec4 (up to 8
3816  * slots).
3817  */
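/* For example: if the last vec4 received three 32-bit scalars (x, y, z),
 * slot_index ends up pointing at w, so 2 unused 16-bit slots (one 32-bit
 * component) are reported.
 */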
3818 static unsigned
3819 fs_assign_slots(struct linkage_info *linkage,
3820                 BITSET_WORD *assigned_mask,
3821                 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
3822                 BITSET_WORD *input_mask,
3823                 enum fs_vec4_type fs_vec4_type,
3824                 unsigned slot_size,
3825                 unsigned max_assigned_slots,
3826                 bool convergent,
3827                 bool assign_colors,
3828                 unsigned color_channel_rotate,
3829                 nir_opt_varyings_progress *progress)
3830 {
3831    unsigned i, slot_index, max_slot;
3832    unsigned num_assigned_slots = 0;
3833 
3834    if (assign_colors) {
3835       slot_index = VARYING_SLOT_COL0 * 8; /* starting slot */
3836       max_slot = VARYING_SLOT_COL1 * 8 + 8;
3837    } else {
3838       slot_index = VARYING_SLOT_VAR0 * 8; /* starting slot */
3839       max_slot = VARYING_SLOT_MAX;
3840    }
3841 
3842    /* Assign new slot indices for scalar slots. */
3843    BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
3844       if (is_interpolated_color(linkage, i) != assign_colors)
3845          continue;
3846 
3847       /* Skip indirectly-indexed scalar slots and slots incompatible
3848        * with the FS vec4 type.
3849        */
3850       while ((fs_vec4_type != FS_VEC4_TYPE_NONE &&
3851               assigned_fs_vec4_type[vec4_slot(slot_index)] !=
3852               FS_VEC4_TYPE_NONE &&
3853               assigned_fs_vec4_type[vec4_slot(slot_index)] !=
3854               fs_vec4_type) ||
3855              BITSET_TEST32(linkage->indirect_mask, slot_index) ||
3856              BITSET_TEST(assigned_mask, slot_index)) {
3857          /* If the FS vec4 type is incompatible, move to the next vec4. */
3858          if (fs_vec4_type != FS_VEC4_TYPE_NONE &&
3859              assigned_fs_vec4_type[vec4_slot(slot_index)] !=
3860              FS_VEC4_TYPE_NONE &&
3861              assigned_fs_vec4_type[vec4_slot(slot_index)] != fs_vec4_type) {
3862             slot_index = align(slot_index + slot_size, 8); /* move to next vec4 */
3863             continue;
3864          }
3865 
3866          /* If the slot is indirectly indexed, copy its FS vec4 type and
3867           * move to the next 32-bit slot.
3868           */
3869          if (BITSET_TEST32(linkage->indirect_mask, slot_index)) {
3870             if (assigned_fs_vec4_type) {
3871                assigned_fs_vec4_type[vec4_slot(slot_index)] =
3872                   linkage->fs_vec4_type[vec4_slot(slot_index)];
3873             }
3874             assert(slot_index % 2 == 0);
3875             slot_index += 2; /* increment by 32 bits */
3876             continue;
3877          }
3878 
3879          /* This slot is already assigned (assigned_mask is set). Move to
3880           * the next one.
3881           */
3882          slot_index += slot_size;
3883       }
3884 
3885       /* Assign color channels in this order, starting
3886        * at the color_channel_rotate component. Cases:
3887        *    color_channel_rotate = 0: xyzw
3888        *    color_channel_rotate = 1: yzwx
3889        *    color_channel_rotate = 2: zwxy
3890        *    color_channel_rotate = 3: wxyz
3891        *
3892        * This has no effect on behavior per se, but some drivers merge VARn
3893        * and COLn into one output if each defines different components.
3894        * For example, if we store VAR0.xy and COL0.z, a driver can merge them
3895        * by mapping the same output to 2 different inputs (VAR0 and COL0) if
3896        * color-specific behavior is per component, but it can't merge VAR0.xy
3897        * and COL0.x because they both define x.
3898        */
3899       unsigned new_slot_index = slot_index;
3900       if (assign_colors && color_channel_rotate) {
3901          new_slot_index = (vec4_slot(new_slot_index)) * 8 +
3902                           (new_slot_index + color_channel_rotate * 2) % 8;
3903       }
3904 
3905       /* Relocate the slot. */
3906       assert(slot_index < max_slot * 8);
3907       relocate_slot(linkage, &linkage->slot[i], i, new_slot_index,
3908                     fs_vec4_type, convergent, progress);
3909 
3910       for (unsigned i = 0; i < slot_size; ++i)
3911          BITSET_SET(assigned_mask, slot_index + i);
3912 
3913       if (assigned_fs_vec4_type)
3914          assigned_fs_vec4_type[vec4_slot(slot_index)] = fs_vec4_type;
3915       slot_index += slot_size; /* move to the next slot */
3916       num_assigned_slots += slot_size;
3917 
3918       /* Remove the slot from the input (unassigned) mask. */
3919       BITSET_CLEAR(input_mask, i);
3920 
3921       /* The number of slots to assign can optionally be limited. */
3922       assert(num_assigned_slots <= max_assigned_slots);
3923       if (num_assigned_slots == max_assigned_slots)
3924          break;
3925    }
3926 
3927    assert(slot_index <= max_slot * 8);
3928    /* Return how many 16-bit slots are left unused in the last vec4. */
3929    return (NUM_SCALAR_SLOTS - slot_index) % 8;
3930 }
3931 
3932 /**
3933  * This is called once for 32-bit inputs and once for 16-bit inputs.
3934  * It assigns new slot indices to all scalar slots specified in the masks.
3935  *
3936  * \param linkage             Linkage info
3937  * \param assigned_mask       Which scalar (16-bit) slots are already taken.
3938  * \param assigned_fs_vec4_type Which vec4 slots have an assigned qualifier
3939  *                              and can only be filled with compatible slots.
3940  * \param interp_mask         The list of interp slots to assign locations for.
3941  * \param flat_mask           The list of flat slots to assign locations for.
3942  * \param convergent_mask     The list of slots that have convergent output
3943  *                            stores.
3944  * \param sized_interp_type   One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}.
3945  * \param slot_size           1 for 16 bits, 2 for 32 bits
3946  * \param color_channel_rotate Assign color channels starting with this index,
3947  *                            e.g. 2 assigns channels in the zwxy order.
3948  * \param assign_colors       Whether to assign only color varyings or only
3949  *                            non-color varyings.
3950  */
3951 static void
3952 fs_assign_slot_groups(struct linkage_info *linkage,
3953                       BITSET_WORD *assigned_mask,
3954                       uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
3955                       BITSET_WORD *interp_mask,
3956                       BITSET_WORD *flat_mask,
3957                       BITSET_WORD *convergent_mask,
3958                       BITSET_WORD *color_interp_mask,
3959                       enum fs_vec4_type sized_interp_type,
3960                       unsigned slot_size,
3961                       bool assign_colors,
3962                       unsigned color_channel_rotate,
3963                       nir_opt_varyings_progress *progress)
3964 {
3965    /* Put interpolated slots first. */
3966    unsigned unused_interp_slots =
3967       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
3968                       interp_mask, sized_interp_type,
3969                       slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
3970                       color_channel_rotate, progress);
3971 
3972    unsigned unused_color_interp_slots = 0;
3973    if (color_interp_mask) {
3974       unused_color_interp_slots =
3975          fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
3976                          color_interp_mask, FS_VEC4_TYPE_INTERP_COLOR,
3977                          slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
3978                          color_channel_rotate, progress);
3979    }
3980 
3981    /* Put flat slots next.
3982     * Note that only flat vec4 slots can have both 32-bit and 16-bit types
3983     * packed in the same vec4. 32-bit flat inputs are packed first, followed
3984     * by 16-bit flat inputs.
3985     */
3986    unsigned unused_flat_slots =
3987       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
3988                       flat_mask, FS_VEC4_TYPE_FLAT,
3989                       slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
3990                       color_channel_rotate, progress);
3991 
3992    /* Take the inputs with convergent values and assign them as follows.
3993     * Since they can be assigned as both interpolated and flat, we can
3994     * choose. We prefer them to be flat, but if interpolated vec4s have
3995     * unused components, try to fill those before starting a new flat vec4.
3996     *
3997     * First, fill the unused components of flat (if any), then fill
3998     * the unused components of interpolated (if any), and then make
3999     * the remaining convergent inputs flat.
4000     */
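   /* For example: if the flat vec4s end with one unused component and the
    * interpolated vec4s end with two, the first convergent input fills the
    * flat leftover, the next two fill the interpolated leftovers, and any
    * remaining convergent inputs start new flat vec4s.
    */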
4001    if (unused_flat_slots) {
4002       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4003                       convergent_mask, FS_VEC4_TYPE_FLAT,
4004                       slot_size, unused_flat_slots, true, assign_colors,
4005                       color_channel_rotate, progress);
4006    }
4007    if (unused_interp_slots) {
4008       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4009                       convergent_mask, sized_interp_type,
4010                       slot_size, unused_interp_slots, true, assign_colors,
4011                       color_channel_rotate, progress);
4012    }
4013    if (unused_color_interp_slots) {
4014       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4015                       convergent_mask, FS_VEC4_TYPE_INTERP_COLOR,
4016                       slot_size, unused_color_interp_slots, true, assign_colors,
4017                       color_channel_rotate, progress);
4018    }
4019    fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4020                    convergent_mask, FS_VEC4_TYPE_FLAT,
4021                    slot_size, NUM_SCALAR_SLOTS, true, assign_colors,
4022                    color_channel_rotate, progress);
4023 }
4024 
4025 static void
4026 vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage,
4027                            BITSET_WORD *input_mask,
4028                            unsigned *slot_index,
4029                            unsigned *patch_slot_index,
4030                            unsigned slot_size,
4031                            nir_opt_varyings_progress *progress)
4032 {
4033    unsigned i;
4034 
4035    BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
4036       if (i >= VARYING_SLOT_PATCH0 * 8 && i < VARYING_SLOT_TESS_MAX * 8) {
4037          /* Skip indirectly-indexed scalar slots at 32-bit granularity.
4038           * We have to do it at this granularity because the low 16-bit
4039           * slot is set to 1 for 32-bit inputs but not the high 16-bit slot.
4040           */
4041          while (BITSET_TEST32(linkage->indirect_mask, *patch_slot_index))
4042             *patch_slot_index = align(*patch_slot_index + 1, 2);
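         /* align(x + 1, 2) jumps past both 16-bit halves, i.e. past the
          * whole 32-bit slot.
          */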
4043 
4044          assert(*patch_slot_index < VARYING_SLOT_TESS_MAX * 8);
4045          relocate_slot(linkage, &linkage->slot[i], i, *patch_slot_index,
4046                        FS_VEC4_TYPE_NONE, false, progress);
4047          *patch_slot_index += slot_size; /* increment by 16 or 32 bits */
4048       } else {
4049          /* If the driver wants to use POS and we've already used it, move
4050           * to VARn.
4051           */
4052          if (*slot_index < VARYING_SLOT_VAR0 &&
4053              *slot_index >= VARYING_SLOT_POS + 8)
4054             *slot_index = VARYING_SLOT_VAR0 * 8;
4055 
4056          /* Skip indirectly-indexed scalar slots at 32-bit granularity. */
4057          while (BITSET_TEST32(linkage->indirect_mask, *slot_index))
4058             *slot_index = align(*slot_index + 1, 2);
4059 
4060          assert(*slot_index < VARYING_SLOT_MAX * 8);
4061          relocate_slot(linkage, &linkage->slot[i], i, *slot_index,
4062                        FS_VEC4_TYPE_NONE, false, progress);
4063          *slot_index += slot_size; /* increment by 16 or 32 bits */
4064       }
4065    }
4066 }
4067 
4068 /**
4069  * Compaction means scalarizing and then packing scalar components into full
4070  * vec4s, so that we minimize the number of unused components in vec4 slots.
4071  *
4072  * Compaction is as simple as moving a scalar input from one scalar slot
4073  * to another. Indirectly-indexed slots are not touched, so the compaction
4074  * has to compact around them. Unused 32-bit components of indirectly-indexed
4075  * slots are still filled, so no space is wasted there, but if indirectly-
4076  * indexed 16-bit components have the other 16-bit half unused, that half is
4077  * wasted.
4078  */
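/* For example: three interpolated FP32 scalars that originally occupied
 * VAR0.x, VAR1.z, and VAR3.y can end up packed together as VAR0.xyz.
 */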
4079 static void
4080 compact_varyings(struct linkage_info *linkage,
4081                  nir_opt_varyings_progress *progress)
4082 {
4083    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
4084       /* These arrays are used to track which scalar slots we've already
4085        * assigned. We can fill unused components of indirectly-indexed slots,
4086        * but only if the vec4 slot type (FLAT, FP16, or FP32) is the same.
4087        * Assign vec4 slot type separately, skipping over already assigned
4088        * scalar slots.
4089        */
4090       uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS] = {0};
4091       BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS);
4092       BITSET_ZERO(assigned_mask);
4093 
4094       fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4095                             linkage->interp_fp32_mask, linkage->flat32_mask,
4096                             linkage->convergent32_mask, NULL,
4097                             FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress);
4098 
4099       /* Now do the same thing, but for 16-bit inputs. */
4100       fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4101                             linkage->interp_fp16_mask, linkage->flat16_mask,
4102                             linkage->convergent16_mask, NULL,
4103                             FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress);
4104 
4105       /* Assign INTERP_MODE_EXPLICIT. Both FP32 and FP16 can occupy the same
4106        * slot because the vertex data is passed to FS as-is.
4107        */
4108       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4109                       linkage->interp_explicit32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4110                       2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4111 
4112       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4113                       linkage->interp_explicit16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4114                       1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4115 
4116       /* Same for strict vertex ordering. */
4117       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4118                       linkage->interp_explicit_strict32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4119                       2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4120 
4121       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4122                       linkage->interp_explicit_strict16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4123                       1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4124 
4125       /* Same for per-primitive. */
4126       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4127                       linkage->per_primitive32_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4128                       2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4129 
4130       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4131                       linkage->per_primitive16_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4132                       1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4133 
4134       /* Put transform-feedback-only outputs last. */
4135       fs_assign_slots(linkage, assigned_mask, NULL,
4136                       linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4137                       NUM_SCALAR_SLOTS, false, false, 0, progress);
4138 
4139       fs_assign_slots(linkage, assigned_mask, NULL,
4140                       linkage->xfb16_only_mask, FS_VEC4_TYPE_NONE, 1,
4141                       NUM_SCALAR_SLOTS, false, false, 0, progress);
4142 
4143       /* Color varyings are only compacted among themselves. */
4144       /* Determine whether the shader contains any color varyings. */
4145       unsigned col0 = VARYING_SLOT_COL0 * 8;
4146       bool has_colors =
4147          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_mask, col0, 16,
4148                                         0) ||
4149          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->convergent32_mask, col0, 16,
4150                                         0) ||
4151          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_mask, col0, 16, 0) ||
4152          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->flat32_mask, col0, 16, 0) ||
4153          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0, 16, 0);
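      /* COL0 and COL1 are adjacent, so the 16 scalar (16-bit) slots starting
       * at col0 cover both color vec4s.
       */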
4154 
4155       if (has_colors) {
4156          unsigned color_channel_rotate =
4157             DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4;
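         /* This makes color assignment start right after the last assigned
          * component. For example, if the last assigned scalar slot falls in
          * the y component of its vec4, the rotation is 2 and COL0 starts at
          * z, so a driver can merge that vec4 with COL0 as described in
          * fs_assign_slots().
          */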
4158 
4159          fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4160                                linkage->interp_fp32_mask, linkage->flat32_mask,
4161                                linkage->convergent32_mask, linkage->color32_mask,
4162                                FS_VEC4_TYPE_INTERP_FP32, 2, true,
4163                                color_channel_rotate, progress);
4164 
4165          /* Put transform-feedback-only outputs last. */
4166          fs_assign_slots(linkage, assigned_mask, NULL,
4167                          linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4168                          NUM_SCALAR_SLOTS, false, true, color_channel_rotate,
4169                          progress);
4170       }
4171    } else {
4172       /* The consumer is a TCS, TES, or GS.
4173        *
4174        * "use_pos" says whether the driver prefers that compaction with non-FS
4175        * consumers puts varyings into POS first before using any VARn.
4176        */
4177       bool use_pos = !(linkage->producer_builder.shader->options->io_options &
4178                        nir_io_dont_use_pos_for_non_fs_varyings);
4179       unsigned slot_index = (use_pos ? VARYING_SLOT_POS
4180                                      : VARYING_SLOT_VAR0) * 8;
4181       unsigned patch_slot_index = VARYING_SLOT_PATCH0 * 8;
4182 
4183       /* Compact 32-bit inputs. */
4184       vs_tcs_tes_gs_assign_slots(linkage, linkage->flat32_mask, &slot_index,
4185                                  &patch_slot_index, 2, progress);
4186 
4187       /* Compact 16-bit inputs, allowing them to share vec4 slots with 32-bit
4188        * inputs.
4189        */
4190       vs_tcs_tes_gs_assign_slots(linkage, linkage->flat16_mask, &slot_index,
4191                                  &patch_slot_index, 1, progress);
4192 
4193       /* Put no-varying slots last. These are TCS outputs read by TCS but not
4194        * TES.
4195        */
4196       vs_tcs_tes_gs_assign_slots(linkage, linkage->no_varying32_mask, &slot_index,
4197                                  &patch_slot_index, 2, progress);
4198       vs_tcs_tes_gs_assign_slots(linkage, linkage->no_varying16_mask, &slot_index,
4199                                  &patch_slot_index, 1, progress);
4200 
4201       assert(slot_index <= VARYING_SLOT_MAX * 8);
4202       assert(patch_slot_index <= VARYING_SLOT_TESS_MAX * 8);
4203    }
4204 }
4205 
4206 /******************************************************************
4207  * PUTTING IT ALL TOGETHER
4208  ******************************************************************/
4209 
4210 static void
4211 init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv,
4212              unsigned max_uniform_components, unsigned max_ubos_per_stage,
4213              struct linkage_info *linkage)
4214 {
4215    *linkage = (struct linkage_info){
4216       .spirv = spirv,
4217       .can_mix_convergent_flat_with_interpolated =
4218          consumer->info.stage == MESA_SHADER_FRAGMENT &&
4219          consumer->options->io_options &
4220          nir_io_mix_convergent_flat_with_interpolated,
4221       .producer_stage = producer->info.stage,
4222       .consumer_stage = consumer->info.stage,
4223       .producer_builder =
4224          nir_builder_create(nir_shader_get_entrypoint(producer)),
4225       .consumer_builder =
4226          nir_builder_create(nir_shader_get_entrypoint(consumer)),
4227 
4228       .max_varying_expression_cost =
4229          producer->options->varying_expression_max_cost ?
4230          producer->options->varying_expression_max_cost(producer, consumer) : 0,
4231 
4232       .linear_mem_ctx = linear_context(ralloc_context(NULL)),
4233    };
4234 
4235    for (unsigned i = 0; i < ARRAY_SIZE(linkage->slot); i++) {
4236       list_inithead(&linkage->slot[i].producer.loads);
4237       list_inithead(&linkage->slot[i].producer.stores);
4238       list_inithead(&linkage->slot[i].consumer.loads);
4239    }
4240 
4241    /* Preparation. */
4242    nir_shader_intrinsics_pass(consumer, gather_inputs, 0, linkage);
4243    nir_shader_intrinsics_pass(producer, gather_outputs, 0, linkage);
4244    tidy_up_indirect_varyings(linkage);
4245    determine_uniform_movability(linkage, max_uniform_components);
4246    determine_ubo_movability(linkage, max_ubos_per_stage);
4247 }
4248 
4249 static void
4250 free_linkage(struct linkage_info *linkage)
4251 {
4252    ralloc_free(ralloc_parent_of_linear_context(linkage->linear_mem_ctx));
4253 }
4254 
4255 static void
4256 print_shader_linkage(nir_shader *producer, nir_shader *consumer)
4257 {
4258    struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
4259 
4260    init_linkage(producer, consumer, false, 0, 0, linkage);
4261    print_linkage(linkage);
4262    free_linkage(linkage);
4263    FREE(linkage);
4264 }
4265 
4266 /**
4267  * Run lots of optimizations on varyings. See the description at the beginning
4268  * of this file.
4269  */
4270 nir_opt_varyings_progress
4271 nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
4272                  unsigned max_uniform_components, unsigned max_ubos_per_stage)
4273 {
4274    /* Task -> Mesh I/O uses payload variables and not varying slots,
4275     * so this pass can't do anything about it.
4276     */
4277    if (producer->info.stage == MESA_SHADER_TASK)
4278       return 0;
4279 
4280    nir_opt_varyings_progress progress = 0;
4281    struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
4282    if (linkage == NULL)
4283       return 0;
4284 
4285    /* Producers before a fragment shader must have up-to-date vertex
4286     * divergence information.
4287     */
4288    if (consumer->info.stage == MESA_SHADER_FRAGMENT) {
4289       /* Required by the divergence analysis. */
4290       NIR_PASS(_, producer, nir_convert_to_lcssa, true, true);
4291       nir_vertex_divergence_analysis(producer);
4292    }
4293 
4294    init_linkage(producer, consumer, spirv, max_uniform_components,
4295                 max_ubos_per_stage, linkage);
4296 
4297    /* Part 1: Run optimizations that only remove varyings (though they can
4298     * also move instructions between shaders).
4299     */
4300    remove_dead_varyings(linkage, &progress);
4301    propagate_uniform_expressions(linkage, &progress);
4302 
4303    /* Part 2: Deduplicate outputs. */
4304    deduplicate_outputs(linkage, &progress);
4305 
4306    /* Run CSE on the consumer after output deduplication because duplicated
4307     * loads can prevent finding the post-dominator for inter-shader code
4308     * motion.
4309     */
4310    NIR_PASS(_, consumer, nir_opt_cse);
4311 
4312    /* Re-gather linkage info after CSE. */
4313    free_linkage(linkage);
4314    init_linkage(producer, consumer, spirv, max_uniform_components,
4315                 max_ubos_per_stage, linkage);
4316    /* This must be done again to clean up bitmasks in linkage. */
4317    remove_dead_varyings(linkage, &progress);
4318 
4319    /* This must be done after deduplication and before inter-shader code
4320     * motion.
4321     */
4322    tidy_up_convergent_varyings(linkage);
4323    find_open_coded_tes_input_interpolation(linkage);
4324 
4325    /* Part 3: Run optimizations that completely change varyings. */
4326 #if PRINT
4327    int i = 0;
4328    puts("Before:");
4329    nir_print_shader(linkage->producer_builder.shader, stdout);
4330    nir_print_shader(linkage->consumer_builder.shader, stdout);
4331    print_linkage(linkage);
4332    puts("");
4333 #endif
4334 
4335    while (backward_inter_shader_code_motion(linkage, &progress)) {
4336 #if PRINT
4337       i++;
4338       printf("Finished: %i\n", i);
4339       nir_print_shader(linkage->producer_builder.shader, stdout);
4340       nir_print_shader(linkage->consumer_builder.shader, stdout);
4341       print_linkage(linkage);
4342       puts("");
4343 #endif
4344    }
4345 
4346    /* Part 4: Do compaction. */
4347    compact_varyings(linkage, &progress);
4348 
4349    nir_metadata_preserve(linkage->producer_builder.impl,
4350                          progress & nir_progress_producer ?
4351                             (nir_metadata_control_flow) :
4352                             nir_metadata_all);
4353    nir_metadata_preserve(linkage->consumer_builder.impl,
4354                          progress & nir_progress_consumer ?
4355                             (nir_metadata_control_flow) :
4356                             nir_metadata_all);
4357    free_linkage(linkage);
4358    FREE(linkage);
4359 
4360    if (progress & nir_progress_producer)
4361       nir_validate_shader(producer, "nir_opt_varyings");
4362    if (progress & nir_progress_consumer)
4363       nir_validate_shader(consumer, "nir_opt_varyings");
4364 
4365    return progress;
4366 }
4367