xref: /aosp_15_r20/external/mesa3d/docs/drivers/anv.rst (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1ANV
2===
3
4Experimental features
5---------------------
6
7.. _`Bindless model`:
8
9Binding Model
10-------------
11
Here is the ANV bindless binding model that was implemented for the
descriptor indexing feature of Vulkan 1.2:
14
15.. graphviz::
16
17  digraph G {
18    fontcolor="black";
19    compound=true;
20
21    subgraph cluster_1 {
22      label = "Binding Table (HW)";
23
24      bgcolor="cornflowerblue";
25
26      node [ style=filled,shape="record",fillcolor="white",
27             label="RT0"    ] n0;
28      node [ label="RT1"    ] n1;
29      node [ label="dynbuf0"] n2;
30      node [ label="set0"   ] n3;
31      node [ label="set1"   ] n4;
32      node [ label="set2"   ] n5;
33
34      n0 -> n1 -> n2 -> n3 -> n4 -> n5 [style=invis];
35    }
36    subgraph cluster_2 {
37      label = "Descriptor Set 0";
38
39      bgcolor="burlywood3";
40      fixedsize = true;
41
42      node [ style=filled,shape="record",fillcolor="white", fixedsize = true, width=4,
43             label="binding 0 - STORAGE_IMAGE\n anv_storage_image_descriptor"          ] n8;
44      node [ label="binding 1 - COMBINED_IMAGE_SAMPLER\n anv_sampled_image_descriptor" ] n9;
45      node [ label="binding 2 - UNIFORM_BUFFER\n anv_address_range_descriptor"         ] n10;
46      node [ label="binding 3 - UNIFORM_TEXEL_BUFFER\n anv_storage_image_descriptor"   ] n11;
47
48      n8 -> n9 -> n10 -> n11 [style=invis];
49    }
50    subgraph cluster_5 {
51      label = "Vulkan Objects"
52
53      fontcolor="black";
54      bgcolor="darkolivegreen4";
55
56      subgraph cluster_6 {
57        label = "VkImageView";
58
59        bgcolor=darkolivegreen3;
60        node [ style=filled,shape="box",fillcolor="white", fixedsize = true, width=2,
61               label="surface_state" ] n12;
62      }
63      subgraph cluster_7 {
64        label = "VkSampler";
65
66        bgcolor=darkolivegreen3;
67        node [ style=filled,shape="box",fillcolor="white", fixedsize = true, width=2,
68               label="sample_state" ] n13;
69      }
70      subgraph cluster_8 {
71        label = "VkImageView";
72        bgcolor="darkolivegreen3";
73
74        node [ style=filled,shape="box",fillcolor="white", fixedsize = true, width=2,
75               label="surface_state" ] n14;
76      }
77      subgraph cluster_9 {
78        label = "VkBuffer";
79        bgcolor=darkolivegreen3;
80
81        node [ style=filled,shape="box",fillcolor="white", fixedsize = true, width=2,
82               label="address" ] n15;
83      }
84      subgraph cluster_10 {
85        label = "VkBufferView";
86
87        bgcolor=darkolivegreen3;
88        node [ style=filled,shape="box",fillcolor="white", fixedsize = true, width=2,
89               label="surface_state" ] n16;
90      }
91
92      n12 -> n13 -> n14 -> n15 -> n16 [style=invis];
93    }
94
95    subgraph cluster_11 {
96      subgraph cluster_12 {
97        label = "CommandBuffer state stream";
98
99        bgcolor="gold3";
100        node [ style=filled,shape="box",fillcolor="white", fixedsize = true, width=2,
101               label="surface_state" ] n17;
102        node [ label="surface_state" ] n18;
103        node [ label="surface_state" ] n19;
104
105        n17 -> n18 -> n19 [style=invis];
106      }
107    }
108
109    n3  -> n8 [lhead=cluster_2];
110
111    n8  -> n12;
112    n9  -> n13;
113    n9  -> n14;
114    n10 -> n15;
115    n11 -> n16;
116
117    n0 -> n17;
118    n1 -> n18;
119    n2 -> n19;
120  }
121
122
123
The HW binding table is generated when the draw or dispatch commands
are emitted. Here are the types of entries one can find in the binding
table:
127
128- The currently bound descriptor sets, one entry per descriptor set
129  (our limit is 8).
130
131- For dynamic buffers, one entry per dynamic buffer.
132
133- For draw commands, render target entries if needed.
134
The entries of the HW binding table for descriptor sets are
RENDER_SURFACE_STATE similar to what you would have for a normal
uniform buffer. The shader will emit reads of this buffer first to get
the information it needs to access a surface/sampler/etc. and then
emit the appropriate message using the information gathered from the
descriptor set buffer.
141
142Each binding type entry gets an associated structure in memory
143(``anv_storage_image_descriptor``, ``anv_sampled_image_descriptor``,
144``anv_address_range_descriptor``, ``anv_storage_image_descriptor``).
145This is the information read by the shader.
146
147
148.. _`Binding tables`:
149
150Binding Tables
151--------------
152
Binding tables are arrays of 32-bit offset entries referencing surface
states. This is how shaders can refer to a binding table entry to read
or write a surface. For example, fragment shaders will often refer to
entry 0 as the first render target.
157
158The way binding tables are managed is fairly awkward.
159
160Each shader stage must have its binding table programmed through
161a corresponding instruction
162``3DSTATE_BINDING_TABLE_POINTERS_*`` (each stage has its own).
163
164.. graphviz::
165
166  digraph structs {
167    node [shape=record];
168    struct3 [label="{ binding tables&#92;n area | { <bt4> BT4 | <bt3> BT3 | ... | <bt0> BT0 } }|{ surface state&#92;n area |{<ss0> ss0|<ss1> ss1|<ss2> ss2|...}}"];
169    struct3:bt0 -> struct3:ss0;
170    struct3:bt0 -> struct3:ss1;
171  }
172
173
174The value programmed in the ``3DSTATE_BINDING_TABLE_POINTERS_*``
175instructions is not a 64bit pointer but an offset from the address
176programmed in ``STATE_BASE_ADDRESS::Surface State Base Address`` or
177``3DSTATE_BINDING_TABLE_POOL_ALLOC::Binding Table Pool Base Address``
178(available on Gfx11+). The offset value in
179``3DSTATE_BINDING_TABLE_POINTERS_*`` is also limited to a few bits
180(not a full 32bit value), meaning that as we use more and more binding
181tables we need to reposition ``STATE_BASE_ADDRESS::Surface State Base
182Address`` to make space for new binding table arrays.
183
To make things even more awkward, the binding table entries are also
relative to ``STATE_BASE_ADDRESS::Surface State Base Address``, so as
we change ``STATE_BASE_ADDRESS::Surface State Base Address`` we need
to add that offset to the binding table entries.
188
The way we deal with this is that we allocate 4GB of address space
(since the binding table entries can address 4GB of surface state
elements). We reserve the first gigabyte exclusively for binding
tables, so that anywhere we position our binding table in that first
gigabyte, it can always refer to the surface states in the next 3GB.
194
195
196.. _`Descriptor Set Memory Layout`:
197
198Descriptor Set Memory Layout
199----------------------------
200
Here is a representation of how the descriptor set bindings, with each
element in each binding, are mapped to the descriptor set memory:
203
204.. graphviz::
205
206  digraph structs {
207    node [shape=record];
208    rankdir=LR;
209
210    struct1 [label="Descriptor Set | \
211                    <b0> binding 0\n STORAGE_IMAGE \n (array_length=3) | \
212                    <b1> binding 1\n COMBINED_IMAGE_SAMPLER \n (array_length=2) | \
213                    <b2> binding 2\n UNIFORM_BUFFER \n (array_length=1) | \
214                    <b3> binding 3\n UNIFORM_TEXEL_BUFFER \n (array_length=1)"];
215    struct2 [label="Descriptor Set Memory | \
216                    <b0e0> anv_storage_image_descriptor|\
217                    <b0e1> anv_storage_image_descriptor|\
218                    <b0e2> anv_storage_image_descriptor|\
219                    <b1e0> anv_sampled_image_descriptor|\
220                    <b1e1> anv_sampled_image_descriptor|\
221                    <b2e0> anv_address_range_descriptor|\
222                    <b3e0> anv_storage_image_descriptor"];
223
224    struct1:b0 -> struct2:b0e0;
225    struct1:b0 -> struct2:b0e1;
226    struct1:b0 -> struct2:b0e2;
227    struct1:b1 -> struct2:b1e0;
228    struct1:b1 -> struct2:b1e1;
229    struct1:b2 -> struct2:b2e0;
230    struct1:b3 -> struct2:b3e0;
231  }
232
Each binding in the descriptor set is allocated an array of
``anv_*_descriptor`` data structures. The type of ``anv_*_descriptor``
used for a binding is selected based on the ``VkDescriptorType`` of
the binding.
237
238The value of ``anv_descriptor_set_binding_layout::descriptor_offset``
239is a byte offset from the descriptor set memory to the associated
240binding. ``anv_descriptor_set_binding_layout::array_size`` is the
241number of ``anv_*_descriptor`` elements in the descriptor set memory
242from that offset for the binding.
243
244
245Pipeline state emission
246-----------------------
247
248Vulkan initially started by baking as much state as possible in
249pipelines. But extension after extension, more and more state has
250become potentially dynamic.
251
ANV tries to limit the number of times an instruction has to be packed
to reprogram part of the 3D pipeline state. The packing happens in
2 places:
255
256- ``genX_pipeline.c`` where the non dynamic state is emitted in the
257  pipeline batch. Chunks of the batches are copied into the command
258  buffer as a result of calling ``vkCmdBindPipeline()``, depending on
259  what changes from the previously bound graphics pipeline
260
261- ``genX_gfx_state.c`` where the dynamic state is added to already
262  packed instructions from ``genX_pipeline.c``
263
The rule to know where to emit an instruction programming the 3D
pipeline is as follows:
266
267- If any field of the instruction can be made dynamic, it should be
268  emitted in ``genX_gfx_state.c``
269
270- Otherwise, the instruction can be emitted in ``genX_pipeline.c``
271
When a piece of state programming is dynamic, it should have a
corresponding field in ``anv_gfx_dynamic_state`` and the
``genX(cmd_buffer_flush_gfx_runtime_state)`` function should be
updated to ensure we minimize the number of times an instruction is
emitted. Each instruction should have an associated
``ANV_GFX_STATE_*`` mask so that the dynamic emission code can tell
when to re-emit an instruction.
279
280
281Generated indirect draws optimization
282-------------------------------------
283
284Indirect draws have traditionally been implemented on Intel HW by
285loading the indirect parameters from memory into HW registers using
286the command streamer's ``MI_LOAD_REGISTER_MEM`` instruction before
287dispatching a draw call to the 3D pipeline.
288
On recent products, it was found that the command streamer is showing
up as a performance bottleneck, because it cannot dispatch draw calls
fast enough to keep the 3D pipeline busy.
292
The solution to this problem is to change the way we deal with
indirect draws. Instead of loading HW registers with values using the
command streamer, we generate an entire set of ``3DPRIMITIVE``
instructions using a shader. The generated instructions contain the
entire draw call parameters. This way the command streamer executes
only ``3DPRIMITIVE`` instructions and doesn't do any data loading from
memory or touch HW registers, feeding the 3D pipeline as fast as it
can.
301
In ANV this is implemented in 2 different ways:
303
By generating instructions directly into the command stream using a
side batch buffer. When ANV encounters the first indirect draw, it
generates a jump into the side batch; the side batch contains a draw
call using a generation shader for each indirect draw. We keep adding
more generation draws into the batch until we have to stop due to
command buffer end, secondary command buffer calls or a barrier
containing the access flag ``VK_ACCESS_INDIRECT_COMMAND_READ_BIT``.
The side batch buffer jumps back right after the instruction where it
was called. Here is a high level diagram showing how the generation
batch buffer writes in the main command buffer:
314
315.. graphviz::
316
317  digraph commands_mode {
318    rankdir = "LR"
319    "main-command-buffer" [
320      label = "main command buffer|...|draw indirect0 start|<f0>jump to\ngeneration batch|<f1>|<f2>empty instruction0|<f3>empty instruction1|...|draw indirect0 end|...|draw indirect1 start|<f4>empty instruction0|<f5>empty instruction1|...|<f6>draw indirect1 end|..."
321      shape = "record"
322    ];
323    "generation-command-buffer" [
324      label = "generation command buffer|<f0>|<f1>write draw indirect0|<f2>write draw indirect1|...|<f3>exit jump"
325      shape = "record"
326    ];
327    "main-command-buffer":f0 -> "generation-command-buffer":f0;
328    "generation-command-buffer":f1 -> "main-command-buffer":f2 [color="#0000ff"];
329    "generation-command-buffer":f1 -> "main-command-buffer":f3 [color="#0000ff"];
330    "generation-command-buffer":f2 -> "main-command-buffer":f4 [color="#0000ff"];
331    "generation-command-buffer":f2 -> "main-command-buffer":f5 [color="#0000ff"];
332    "generation-command-buffer":f3 -> "main-command-buffer":f1;
333  }
334
335By generating instructions into a ring buffer of commands, when the
336draw count number is high. This solution allows smaller batches to be
337emitted. Here is a high level diagram showing how things are
338executed :
339
340.. graphviz::
341
342  digraph ring_mode {
343    rankdir=LR;
344    "main-command-buffer" [
345      label = "main command buffer|...| draw indirect |<f1>generation shader|<f2> jump to ring|<f3> increment\ndraw_base|<f4>..."
346      shape = "record"
347    ];
348    "ring-buffer" [
349      label = "ring buffer|<f0>generated draw0|<f1>generated draw1|<f2>generated draw2|...|<f3>exit jump"
350      shape = "record"
351    ];
352    "main-command-buffer":f2 -> "ring-buffer":f0;
353    "ring-buffer":f3 -> "main-command-buffer":f3;
354    "ring-buffer":f3 -> "main-command-buffer":f4;
355    "main-command-buffer":f3 -> "main-command-buffer":f1;
356    "main-command-buffer":f1 -> "ring-buffer":f1 [color="#0000ff"];
357    "main-command-buffer":f1 -> "ring-buffer":f2 [color="#0000ff"];
358  }
359
360Runtime dependencies
361--------------------
362
363Starting with Intel 12th generation/Alder Lake-P and Intel Arc Alchemist, the Intel 3D driver stack requires GuC firmware for proper operation. You have two options to install the firmware:
364
365- Distro package: Install the pre-packaged firmware included in your Linux distribution's repositories.
366- Manual download: You can download the firmware from the official repository: https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git/tree/i915. Place the downloaded files in the /lib/firmware/i915 directory.
367
368Important: For optimal performance, we recommend updating the GuC firmware to version 70.6.3 or later.