xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/genX_cmd_buffer.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31 
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34 
35 #include "ds/intel_tracepoints.h"
36 
37 #include "genX_mi_builder.h"
38 #include "genX_cmd_draw_generated_flush.h"
39 
40 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
41                                         uint32_t pipeline);
42 
43 static enum anv_pipe_bits
44 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
45    enum anv_pipe_bits bits = 0;
46    bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
47    bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
48 #if GFX_VERx10 >= 125
49    bits |= (pc->PSSStallSyncEnable) ?  ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
50 #endif
51 #if GFX_VER == 12
52    bits |= (pc->TileCacheFlushEnable) ?  ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
53 #endif
54 #if GFX_VER >= 12
55    bits |= (pc->HDCPipelineFlushEnable) ?  ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
56 #endif
57    bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
58    bits |= (pc->VFCacheInvalidationEnable) ?  ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
59    bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
60    bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
61    bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
62    bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
63    bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
64    bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
65    bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
66 #if GFX_VERx10 == 125
67    bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
68    bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
69 #endif
70    return bits;
71 }
72 
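/* Debug helper: when the pipe-control flag (DEBUG_PIPE_CONTROL) is enabled in
 * INTEL_DEBUG, this prints the decoded PIPE_CONTROL bits together with the
 * reason string supplied by the caller; otherwise it is a no-op.
 */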
73 #define anv_debug_dump_pc(pc, reason) \
74    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
75       fputs("pc: emit PC=( ", stdout); \
76       anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout);   \
77       fprintf(stdout, ") reason: %s\n", reason); \
78    }
79 
80 static inline void
81 fill_state_base_addr(struct anv_cmd_buffer *cmd_buffer,
82                      struct GENX(STATE_BASE_ADDRESS) *sba)
83 {
84    struct anv_device *device = cmd_buffer->device;
85    const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
86 
87    /* If no API entry point has selected the current mode (this can happen if
88     * the first operation in the command buffer does not bind any descriptors),
89     * select BUFFER if EXT_descriptor_buffer is enabled, otherwise LEGACY.
90     */
91    if (cmd_buffer->state.pending_db_mode ==
92        ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN) {
93       cmd_buffer->state.pending_db_mode =
94          cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer ?
95          ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
96          ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
97    }
98 
99    *sba = (struct GENX(STATE_BASE_ADDRESS)) { GENX(STATE_BASE_ADDRESS_header), };
100 
101    sba->GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
102    sba->GeneralStateMOCS = mocs;
103    sba->GeneralStateBufferSize = 0xfffff;
104    sba->GeneralStateBaseAddressModifyEnable = true;
105    sba->GeneralStateBufferSizeModifyEnable = true;
106 
107 #if GFX_VERx10 == 120
108    /* Since DG2, scratch surfaces have their own surface state with their
109     * own MOCS setting, but prior to that, the MOCS for scratch accesses is
110     * governed by SBA.StatelessDataPortAccessMOCS.
111     */
112    const isl_surf_usage_flags_t protected_usage =
113       cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT ?
114       ISL_SURF_USAGE_PROTECTED_BIT : 0;
115    const uint32_t stateless_mocs = isl_mocs(&device->isl_dev, protected_usage, false);
116 #else
117    const uint32_t stateless_mocs = mocs;
118 #endif
119 
120    sba->StatelessDataPortAccessMOCS = stateless_mocs;
121 
122 #if GFX_VERx10 >= 125
123    sba->SurfaceStateBaseAddress =
124       (struct anv_address) { .offset =
125                              device->physical->va.internal_surface_state_pool.addr,
126    };
127 #else
128    sba->SurfaceStateBaseAddress =
129       anv_cmd_buffer_surface_base_address(cmd_buffer);
130 #endif
131    sba->SurfaceStateMOCS = mocs;
132    sba->SurfaceStateBaseAddressModifyEnable = true;
133 
134    sba->IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
135    sba->IndirectObjectMOCS = mocs;
136    sba->IndirectObjectBufferSize = 0xfffff;
137    sba->IndirectObjectBaseAddressModifyEnable = true;
138    sba->IndirectObjectBufferSizeModifyEnable  = true;
139 
140    sba->InstructionBaseAddress =
141       (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
142    sba->InstructionMOCS = mocs;
143    sba->InstructionBufferSize =
144       device->physical->va.instruction_state_pool.size / 4096;
145    sba->InstructionBaseAddressModifyEnable = true;
146    sba->InstructionBuffersizeModifyEnable = true;
147 
148 #if GFX_VER >= 11
149    sba->BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
150    sba->BindlessSamplerStateBufferSize = 0;
151    sba->BindlessSamplerStateMOCS = mocs;
152    sba->BindlessSamplerStateBaseAddressModifyEnable = true;
153 #endif
154 
155    sba->DynamicStateBaseAddress = (struct anv_address) {
156       .offset = device->physical->va.dynamic_state_pool.addr,
157    };
158    sba->DynamicStateBufferSize =
159       (device->physical->va.dynamic_state_pool.size +
160        device->physical->va.dynamic_visible_pool.size +
161        device->physical->va.push_descriptor_buffer_pool.size) / 4096;
162    sba->DynamicStateMOCS = mocs;
163    sba->DynamicStateBaseAddressModifyEnable = true;
164    sba->DynamicStateBufferSizeModifyEnable = true;
165 
166    if (cmd_buffer->state.pending_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
167 #if GFX_VERx10 >= 125
168       sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
169          .offset = device->physical->va.dynamic_visible_pool.addr,
170       };
171       sba->BindlessSurfaceStateSize =
172          (device->physical->va.dynamic_visible_pool.size +
173           device->physical->va.push_descriptor_buffer_pool.size) - 1;
174       sba->BindlessSurfaceStateMOCS = mocs;
175       sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
176 #else
177       const uint64_t surfaces_addr =
178          cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
179          cmd_buffer->state.descriptor_buffers.surfaces_address :
180          anv_address_physical(device->workaround_address);
181       const uint64_t surfaces_size =
182          cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
183          MIN2(device->physical->va.dynamic_visible_pool.size -
184               (cmd_buffer->state.descriptor_buffers.surfaces_address -
185                device->physical->va.dynamic_visible_pool.addr),
186               anv_physical_device_bindless_heap_size(device->physical, true)) :
187          (device->workaround_bo->size - device->workaround_address.offset);
188       sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
189          .offset = surfaces_addr,
190       };
191       sba->BindlessSurfaceStateSize = surfaces_size / ANV_SURFACE_STATE_SIZE - 1;
192       sba->BindlessSurfaceStateMOCS = mocs;
193       sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
194 #endif /* GFX_VERx10 < 125 */
195    } else if (!device->physical->indirect_descriptors) {
196 #if GFX_VERx10 >= 125
197       sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
198          .offset = device->physical->va.internal_surface_state_pool.addr,
199       };
200       sba->BindlessSurfaceStateSize =
201          (device->physical->va.internal_surface_state_pool.size +
202           device->physical->va.bindless_surface_state_pool.size) - 1;
203       sba->BindlessSurfaceStateMOCS = mocs;
204       sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
205 #else
206       unreachable("Direct descriptor not supported");
207 #endif
208    } else {
209       sba->BindlessSurfaceStateBaseAddress =
210          (struct anv_address) { .offset =
211                                 device->physical->va.bindless_surface_state_pool.addr,
212       };
213       sba->BindlessSurfaceStateSize =
214          anv_physical_device_bindless_heap_size(device->physical, false) /
215          ANV_SURFACE_STATE_SIZE - 1;
216       sba->BindlessSurfaceStateMOCS = mocs;
217       sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
218    }
219 
220 #if GFX_VERx10 >= 125
221    sba->L1CacheControl = L1CC_WB;
222 #endif
223 }
224 
225 void
226 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
227 {
228    if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
229        anv_cmd_buffer_is_video_queue(cmd_buffer))
230       return;
231 
232    struct anv_device *device = cmd_buffer->device;
233 
234    struct GENX(STATE_BASE_ADDRESS) sba = {};
235    fill_state_base_addr(cmd_buffer, &sba);
236 
237 #if GFX_VERx10 >= 125
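   /* Skip re-emitting STATE_BASE_ADDRESS (and the heavyweight flushes around
    * it) when the bindless surface state base address is unchanged. The last
    * programmed value is mirrored in ANV_BINDLESS_SURFACE_BASE_ADDR_REG and
    * refreshed below, right before the goto target.
    */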
238    struct mi_builder b;
239    mi_builder_init(&b, device->info, &cmd_buffer->batch);
240    mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
241    struct mi_goto_target t = MI_GOTO_TARGET_INIT;
242    mi_goto_if(&b,
243               mi_ieq(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
244                          mi_imm(sba.BindlessSurfaceStateBaseAddress.offset)),
245               &t);
246 #endif
247 
248    /* Emit a render target cache flush.
249     *
250     * This isn't documented anywhere in the PRM.  However, it seems to be
251     * necessary prior to changing the surface state base address.  Without
252     * this, we get GPU hangs when using multi-level command buffers which
253     * clear depth, reset state base address, and then go render stuff.
254     */
255    genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
256                                 cmd_buffer->state.current_pipeline,
257 #if GFX_VER >= 12
258                                 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
259 #else
260                                 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
261 #endif
262                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
263                                 ANV_PIPE_CS_STALL_BIT);
264 
265 #if INTEL_NEEDS_WA_1607854226
266    /* Wa_1607854226:
267     *
268     *  Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
269     *  mode by putting the pipeline temporarily in 3D mode.
270     */
271    uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
272    genX(flush_pipeline_select_3d)(cmd_buffer);
273 #endif
274 
275    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), _sba) {
276       _sba = sba;
277    }
278 
279    if (cmd_buffer->state.current_db_mode != cmd_buffer->state.pending_db_mode)
280       cmd_buffer->state.current_db_mode = cmd_buffer->state.pending_db_mode;
281 
282 #if INTEL_NEEDS_WA_1607854226
283    /* Wa_1607854226:
284     *
285     *  Put the pipeline back into its current mode.
286     */
287    if (gfx12_wa_pipeline != UINT32_MAX)
288       genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
289 #endif
290 
291    /* After re-setting the surface state base address, we have to do some
292     * cache flushing so that the sampler engine will pick up the new
293     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
294     * Shared Function > 3D Sampler > State > State Caching (page 96):
295     *
296     *    Coherency with system memory in the state cache, like the texture
297     *    cache is handled partially by software. It is expected that the
298     *    command stream or shader will issue Cache Flush operation or
299     *    Cache_Flush sampler message to ensure that the L1 cache remains
300     *    coherent with system memory.
301     *
302     *    [...]
303     *
304     *    Whenever the value of the Dynamic_State_Base_Addr,
305     *    Surface_State_Base_Addr are altered, the L1 state cache must be
306     *    invalidated to ensure the new surface or sampler state is fetched
307     *    from system memory.
308     *
309     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
310     * which, according the PIPE_CONTROL instruction documentation in the
311     * Broadwell PRM:
312     *
313     *    Setting this bit is independent of any other bit in this packet.
314     *    This bit controls the invalidation of the L1 and L2 state caches
315     *    at the top of the pipe i.e. at the parsing time.
316     *
317     * Unfortunately, experimentation seems to indicate that state cache
318     * invalidation through a PIPE_CONTROL does nothing whatsoever in
319     * regard to surface state and binding tables.  Instead, it seems that
320     * invalidating the texture cache is what is actually needed.
321     *
322     * XXX:  As far as we have been able to determine through
323     * experimentation, flushing the texture cache appears to be
324     * sufficient.  The theory here is that all of the sampling/rendering
325     * units cache the binding table in the texture cache.  However, we have
326     * yet to be able to actually confirm this.
327     *
328     * Wa_14013910100:
329     *
330     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
331     *   or program pipe control with Instruction cache invalidate post
332     *   STATE_BASE_ADDRESS command"
333     */
334    enum anv_pipe_bits bits =
335       ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
336       ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
337       ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
338       (intel_needs_workaround(device->info, 16013000631) ?
339        ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0);
340 
341 #if GFX_VER >= 9 && GFX_VER <= 11
342       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
343        *
344        *    "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
345        *     always set for GPGPU workloads when “Texture Cache Invalidation
346        *     Enable” bit is set".
347        *
348        * Workaround stopped appearing in TGL PRMs.
349        */
350       if (cmd_buffer->state.current_pipeline == GPGPU)
351          bits |= ANV_PIPE_CS_STALL_BIT;
352 #endif
353    genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
354                                 cmd_buffer->state.current_pipeline,
355                                 bits);
356 
357    assert(cmd_buffer->state.current_db_mode !=
358           ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
359 
360 #if GFX_VERx10 >= 125
361    assert(sba.BindlessSurfaceStateBaseAddress.offset != 0);
362    mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
363                 mi_imm(sba.BindlessSurfaceStateBaseAddress.offset));
364 
365    mi_goto_target(&b, &t);
366 #endif
367 
368 #if GFX_VERx10 >= 125
369    genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
370 #endif
371 
372    /* If we have emitted a new state base address we probably need to re-emit
373     * binding tables.
374     */
375    cmd_buffer->state.descriptors_dirty |= ~0;
376 }
377 
378 void
379 genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer)
380 {
381    if (!anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer))
382       return;
383 
384    /* If we are emitting a new state base address we probably need to re-emit
385     * binding tables.
386     */
387    cmd_buffer->state.descriptors_dirty |= ~0;
388 
389 #if GFX_VERx10 >= 125
390    struct anv_device *device = cmd_buffer->device;
391    const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
392 
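   /* The pool reprogramming is bracketed by a CS stall before
    * 3DSTATE_BINDING_TABLE_POOL_ALLOC and a state cache invalidation after it,
    * presumably so that binding table entries cached against the old pool base
    * are not reused.
    */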
393    genx_batch_emit_pipe_control(&cmd_buffer->batch,
394                                 cmd_buffer->device->info,
395                                 cmd_buffer->state.current_pipeline,
396                                 ANV_PIPE_CS_STALL_BIT);
397    anv_batch_emit(
398       &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
399       btpa.BindingTablePoolBaseAddress =
400          anv_cmd_buffer_surface_base_address(cmd_buffer);
401       btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
402       btpa.MOCS = mocs;
403    }
404 
405    genx_batch_emit_pipe_control(&cmd_buffer->batch,
406                                 cmd_buffer->device->info,
407                                 cmd_buffer->state.current_pipeline,
408                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
409 #else /* GFX_VERx10 < 125 */
410    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
411 #endif
412 }
413 
414 static void
415 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
416                   struct anv_address addr)
417 {
418    VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
419                                            addr.bo);
420 
421    if (unlikely(result != VK_SUCCESS))
422       anv_batch_set_error(&cmd_buffer->batch, result);
423 }
424 
425 static void
426 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
427                          const struct anv_surface_state *state)
428 {
429    assert(!anv_address_is_null(state->address));
430    add_surface_reloc(cmd_buffer, state->address);
431 
432    if (!anv_address_is_null(state->aux_address)) {
433       VkResult result =
434          anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
435                                state->aux_address.bo);
436       if (result != VK_SUCCESS)
437          anv_batch_set_error(&cmd_buffer->batch, result);
438    }
439 
440    if (!anv_address_is_null(state->clear_address)) {
441       VkResult result =
442          anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
443                                state->clear_address.bo);
444       if (result != VK_SUCCESS)
445          anv_batch_set_error(&cmd_buffer->batch, result);
446    }
447 }
448 
449 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
450  * the initial layout is undefined, the HiZ buffer and depth buffer will
451  * represent the same data at the end of this operation.
452  */
453 static void
454 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
455                         const struct anv_image *image,
456                         uint32_t base_level, uint32_t level_count,
457                         uint32_t base_layer, uint32_t layer_count,
458                         VkImageLayout initial_layout,
459                         VkImageLayout final_layout,
460                         bool will_full_fast_clear)
461 {
462    const uint32_t depth_plane =
463       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
464    if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
465       return;
466 
467    /* Initialize the indirect clear color prior to first use. */
468    const struct anv_address clear_color_addr =
469       anv_image_get_clear_color_addr(cmd_buffer->device, image,
470                                      VK_IMAGE_ASPECT_DEPTH_BIT);
471    if (!anv_address_is_null(clear_color_addr) &&
472        (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
473         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
474       const enum isl_format depth_format =
475          image->planes[depth_plane].primary_surface.isl.format;
476       const union isl_color_value clear_value =
477          anv_image_hiz_clear_value(image);
478 
479       uint32_t depth_value[4] = {};
480       isl_color_value_pack(&clear_value, depth_format, depth_value);
481 
482       const uint32_t clear_pixel_offset = clear_color_addr.offset +
483          isl_get_sampler_clear_field_offset(cmd_buffer->device->info,
484                                             depth_format);
485       const struct anv_address clear_pixel_addr = {
486          .bo = clear_color_addr.bo,
487          .offset = clear_pixel_offset,
488       };
489 
490       struct mi_builder b;
491       mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
492       mi_builder_set_write_check(&b, true);
493       mi_store(&b, mi_mem32(clear_pixel_addr), mi_imm(depth_value[0]));
494    }
495 
496    /* If will_full_fast_clear is set, the caller promises to fast-clear the
497     * largest portion of the specified range as it can.
498     */
499    if (will_full_fast_clear)
500       return;
501 
502    const enum isl_aux_state initial_state =
503       anv_layout_to_aux_state(cmd_buffer->device->info, image,
504                               VK_IMAGE_ASPECT_DEPTH_BIT,
505                               initial_layout,
506                               cmd_buffer->queue_family->queueFlags);
507    const enum isl_aux_state final_state =
508       anv_layout_to_aux_state(cmd_buffer->device->info, image,
509                               VK_IMAGE_ASPECT_DEPTH_BIT,
510                               final_layout,
511                               cmd_buffer->queue_family->queueFlags);
512 
513    const bool initial_depth_valid =
514       isl_aux_state_has_valid_primary(initial_state);
515    const bool initial_hiz_valid =
516       isl_aux_state_has_valid_aux(initial_state);
517    const bool final_needs_depth =
518       isl_aux_state_has_valid_primary(final_state);
519    const bool final_needs_hiz =
520       isl_aux_state_has_valid_aux(final_state);
521 
522    /* Getting into the pass-through state for Depth is tricky and involves
523     * both a resolve and an ambiguate.  We don't handle that state right now
524     * as anv_layout_to_aux_state never returns it.
525     */
526    assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
527 
528    enum isl_aux_op hiz_op = ISL_AUX_OP_NONE;
529    if (final_needs_depth && !initial_depth_valid) {
530       assert(initial_hiz_valid);
531       hiz_op = ISL_AUX_OP_FULL_RESOLVE;
532    } else if (final_needs_hiz && !initial_hiz_valid) {
533       assert(initial_depth_valid);
534       hiz_op = ISL_AUX_OP_AMBIGUATE;
535    }
536 
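   /* A full resolve writes the HiZ data back into the main depth surface when
    * the target layout requires valid depth; an ambiguate re-initializes HiZ
    * so it is consistent with the existing depth data when the target layout
    * requires valid HiZ.
    */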
537    if (hiz_op != ISL_AUX_OP_NONE) {
538       for (uint32_t l = 0; l < level_count; l++) {
539          const uint32_t level = base_level + l;
540 
541          uint32_t aux_layers =
542             anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level);
543          if (base_layer >= aux_layers)
544             break; /* We will only get fewer layers as level increases */
545          uint32_t level_layer_count =
546             MIN2(layer_count, aux_layers - base_layer);
547 
548          anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
549                           level, base_layer, level_layer_count, hiz_op);
550       }
551    }
552 
553    /* Additional tile cache flush for MTL:
554     *
555     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
556     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
557     */
558    if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
559        image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
560        final_needs_depth && !initial_depth_valid) {
561       anv_add_pending_pipe_bits(cmd_buffer,
562                                 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
563                                 "HIZ-CCS flush");
564    }
565 }
566 
567 /* Transitions a stencil buffer from one layout to another. On Gfx12, coming
568  * from an undefined layout, the stencil buffer is initialized with a stencil
569  * clear (HZ_OP) so that stencil compression via the aux map works correctly.
570  */
571 static void
572 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
573                           const struct anv_image *image,
574                           uint32_t base_level, uint32_t level_count,
575                           uint32_t base_layer, uint32_t layer_count,
576                           VkImageLayout initial_layout,
577                           VkImageLayout final_layout,
578                           bool will_full_fast_clear)
579 {
580 #if GFX_VER == 12
581    const uint32_t plane =
582       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
583    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
584       return;
585 
586    if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
587         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
588        cmd_buffer->device->info->has_aux_map) {
589       /* If will_full_fast_clear is set, the caller promises to fast-clear the
590        * largest portion of the specified range as it can.
591        */
592       if (will_full_fast_clear)
593          return;
594 
595       for (uint32_t l = 0; l < level_count; l++) {
596          const uint32_t level = base_level + l;
597          const VkRect2D clear_rect = {
598             .offset.x = 0,
599             .offset.y = 0,
600             .extent.width = u_minify(image->vk.extent.width, level),
601             .extent.height = u_minify(image->vk.extent.height, level),
602          };
603 
604          uint32_t aux_layers =
605             anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
606 
607          if (base_layer >= aux_layers)
608             break; /* We will only get fewer layers as level increases */
609 
610          uint32_t level_layer_count =
611             MIN2(layer_count, aux_layers - base_layer);
612 
613          /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
614           * Enable:
615           *
616           *    "When enabled, Stencil Buffer needs to be initialized via
617           *    stencil clear (HZ_OP) before any renderpass."
618           */
619          const VkClearDepthStencilValue clear_value = {};
620          anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
621                              level, base_layer, level_layer_count,
622                              clear_rect, &clear_value);
623       }
624    }
625 
626    /* Additional tile cache flush for MTL:
627     *
628     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
629     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
630     */
631    if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
632       anv_add_pending_pipe_bits(cmd_buffer,
633                                 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
634                                 "HIZ-CCS flush");
635    }
636 #endif
637 }
638 
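/* MMIO offsets of the command streamer's MI_PREDICATE source/result
 * registers. They are written with the MI builder below and consumed by the
 * MI_PREDICATE command to gate predicated operations.
 */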
639 #define MI_PREDICATE_SRC0    0x2400
640 #define MI_PREDICATE_SRC1    0x2408
641 #define MI_PREDICATE_RESULT  0x2418
642 
643 static void
644 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
645                          const struct anv_image *image,
646                          VkImageAspectFlagBits aspect,
647                          uint32_t level,
648                          uint32_t base_layer, uint32_t layer_count,
649                          bool compressed)
650 {
651    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
652 
653    /* We only have compression tracking for CCS_E */
654    if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
655       return;
656 
657    struct anv_device *device = cmd_buffer->device;
658    struct mi_builder b;
659    mi_builder_init(&b, device->info, &cmd_buffer->batch);
660    mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
661 
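   /* One dword of compression tracking state is kept per (level, layer):
    * non-zero means the slice may contain compressed data (a resolve may be
    * required), zero means it is known to be resolved.
    */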
662    for (uint32_t a = 0; a < layer_count; a++) {
663       uint32_t layer = base_layer + a;
664       struct anv_address comp_state_addr =
665          anv_image_get_compression_state_addr(device,
666                                               image, aspect,
667                                               level, layer);
668       mi_store(&b, mi_mem32(comp_state_addr),
669                    mi_imm(compressed ? UINT32_MAX : 0));
670    }
671 
672    /* FCV_CCS_E images are automatically fast cleared to default value at
673     * render time. In order to account for this, anv should set the
674     * appropriate fast clear state for level0/layer0.
675     *
676     * At the moment, tracking the fast clear state for higher levels/layers is
677     * neither supported, nor do we enter a situation where it is a concern.
678     */
679    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
680        base_layer == 0 && level == 0) {
681       struct anv_address fc_type_addr =
682          anv_image_get_fast_clear_type_addr(device, image, aspect);
683       mi_store(&b, mi_mem32(fc_type_addr),
684                    mi_imm(ANV_FAST_CLEAR_DEFAULT_VALUE));
685    }
686 }
687 
688 static void
689 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
690                            const struct anv_image *image,
691                            VkImageAspectFlagBits aspect,
692                            enum anv_fast_clear_type fast_clear)
693 {
694    struct anv_device *device = cmd_buffer->device;
695    struct mi_builder b;
696    mi_builder_init(&b, device->info, &cmd_buffer->batch);
697    mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
698 
699    struct anv_address fc_type_addr =
700       anv_image_get_fast_clear_type_addr(device, image, aspect);
701    mi_store(&b, mi_mem32(fc_type_addr), mi_imm(fast_clear));
702 
703    /* Whenever we have fast-clear, we consider that slice to be compressed.
704     * This makes building predicates much easier.
705     */
706    if (fast_clear != ANV_FAST_CLEAR_NONE)
707       set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
708 }
709 
710 /* This is only really practical on haswell and above because it requires
711  * MI math in order to get it correct.
712  */
713 static void
714 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
715                                   const struct anv_image *image,
716                                   VkImageAspectFlagBits aspect,
717                                   uint32_t level, uint32_t array_layer,
718                                   enum isl_aux_op resolve_op,
719                                   enum anv_fast_clear_type fast_clear_supported)
720 {
721    struct anv_device *device = cmd_buffer->device;
722    struct anv_address addr =
723       anv_image_get_fast_clear_type_addr(device, image, aspect);
724    struct mi_builder b;
725    mi_builder_init(&b, device->info, &cmd_buffer->batch);
726    mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
727 
728    const struct mi_value fast_clear_type = mi_mem32(addr);
729 
730    if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
731       /* In this case, we're doing a full resolve which means we want the
732        * resolve to happen if any compression (including fast-clears) is
733        * present.
734        *
735        * In order to simplify the logic a bit, we make the assumption that,
736        * if the first slice has been fast-cleared, it is also marked as
737        * compressed.  See also set_image_fast_clear_state.
738        */
739       const struct mi_value compression_state =
740          mi_mem32(anv_image_get_compression_state_addr(device,
741                                                        image, aspect,
742                                                        level, array_layer));
743       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
744       mi_store(&b, compression_state, mi_imm(0));
745 
746       if (level == 0 && array_layer == 0) {
747          /* If the predicate is true, we want to write 0 to the fast clear type
748           * and, if it's false, leave it alone.  We can do this by writing
749           *
750           * clear_type = clear_type & ~predicate;
751           */
752          struct mi_value new_fast_clear_type =
753             mi_iand(&b, fast_clear_type,
754                         mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
755          mi_store(&b, fast_clear_type, new_fast_clear_type);
756       }
757    } else if (level == 0 && array_layer == 0) {
758       /* In this case, we are doing a partial resolve to get rid of fast-clear
759        * colors.  We don't care about the compression state but we do care
760        * about how much fast clear is allowed by the final layout.
761        */
762       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
763       assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
764 
765       /* We need to compute (fast_clear_supported < image->fast_clear) */
766       struct mi_value pred =
767          mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
768       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
769 
770       /* If the predicate is true, we want to write 0 to the fast clear type
771        * and, if it's false, leave it alone.  We can do this by writing
772        *
773        * clear_type = clear_type & ~predicate;
774        */
775       struct mi_value new_fast_clear_type =
776          mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
777       mi_store(&b, fast_clear_type, new_fast_clear_type);
778    } else {
779       /* In this case, we're trying to do a partial resolve on a slice that
780        * doesn't have clear color.  There's nothing to do.
781        */
782       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
783       return;
784    }
785 
786    /* Set src1 to 0 and use a != condition */
787    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
788 
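   /* With LOADINV + SRCS_EQUAL and SRC1 == 0, the predicate ends up set
    * exactly when SRC0 is non-zero, i.e. when the value loaded above indicates
    * there is compression or fast-clear data to resolve. The resolve emitted
    * by the caller is then predicated on this result.
    */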
789    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
790       mip.LoadOperation    = LOAD_LOADINV;
791       mip.CombineOperation = COMBINE_SET;
792       mip.CompareOperation = COMPARE_SRCS_EQUAL;
793    }
794 }
795 
796 static void
797 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
798                                const struct anv_image *image,
799                                enum isl_format format,
800                                struct isl_swizzle swizzle,
801                                VkImageAspectFlagBits aspect,
802                                uint32_t level, uint32_t array_layer,
803                                enum isl_aux_op resolve_op,
804                                enum anv_fast_clear_type fast_clear_supported)
805 {
806    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
807 
808    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
809                                      aspect, level, array_layer,
810                                      resolve_op, fast_clear_supported);
811 
812    /* CCS_D only supports full resolves and BLORP will assert on us if we try
813     * to do a partial resolve on a CCS_D surface.
814     */
815    if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
816        image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
817       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
818 
819    anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
820                     level, array_layer, 1, resolve_op, NULL, true);
821 }
822 
823 static void
824 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
825                                const struct anv_image *image,
826                                enum isl_format format,
827                                struct isl_swizzle swizzle,
828                                VkImageAspectFlagBits aspect,
829                                uint32_t array_layer,
830                                enum isl_aux_op resolve_op,
831                                enum anv_fast_clear_type fast_clear_supported)
832 {
833    assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
834    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
835 
836    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
837                                      aspect, 0, array_layer,
838                                      resolve_op, fast_clear_supported);
839 
840    anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
841                     array_layer, 1, resolve_op, NULL, true);
842 }
843 
844 void
845 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
846                                     const struct anv_image *image,
847                                     VkImageAspectFlagBits aspect,
848                                     enum isl_aux_usage aux_usage,
849                                     uint32_t level,
850                                     uint32_t base_layer,
851                                     uint32_t layer_count)
852 {
853 #if GFX_VER < 20
854    /* The aspect must be exactly one of the image aspects. */
855    assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
856 
857    /* Filter out aux usages that don't have any compression tracking.
858     * Note: We only have compression tracking for CCS_E images, but it's
859     * possible for a CCS_E enabled image to have a subresource with a different
860     * aux usage.
861     */
862    if (!isl_aux_usage_has_compression(aux_usage))
863       return;
864 
865    set_image_compressed_bit(cmd_buffer, image, aspect,
866                             level, base_layer, layer_count, true);
867 #endif
868 }
869 
870 /* Copy the fast-clear value dword(s) between a surface state object and an
871  * image's fast clear state buffer.
872  */
873 void
874 genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
875                              struct anv_state surface_state,
876                              const struct anv_image *image)
877 {
878 #if GFX_VER < 10
879    assert(cmd_buffer && image);
880    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
881 
882    struct anv_address ss_clear_addr =
883       anv_state_pool_state_address(
884          &cmd_buffer->device->internal_surface_state_pool,
885          (struct anv_state) {
886             .offset = surface_state.offset +
887                       cmd_buffer->device->isl_dev.ss.clear_value_offset
888          });
889    const struct anv_address entry_addr =
890       anv_image_get_clear_color_addr(cmd_buffer->device, image,
891                                      VK_IMAGE_ASPECT_COLOR_BIT);
892    unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
893 
894    struct mi_builder b;
895    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
896    mi_builder_set_write_check(&b, true);
897 
898    mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
899 
900    /* Updating a surface state object may require that the state cache be
901     * invalidated. From the SKL PRM, Shared Functions -> State -> State
902     * Caching:
903     *
904     *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
905     *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
906     *    modified [...], the L1 state cache must be invalidated to ensure
907     *    the new surface or sampler state is fetched from system memory.
908     *
909     * In testing, SKL doesn't actually seem to need this, but HSW does.
910     */
911    anv_add_pending_pipe_bits(cmd_buffer,
912                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
913                              "after load_image_clear_color surface state update");
914 #endif
915 }
916 
917 static void
918 set_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
919                       const struct anv_image *image,
920                       const VkImageAspectFlags aspect,
921                       const uint32_t *pixel)
922 {
923    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
924    uint32_t plane = anv_image_aspect_to_plane(image, aspect);
925    enum isl_format format = image->planes[plane].primary_surface.isl.format;
926 
927    union isl_color_value clear_color;
928    isl_color_value_unpack(&clear_color, format, pixel);
929 
930    struct anv_address addr =
931       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
932    assert(!anv_address_is_null(addr));
933 
934 #if GFX_VER >= 20
935    assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 0);
936    assert(cmd_buffer->device->isl_dev.ss.clear_value_size == 0);
937    unreachable("storing clear colors on invalid gfx_ver" );
938 #elif GFX_VER >= 11
939    assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
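   /* The 32-byte clear color state holds the four 32-bit channel values
    * first; the two extra dwords written here carry the clear value packed in
    * the image's surface format.
    */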
940    uint32_t *dw = anv_batch_emitn(batch, 3 + 6, GENX(MI_STORE_DATA_IMM),
941                                   .StoreQword = true, .Address = addr);
942    dw[3] = clear_color.u32[0];
943    dw[4] = clear_color.u32[1];
944    dw[5] = clear_color.u32[2];
945    dw[6] = clear_color.u32[3];
946    dw[7] = pixel[0];
947    dw[8] = pixel[1];
948 #else
949    assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 0);
950    assert(cmd_buffer->device->isl_dev.ss.clear_value_size == 16);
951    uint32_t *dw = anv_batch_emitn(batch, 3 + 4, GENX(MI_STORE_DATA_IMM),
952                                   .StoreQword = true, .Address = addr);
953    dw[3] = clear_color.u32[0];
954    dw[4] = clear_color.u32[1];
955    dw[5] = clear_color.u32[2];
956    dw[6] = clear_color.u32[3];
957 #endif
958 }
959 
960 void
961 genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
962                            const struct anv_image *image,
963                            const enum isl_format format,
964                            union isl_color_value clear_color)
965 {
966    uint32_t pixel[4] = {};
967    isl_color_value_pack(&clear_color, format, pixel);
968    set_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, pixel);
969 
970    if (isl_color_value_is_zero(clear_color, format)) {
971       /* This image has the auxiliary buffer enabled. We can mark the
972        * subresource as not needing a resolve because the clear color
973        * will match what's in every RENDER_SURFACE_STATE object when
974        * it's being used for sampling.
975        */
976       set_image_fast_clear_state(cmd_buffer, image,
977                                  VK_IMAGE_ASPECT_COLOR_BIT,
978                                  ANV_FAST_CLEAR_DEFAULT_VALUE);
979    } else {
980       set_image_fast_clear_state(cmd_buffer, image,
981                                  VK_IMAGE_ASPECT_COLOR_BIT,
982                                  ANV_FAST_CLEAR_ANY);
983    }
984 }
985 
986 /**
987  * @brief Transitions a color buffer from one layout to another.
988  *
989  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
990  * more information.
991  *
992  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
993  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
994  *                    this represents the maximum layers to transition at each
995  *                    specified miplevel.
996  */
997 static void
998 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
999                         const struct anv_image *image,
1000                         VkImageAspectFlagBits aspect,
1001                         const uint32_t base_level, uint32_t level_count,
1002                         uint32_t base_layer, uint32_t layer_count,
1003                         VkImageLayout initial_layout,
1004                         VkImageLayout final_layout,
1005                         uint32_t src_queue_family,
1006                         uint32_t dst_queue_family,
1007                         bool will_full_fast_clear)
1008 {
1009    struct anv_device *device = cmd_buffer->device;
1010    const struct intel_device_info *devinfo = device->info;
1011    /* Validate the inputs. */
1012    assert(cmd_buffer);
1013    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1014    /* These values aren't supported for simplicity's sake. */
1015    assert(level_count != VK_REMAINING_MIP_LEVELS &&
1016           layer_count != VK_REMAINING_ARRAY_LAYERS);
1017    /* Ensure the subresource range is valid. */
1018    UNUSED uint64_t last_level_num = base_level + level_count;
1019    const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
1020    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1021    assert((uint64_t)base_layer + layer_count  <= image_layers);
1022    assert(last_level_num <= image->vk.mip_levels);
1023    /* If there is a layout transition, the final layout cannot be undefined or
1024     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1025     */
1026    assert(initial_layout == final_layout ||
1027           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1028            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1029    const struct isl_drm_modifier_info *isl_mod_info =
1030       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1031       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1032       : NULL;
1033 
1034    const bool src_queue_external =
1035       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1036       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1037 
1038    const bool dst_queue_external =
1039       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1040       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1041 
1042    /* If the queues are external, consider the first queue family flags
1043     * (should be the most capable)
1044     */
1045    const VkQueueFlagBits src_queue_flags =
1046       device->physical->queue.families[
1047          (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
1048          0 : src_queue_family].queueFlags;
1049    const VkQueueFlagBits dst_queue_flags =
1050       device->physical->queue.families[
1051          (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
1052          0 : dst_queue_family].queueFlags;
1053 
1054    /* Simultaneous acquire and release on external queues is illegal. */
1055    assert(!src_queue_external || !dst_queue_external);
1056 
1057    /* Ownership transition on an external queue requires special action if the
1058     * image has a DRM format modifier because we store image data in
1059     * a driver-private bo which is inaccessible to the external queue.
1060     */
1061    const bool private_binding_acquire =
1062       src_queue_external &&
1063       anv_image_is_externally_shared(image) &&
1064       anv_image_has_private_binding(image);
1065 
1066    const bool private_binding_release =
1067       dst_queue_external &&
1068       anv_image_is_externally_shared(image) &&
1069       anv_image_has_private_binding(image);
1070 
1071    if (initial_layout == final_layout &&
1072        !private_binding_acquire && !private_binding_release) {
1073       /* No work is needed. */
1074        return;
1075    }
1076 
1077    /**
1078     * Section 7.7.4 of the Vulkan 1.3.260 spec says:
1079     *
1080     *    If the transfer is via an image memory barrier, and an image layout
1081     *    transition is desired, then the values of oldLayout and newLayout in the
1082     *    release operation's memory barrier must be equal to values of oldLayout
1083     *    and newLayout in the acquire operation's memory barrier. Although the
1084     *    image layout transition is submitted twice, it will only be executed
1085     *    once. A layout transition specified in this way happens-after the
1086     *    release operation and happens-before the acquire operation.
1087     *
1088     * Because we know that we get matching transitions on each queue, we
1089     * choose to only do the work on one queue type: RENDER. In the cases where
1090     * we do transitions between COMPUTE & TRANSFER, we should have matching
1091     * aux/fast_clear values which would trigger no work in the code below.
1092     */
1093    if (!(src_queue_external || dst_queue_external) &&
1094        src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
1095        dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
1096        src_queue_family != dst_queue_family) {
1097       enum intel_engine_class src_engine =
1098          cmd_buffer->queue_family->engine_class;
1099       if (src_engine != INTEL_ENGINE_CLASS_RENDER)
1100          return;
1101    }
1102 
1103    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1104 
1105    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1106       return;
1107 
1108    enum isl_aux_usage initial_aux_usage =
1109       anv_layout_to_aux_usage(devinfo, image, aspect, 0,
1110                               initial_layout, src_queue_flags);
1111    enum isl_aux_usage final_aux_usage =
1112       anv_layout_to_aux_usage(devinfo, image, aspect, 0,
1113                               final_layout, dst_queue_flags);
1114    enum anv_fast_clear_type initial_fast_clear =
1115       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
1116                                     src_queue_flags);
1117    enum anv_fast_clear_type final_fast_clear =
1118       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
1119                                     dst_queue_flags);
1120 
1121    /* We must override the anv_layout_to_* functions because they are unaware
1122     * of acquire/release direction.
1123     */
1124    if (private_binding_acquire) {
1125       initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
1126          image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
1127       initial_fast_clear = isl_mod_info->supports_clear_color ?
1128          initial_fast_clear : ANV_FAST_CLEAR_NONE;
1129    } else if (private_binding_release) {
1130       final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
1131          image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
1132       final_fast_clear = isl_mod_info->supports_clear_color ?
1133          final_fast_clear : ANV_FAST_CLEAR_NONE;
1134    }
1135 
1136    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1137 
1138    /* The following layouts are equivalent for non-linear images. */
1139    const bool initial_layout_undefined =
1140       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1141       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1142 
1143    bool must_init_fast_clear_state = false;
1144    bool must_init_aux_surface = false;
1145 
1146    if (initial_layout_undefined) {
1147       /* The subresource may have been aliased and populated with arbitrary
1148        * data, so we should initialize fast-clear state on platforms prior to
1149        * Xe2. Xe2+ platforms don't need it thanks to the new design of fast-
1150        * clear.
1151        */
1152       must_init_fast_clear_state = devinfo->ver < 20;
1153 
1154       if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
1155           devinfo->has_illegal_ccs_values) {
1156 
1157          must_init_aux_surface = true;
1158 
1159       } else {
1160          assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
1161 
1162          /* We can start using the CCS immediately without ambiguating. The
1163           * two conditions that enable this are:
1164           *
1165           * 1) The device treats all possible CCS values as legal. In other
1166           *    words, we can't confuse the hardware with random bits in the
1167           *    CCS.
1168           *
1169           * 2) We enable compression on all writable image layouts. The CCS
1170           *    will receive all writes and will therefore always be in sync
1171           *    with the main surface.
1172           *
1173           *    If we were to disable compression on some writable layouts, the
1174           *    CCS could get out of sync with the main surface and the app
1175           *    could lose the data it wrote previously. For example, this
1176           *    could happen if an app: transitions from UNDEFINED w/o
1177           *    ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
1178           *
1179           * The second condition is asserted below, but could be moved
1180           * elsewhere for more coverage (we're only checking transitions from
1181           * an undefined layout).
1182           */
1183          assert(vk_image_layout_is_read_only(final_layout, aspect) ||
1184                 (final_aux_usage != ISL_AUX_USAGE_NONE));
1185 
1186          must_init_aux_surface = false;
1187       }
1188 
1189    } else if (private_binding_acquire) {
1190       /* The fast clear state lives in a driver-private bo, and therefore the
1191        * external/foreign queue is unaware of it.
1192        *
1193        * If this is the first time we are accessing the image, then the fast
1194        * clear state is uninitialized.
1195        *
1196        * If this is NOT the first time we are accessing the image, then the fast
1197        * clear state may still be valid and correct due to the resolve during
1198        * our most recent ownership release.  However, we do not track the aux
1199        * state with MI stores, and therefore must assume the worst-case: that
1200        * this is the first time we are accessing the image.
1201        */
1202       assert(image->planes[plane].fast_clear_memory_range.binding ==
1203               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1204       must_init_fast_clear_state = true;
1205 
1206       if (anv_image_get_aux_memory_range(image, plane)->binding ==
1207           ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1208          /* The aux surface, like the fast clear state, lives in
1209           * a driver-private bo.  We must initialize the aux surface for the
1210           * same reasons we must initialize the fast clear state.
1211           */
1212          must_init_aux_surface = true;
1213       } else {
1214          /* The aux surface, unlike the fast clear state, lives in
1215           * application-visible VkDeviceMemory and is shared with the
1216           * external/foreign queue. Therefore, when we acquire ownership of the
1217           * image with a defined VkImageLayout, the aux surface is valid and has
1218           * the aux state required by the modifier.
1219           */
1220          must_init_aux_surface = false;
1221       }
1222    }
1223 
1224    if (must_init_fast_clear_state) {
1225       if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1226          assert(!image->planes[plane].can_non_zero_fast_clear);
1227          const uint32_t zero_pixel[4] = {};
1228          set_image_clear_color(cmd_buffer, image, aspect, zero_pixel);
1229       }
1230       if (base_level == 0 && base_layer == 0) {
1231          set_image_fast_clear_state(cmd_buffer, image, aspect,
1232                                     ANV_FAST_CLEAR_NONE);
1233       }
1234    }
1235 
1236    if (must_init_aux_surface) {
1237       assert(devinfo->ver >= 20 || must_init_fast_clear_state);
1238 
1239       /* Initialize the aux buffers to enable correct rendering.  In order to
1240        * ensure that things such as storage images work correctly, aux buffers
1241        * need to be initialized to valid data.
1242        *
1243        * Having an aux buffer with invalid data is a problem for two reasons:
1244        *
1245        *  1) Having an invalid value in the buffer can confuse the hardware.
1246        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1247        *     invalid and leads to the hardware doing strange things.  It
1248        *     doesn't hang as far as we can tell but rendering corruption can
1249        *     occur.
1250        *
1251        *  2) If this transition is into the GENERAL layout and we then use the
1252        *     image as a storage image, then we must have the aux buffer in the
1253        *     pass-through state so that, if we then go to texture from the
1254        *     image, we get the results of our storage image writes and not the
1255        *     fast clear color or other random data.
1256        *
1257        * For CCS both of the problems above are real demonstrable issues.  In
1258        * that case, the only thing we can do is to perform an ambiguate to
1259        * transition the aux surface into the pass-through state.
1260        *
1261        * For MCS, (2) is never an issue because we don't support multisampled
1262        * storage images.  In theory, issue (1) is a problem with MCS but we've
1263        * never seen it in the wild.  For 4x and 16x, all bit patterns could,
1264        * in theory, be interpreted as something but we don't know that all bit
1265        * patterns are actually valid.  For 2x and 8x, you could easily end up
1266        * with the MCS referring to an invalid plane because not all bits of
1267        * the MCS value are actually used.  Even though we've never seen issues
1268        * in the wild, it's best to play it safe and initialize the MCS.  We
1269        * could use a fast-clear for MCS because we only ever touch from render
1270        * and texture (no image load store). However, due to WA 14013111325,
1271        * we choose to ambiguate MCS as well.
1272        */
1273       if (image->vk.samples == 1) {
1274          for (uint32_t l = 0; l < level_count; l++) {
1275             const uint32_t level = base_level + l;
1276 
1277             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1278             if (base_layer >= aux_layers)
1279                break; /* We will only get fewer layers as level increases */
1280             uint32_t level_layer_count =
1281                MIN2(layer_count, aux_layers - base_layer);
1282 
1283             /* If will_full_fast_clear is set, the caller promises to
1284              * fast-clear the largest portion of the specified range that it can.
1285              * For color images, that means only the first LOD and array slice.
1286              */
1287             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1288                base_layer++;
1289                level_layer_count--;
1290                if (level_layer_count == 0)
1291                   continue;
1292             }
1293 
1294             anv_image_ccs_op(cmd_buffer, image,
1295                              image->planes[plane].primary_surface.isl.format,
1296                              ISL_SWIZZLE_IDENTITY,
1297                              aspect, level, base_layer, level_layer_count,
1298                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1299 
1300             set_image_compressed_bit(cmd_buffer, image, aspect, level,
1301                                      base_layer, level_layer_count, false);
1302          }
1303       } else {
1304          /* If will_full_fast_clear is set, the caller promises to fast-clear
1305           * the largest portion of the specified range that it can.
1306           */
1307          if (will_full_fast_clear)
1308             return;
1309 
1310          assert(base_level == 0 && level_count == 1);
1311          anv_image_mcs_op(cmd_buffer, image,
1312                           image->planes[plane].primary_surface.isl.format,
1313                           ISL_SWIZZLE_IDENTITY,
1314                           aspect, base_layer, layer_count,
1315                           ISL_AUX_OP_AMBIGUATE, NULL, false);
1316       }
1317       return;
1318    }
1319 
1320    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1321     * We can handle transitions between CCS_D/E to and from NONE.  What we
1322     * don't yet handle is switching between CCS_E and CCS_D within a given
1323     * image.  Doing so in a performant way requires more detailed aux state
1324     * tracking such as what is done in i965.  For now, just assume that we
1325     * only have one type of compression.
1326     */
1327    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1328           final_aux_usage == ISL_AUX_USAGE_NONE ||
1329           initial_aux_usage == final_aux_usage);
1330 
1331    /* If initial aux usage is NONE, there is nothing to resolve */
1332    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1333       return;
1334 
1335    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1336 
1337    /* If the initial layout supports more fast clear than the final layout
1338     * then we need at least a partial resolve.
1339     */
1340    if (final_fast_clear < initial_fast_clear) {
1341       /* Partial resolves will actually only occur on layer 0/level 0. This
1342        * is generally okay because anv only allows explicit fast clears to
1343        * the first subresource.
1344        *
1345        * The situation is a bit different with FCV_CCS_E. With that aux
1346        * usage, implicit fast clears can occur on any layer and level.
1347        * anv doesn't track fast clear states for more than the first
1348        * subresource, so we need to assert that a layout transition doesn't
1349        * attempt to partial resolve the other subresources.
1350        *
1351        * At the moment, we don't enter such a situation, and partial resolves
1352        * for higher level/layer resources shouldn't be a concern.
1353        */
1354       if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1355          assert(base_level == 0 && level_count == 1 &&
1356                 base_layer == 0 && layer_count == 1);
1357       }
1358       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1359    }
1360 
1361    if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
1362        !isl_aux_usage_has_ccs_e(final_aux_usage))
1363       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1364 
1365    if (resolve_op == ISL_AUX_OP_NONE)
1366       return;
1367 
1368    for (uint32_t l = 0; l < level_count; l++) {
1369       uint32_t level = base_level + l;
1370 
1371       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1372       if (base_layer >= aux_layers)
1373          break; /* We will only get fewer layers as level increases */
1374       uint32_t level_layer_count =
1375          MIN2(layer_count, aux_layers - base_layer);
1376 
1377       for (uint32_t a = 0; a < level_layer_count; a++) {
1378          uint32_t array_layer = base_layer + a;
1379 
1380          /* If will_full_fast_clear is set, the caller promises to fast-clear
1381           * the largest portion of the specified range that it can.  For color
1382           * images, that means only the first LOD and array slice.
1383           */
1384          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1385             continue;
1386 
1387          if (image->vk.samples == 1) {
1388             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1389                                            image->planes[plane].primary_surface.isl.format,
1390                                            ISL_SWIZZLE_IDENTITY,
1391                                            aspect, level, array_layer, resolve_op,
1392                                            final_fast_clear);
1393          } else {
1394             /* We only support fast-clear on the first layer so partial
1395              * resolves should not be used on other layers as they will use
1396              * the clear color stored in memory that is only valid for layer0.
1397              */
1398             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1399                 array_layer != 0)
1400                continue;
1401 
1402             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1403                                            image->planes[plane].primary_surface.isl.format,
1404                                            ISL_SWIZZLE_IDENTITY,
1405                                            aspect, array_layer, resolve_op,
1406                                            final_fast_clear);
1407          }
1408       }
1409    }
1410 }
1411 
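/* Allocate the attachment surface states used while rendering: one NULL
 * surface state plus one surface state per color attachment, all carved out
 * of a single anv_cmd_buffer_alloc_surface_states() allocation and handed
 * out at ss_stride intervals.
 */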
1412 static MUST_CHECK VkResult
1413 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1414                                 uint32_t color_att_count)
1415 {
1416    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1417 
1418    /* Reserve one for the NULL state. */
1419    unsigned num_states = 1 + color_att_count;
1420    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1421    const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
1422    gfx->att_states =
1423       anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
1424    if (gfx->att_states.map == NULL)
1425       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1426 
1427    struct anv_state next_state = gfx->att_states;
1428    next_state.alloc_size = isl_dev->ss.size;
1429 
1430    gfx->null_surface_state = next_state;
1431    next_state.offset += ss_stride;
1432    next_state.map += ss_stride;
1433 
1434    gfx->color_att_count = color_att_count;
1435    for (uint32_t i = 0; i < color_att_count; i++) {
1436       gfx->color_att[i] = (struct anv_attachment) {
1437          .surface_state.state = next_state,
1438       };
1439       next_state.offset += ss_stride;
1440       next_state.map += ss_stride;
1441    }
1442    gfx->depth_att = (struct anv_attachment) { };
1443    gfx->stencil_att = (struct anv_attachment) { };
1444 
1445    return VK_SUCCESS;
1446 }
1447 
1448 static void
1449 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1450 {
1451    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1452 
1453    gfx->render_area = (VkRect2D) { };
1454    gfx->layer_count = 0;
1455    gfx->samples = 0;
1456 
1457    gfx->color_att_count = 0;
1458    gfx->depth_att = (struct anv_attachment) { };
1459    gfx->stencil_att = (struct anv_attachment) { };
1460    gfx->null_surface_state = ANV_STATE_NULL;
1461 }
1462 
1463 /**
1464  * Program the hardware to use the specified L3 configuration.
1465  */
1466 void
1467 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1468                            const struct intel_l3_config *cfg)
1469 {
1470    assert(cfg || GFX_VER >= 12);
1471    if (cfg == cmd_buffer->state.current_l3_config)
1472       return;
1473 
1474 #if GFX_VER >= 11
1475    /* On Gfx11+ we use only one config, so verify it remains the same and skip
1476     * the stalling L3 programming sequence entirely.
1477     */
1478    assert(cfg == cmd_buffer->device->l3_config);
1479 #else
1480    if (INTEL_DEBUG(DEBUG_L3)) {
1481       mesa_logd("L3 config transition: ");
1482       intel_dump_l3_config(cfg, stderr);
1483    }
1484 
1485    /* According to the hardware docs, the L3 partitioning can only be changed
1486     * while the pipeline is completely drained and the caches are flushed,
1487     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1488     */
1489    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1490                                 cmd_buffer->state.current_pipeline,
1491                                 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1492                                 ANV_PIPE_CS_STALL_BIT);
1493 
1494    /* ...followed by a second pipelined PIPE_CONTROL that initiates
1495     * invalidation of the relevant caches.  Note that because RO invalidation
1496     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1497     * command is processed by the CS) we cannot combine it with the previous
1498     * stalling flush as the hardware documentation suggests, because that
1499     * would cause the CS to stall on previous rendering *after* RO
1500     * invalidation and wouldn't prevent the RO caches from being polluted by
1501     * concurrent rendering before the stall completes.  This intentionally
1502     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1503     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1504     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1505     * already guarantee that there is no concurrent GPGPU kernel execution
1506     * (see SKL HSD 2132585).
1507     */
1508    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1509                                 cmd_buffer->state.current_pipeline,
1510                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
1511                                 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
1512                                 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
1513                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
1514 
1515    /* Now send a third stalling flush to make sure that invalidation is
1516     * complete when the L3 configuration registers are modified.
1517     */
1518    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1519                                 cmd_buffer->state.current_pipeline,
1520                                 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1521                                 ANV_PIPE_CS_STALL_BIT);
1522 
1523    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1524 #endif /* GFX_VER >= 11 */
1525    cmd_buffer->state.current_l3_config = cfg;
1526 }
1527 
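/* Invalidate the aux-map table for the given engine (Gfx12 only): write 1 to
 * the engine's CCS_AUX_INV register with MI_LOAD_REGISTER_IMM, then poll the
 * same register with MI_SEMAPHORE_WAIT until the hardware clears it, per the
 * HSD 22012751911 programming sequence referenced below.
 */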
1528 ALWAYS_INLINE void
1529 genX(invalidate_aux_map)(struct anv_batch *batch,
1530                          struct anv_device *device,
1531                          enum intel_engine_class engine_class,
1532                          enum anv_pipe_bits bits)
1533 {
1534 #if GFX_VER == 12
1535    if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
1536       uint32_t register_addr = 0;
1537       switch (engine_class) {
1538       case INTEL_ENGINE_CLASS_COMPUTE:
1539          register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
1540          break;
1541       case INTEL_ENGINE_CLASS_COPY:
1542 #if GFX_VERx10 >= 125
1543          register_addr = GENX(BCS_CCS_AUX_INV_num);
1544 #endif
1545          break;
1546       case INTEL_ENGINE_CLASS_VIDEO:
1547          register_addr = GENX(VD0_CCS_AUX_INV_num);
1548          break;
1549       case INTEL_ENGINE_CLASS_RENDER:
1550       default:
1551          register_addr = GENX(GFX_CCS_AUX_INV_num);
1552          break;
1553       }
1554 
1555       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
1556          lri.RegisterOffset = register_addr;
1557          lri.DataDWord = 1;
1558       }
1559 
1560       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
1561       if (intel_needs_workaround(device->info, 16018063123) &&
1562           engine_class == INTEL_ENGINE_CLASS_COPY) {
1563          genX(batch_emit_fast_color_dummy_blit)(batch, device);
1564       }
1565 
1566       /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1567        *
1568        *    "Poll Aux Invalidation bit once the invalidation is set
1569        *     (Register 4208 bit 0)"
1570        */
1571       anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
1572          sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
1573          sem.WaitMode = PollingMode;
1574          sem.RegisterPollMode = true;
1575          sem.SemaphoreDataDword = 0x0;
1576          sem.SemaphoreAddress =
1577             anv_address_from_u64(register_addr);
1578       }
1579    }
1580 #else
1581    assert(!device->info->has_aux_map);
1582 #endif
1583 }
1584 
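/* Emit the PIPE_CONTROLs required to satisfy the given pipe bits and return
 * the bits that remain pending (for example, bits deferred because they are
 * not allowed on the current pipeline mode). The bits of the flush actually
 * programmed are optionally returned through emitted_flush_bits.
 */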
1585 ALWAYS_INLINE enum anv_pipe_bits
1586 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1587                               struct anv_device *device,
1588                               uint32_t current_pipeline,
1589                               enum anv_pipe_bits bits,
1590                               enum anv_pipe_bits *emitted_flush_bits)
1591 {
1592 #if GFX_VER >= 12
1593    /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
1594     *
1595     *     "SW must follow below programming restrictions when programming
1596     *      PIPE_CONTROL command [for ComputeCS]:
1597     *      ...
1598     *      Following bits must not be set when programmed for ComputeCS:
1599     *      - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
1600     *         and "Tile Cache Flush Enable"
1601     *      - "Depth Stall Enable", Stall at Pixel Scoreboard and
1602     *         "PSD Sync Enable".
1603     *      - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
1604     *         "AMFS Flush Enable", "VF Cache Invalidation Enable" and
1605     *         "Global Snapshot Count Reset"."
1606     *
1607     * XXX: According to spec this should not be a concern for a regular
1608     * RCS in GPGPU mode, but during testing it was found that at least
1609     * "VF Cache Invalidation Enable" bit is ignored in such case.
1610     * This can cause us to miss some important invalidations
1611     * (e.g. from CmdPipelineBarriers) and have incoherent data.
1612     *
1613     * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
1614     * when specific 3d related bits are programmed in pipecontrol in
1615     * compute mode" that suggests not using "RT Cache Flush" in GPGPU mode.
1616     *
1617     * The other bits are not confirmed to cause problems, but included here
1618     * just to be safe, as they're also not really relevant in the GPGPU mode,
1619     * and having them doesn't seem to cause any regressions.
1620     *
1621     * So if we're currently in GPGPU mode, we hide some bits from
1622     * this flush, and will flush them only when we'll be able to.
1623     * this flush and only flush them once we are able to. The same goes
1624     * for GPGPU-only bits while we are in 3D mode.
1625    enum anv_pipe_bits defer_bits = bits &
1626       (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS: ANV_PIPE_GPGPU_BITS);
1627 
1628    bits &= ~defer_bits;
1629 #endif
1630 
1631    /*
1632     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1633     *
1634     *    Write synchronization is a special case of end-of-pipe
1635     *    synchronization that requires that the render cache and/or depth
1636     *    related caches are flushed to memory, where the data will become
1637     *    globally visible. This type of synchronization is required prior to
1638     *    SW (CPU) actually reading the result data from memory, or initiating
1639     *    an operation that will use as a read surface (such as a texture
1640     *    surface) a previous render target and/or depth/stencil buffer
1641     *
1642     *
1643     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1644     *
1645     *    Exercising the write cache flush bits (Render Target Cache Flush
1646     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1647     *    ensures the write caches are flushed and doesn't guarantee the data
1648     *    is globally visible.
1649     *
1650     *    SW can track the completion of the end-of-pipe-synchronization by
1651     *    using "Notify Enable" and "PostSync Operation - Write Immediate
1652     *    Data" in the PIPE_CONTROL command.
1653     *
1654     * In other words, flushes are pipelined while invalidations are handled
1655     * immediately.  Therefore, if we're flushing anything then we need to
1656     * schedule an end-of-pipe sync before any invalidations can happen.
1657     */
1658    if (bits & ANV_PIPE_FLUSH_BITS)
1659       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1660 
1661 
1662    /* HSD 1209978178: docs say that before programming the aux table:
1663     *
1664     *    "Driver must ensure that the engine is IDLE but ensure it doesn't
1665     *    add extra flushes in the case it knows that the engine is already
1666     *    IDLE."
1667     *
1668     * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1669     *
1670     *    "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
1671     *
1672     * Notice we don't set the L3 Fabric Flush here, because we have
1673     * ANV_PIPE_END_OF_PIPE_SYNC_BIT which inserts a CS stall. The
1674     * PIPE_CONTROL::L3 Fabric Flush documentation says :
1675     * PIPE_CONTROL::L3 Fabric Flush documentation says:
1676     *    "L3 Fabric Flush will ensure all the pending transactions in the L3
1677     *     Fabric are flushed to global observation point. HW does implicit L3
1678     *     Fabric Flush on all stalling flushes (both explicit and implicit)
1679     *     and on PIPECONTROL having Post Sync Operation enabled."
1680     *
1681     * Therefore setting L3 Fabric Flush here would be redundant.
1682     */
1683    if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
1684       if (current_pipeline == GPGPU) {
1685          bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1686                   ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1687                   (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1688       } else if (current_pipeline == _3D) {
1689          bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1690                   ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1691                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
1692                   (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1693       }
1694    }
1695 
1696    /* If we're going to do an invalidate and we have a pending end-of-pipe
1697     * sync that has yet to be resolved, we do the end-of-pipe sync now.
1698     */
1699    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1700        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1701       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1702       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1703 
1704       if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
1705          fputs("pc: add ", stderr);
1706          anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stderr);
1707          fprintf(stderr, "reason: Ensure flushes done before invalidate\n");
1708       }
1709    }
1710 
1711    /* Project: SKL / Argument: LRI Post Sync Operation [23]
1712     *
1713     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1714     *  programmed prior to programming a PIPECONTROL command with "LRI
1715     *  Post Sync Operation" in GPGPU mode of operation (i.e when
1716     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
1717     *
1718     * The same text exists a few rows below for Post Sync Op.
1719     */
1720    if (bits & ANV_PIPE_POST_SYNC_BIT) {
1721       if (GFX_VER == 9 && current_pipeline == GPGPU)
1722          bits |= ANV_PIPE_CS_STALL_BIT;
1723       bits &= ~ANV_PIPE_POST_SYNC_BIT;
1724    }
1725 
1726    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1727                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1728       enum anv_pipe_bits flush_bits =
1729          bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1730                  ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1731 
1732       uint32_t sync_op = NoWrite;
1733       struct anv_address addr = ANV_NULL_ADDRESS;
1734 
1735       /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1736        *
1737        *    "The most common action to perform upon reaching a
1738        *    synchronization point is to write a value out to memory. An
1739        *    immediate value (included with the synchronization command) may
1740        *    be written."
1741        *
1742        *
1743        * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1744        *
1745        *    "In case the data flushed out by the render engine is to be
1746        *    read back in to the render engine in coherent manner, then the
1747        *    render engine has to wait for the fence completion before
1748        *    accessing the flushed data. This can be achieved by following
1749        *    means on various products: PIPE_CONTROL command with CS Stall
1750        *    and the required write caches flushed with Post-Sync-Operation
1751        *    as Write Immediate Data.
1752        *
1753        *    Example:
1754        *       - Workload-1 (3D/GPGPU/MEDIA)
1755        *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1756        *         Immediate Data, Required Write Cache Flush bits set)
1757        *       - Workload-2 (Can use the data produce or output by
1758        *         Workload-1)
1759        */
1760       if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1761          flush_bits |= ANV_PIPE_CS_STALL_BIT;
1762          sync_op = WriteImmediateData;
1763          addr = device->workaround_address;
1764       }
1765 
1766       /* Flush PC. */
1767       genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1768                                          sync_op, addr, 0, flush_bits);
1769 
1770       /* If the caller wants to know what flushes have been emitted,
1771        * provide the bits based off the PIPE_CONTROL programmed bits.
1772        */
1773       if (emitted_flush_bits != NULL)
1774          *emitted_flush_bits = flush_bits;
1775 
1776       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1777                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1778    }
1779 
1780    if (bits & ANV_PIPE_INVALIDATE_BITS) {
1781       uint32_t sync_op = NoWrite;
1782       struct anv_address addr = ANV_NULL_ADDRESS;
1783 
1784       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1785        *
1786        *    "When VF Cache Invalidate is set “Post Sync Operation” must be
1787        *    enabled to “Write Immediate Data” or “Write PS Depth Count” or
1788        *    “Write Timestamp”.
1789        */
1790       if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1791          sync_op = WriteImmediateData;
1792          addr = device->workaround_address;
1793       }
1794 
1795       /* Invalidate PC. */
1796       genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1797                                          sync_op, addr, 0, bits);
1798 
1799       enum intel_engine_class engine_class =
1800          current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
1801                                      INTEL_ENGINE_CLASS_RENDER;
1802       genX(invalidate_aux_map)(batch, device, engine_class, bits);
1803 
1804       bits &= ~ANV_PIPE_INVALIDATE_BITS;
1805    }
1806 
1807 #if GFX_VER >= 12
1808    bits |= defer_bits;
1809 #endif
1810 
1811    return bits;
1812 }
1813 
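/* Apply the command buffer's pending pipe bits. The blitter and video queues
 * cannot take PIPE_CONTROLs, so for those we only perform the aux-map
 * invalidation and keep the remaining bits pending.
 */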
1814 ALWAYS_INLINE void
1815 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1816 {
1817 #if INTEL_NEEDS_WA_1508744258
1818    /* If we're changing the state of the RHWO optimization, we need to have
1819     * sb_stall+cs_stall.
1820     */
1821    const bool rhwo_opt_change =
1822       cmd_buffer->state.rhwo_optimization_enabled !=
1823       cmd_buffer->state.pending_rhwo_optimization_enabled;
1824    if (rhwo_opt_change) {
1825       anv_add_pending_pipe_bits(cmd_buffer,
1826                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
1827                                 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1828                                 "change RHWO optimization");
1829    }
1830 #endif
1831 
1832    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1833 
1834    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1835       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1836    else if (bits == 0)
1837       return;
1838 
1839    if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
1840        anv_cmd_buffer_is_video_queue(cmd_buffer)) {
1841       if (bits & ANV_PIPE_INVALIDATE_BITS) {
1842          genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
1843                                   cmd_buffer->queue_family->engine_class, bits);
1844          bits &= ~ANV_PIPE_INVALIDATE_BITS;
1845       }
1846       cmd_buffer->state.pending_pipe_bits = bits;
1847       return;
1848    }
1849 
1850    if (GFX_VER == 9 &&
1851        (bits & ANV_PIPE_CS_STALL_BIT) &&
1852        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1853       /* If we are doing a VF cache invalidate AND a CS stall (it must be
1854        * both) then we can reset our vertex cache tracking.
1855        */
1856       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1857              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1858       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1859              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1860    }
1861 
1862    enum anv_pipe_bits emitted_bits = 0;
1863    cmd_buffer->state.pending_pipe_bits =
1864       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1865                                     cmd_buffer->device,
1866                                     cmd_buffer->state.current_pipeline,
1867                                     bits,
1868                                     &emitted_bits);
1869    anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
1870 
1871 #if INTEL_NEEDS_WA_1508744258
1872    if (rhwo_opt_change) {
1873       anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1874          c1.RCCRHWOOptimizationDisable =
1875             !cmd_buffer->state.pending_rhwo_optimization_enabled;
1876          c1.RCCRHWOOptimizationDisableMask = true;
1877       }
1878       cmd_buffer->state.rhwo_optimization_enabled =
1879          cmd_buffer->state.pending_rhwo_optimization_enabled;
1880    }
1881 #endif
1882 
1883 }
1884 
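/* Build a buffer surface state for a dynamic uniform/storage buffer binding:
 * apply the dynamic offset, clamp the offset and range to the buffer size,
 * and fill an ISL buffer surface state at the resulting address.
 */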
1885 static inline struct anv_state
1886 emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1887                                         struct anv_cmd_pipeline_state *pipe_state,
1888                                         struct anv_pipeline_binding *binding,
1889                                         const struct anv_descriptor *desc)
1890 {
1891    if (!desc->buffer)
1892       return anv_null_surface_state_for_binding_table(cmd_buffer->device);
1893 
1894    /* Compute the offset within the buffer */
1895    uint32_t dynamic_offset =
1896       pipe_state->dynamic_offsets[
1897          binding->set].offsets[binding->dynamic_offset_index];
1898    uint64_t offset = desc->offset + dynamic_offset;
1899    /* Clamp to the buffer size */
1900    offset = MIN2(offset, desc->buffer->vk.size);
1901    /* Clamp the range to the buffer size */
1902    uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
1903 
1904    /* Align the range for consistency */
1905    if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
1906       range = align(range, ANV_UBO_ALIGNMENT);
1907 
1908    struct anv_address address =
1909       anv_address_add(desc->buffer->address, offset);
1910 
1911    struct anv_state surface_state =
1912       anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
1913    if (surface_state.map == NULL)
1914       return ANV_STATE_NULL;
1915 
1916    enum isl_format format =
1917       anv_isl_format_for_descriptor_type(cmd_buffer->device,
1918                                          desc->type);
1919 
1920    isl_surf_usage_flags_t usage =
1921       anv_isl_usage_for_descriptor_type(desc->type);
1922 
1923    anv_fill_buffer_surface_state(cmd_buffer->device,
1924                                  surface_state.map,
1925                                  format, ISL_SWIZZLE_IDENTITY,
1926                                  usage, address, range, 1);
1927 
1928    return surface_state;
1929 }
1930 
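/* Return the surface state heap offset to put in the binding table for a
 * descriptor when the pipeline layout uses indirect descriptors.
 */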
1931 static uint32_t
1932 emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1933                                              struct anv_cmd_pipeline_state *pipe_state,
1934                                              struct anv_pipeline_binding *binding,
1935                                              const struct anv_descriptor *desc)
1936 {
1937    struct anv_device *device = cmd_buffer->device;
1938    struct anv_state surface_state;
1939 
1940    /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1941     * Depending on where the descriptor surface state is allocated, it can
1942     * come from either device->internal_surface_state_pool or
1943     * device->bindless_surface_state_pool.
1944     */
1945    switch (desc->type) {
1946    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1947    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1948    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
1949       if (desc->image_view) {
1950          const struct anv_surface_state *sstate =
1951             anv_image_view_texture_surface_state(desc->image_view,
1952                                                  binding->plane,
1953                                                  desc->layout);
1954          surface_state = desc->image_view->use_surface_state_stream ?
1955             sstate->state :
1956             anv_bindless_state_for_binding_table(device, sstate->state);
1957          assert(surface_state.alloc_size);
1958       } else {
1959          surface_state = anv_null_surface_state_for_binding_table(device);
1960       }
1961       break;
1962    }
1963 
1964    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1965       if (desc->image_view) {
1966          const struct anv_surface_state *sstate =
1967             anv_image_view_storage_surface_state(desc->image_view);
1968          surface_state = desc->image_view->use_surface_state_stream ?
1969             sstate->state :
1970             anv_bindless_state_for_binding_table(device, sstate->state);
1971          assert(surface_state.alloc_size);
1972       } else {
1973          surface_state =
1974             anv_null_surface_state_for_binding_table(device);
1975       }
1976       break;
1977    }
1978 
1979    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1980    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1981       if (desc->set_buffer_view) {
1982          surface_state = desc->set_buffer_view->general.state;
1983          assert(surface_state.alloc_size);
1984       } else {
1985          surface_state = anv_null_surface_state_for_binding_table(device);
1986       }
1987       break;
1988 
1989    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1990       if (desc->buffer_view) {
1991          surface_state = anv_bindless_state_for_binding_table(
1992             device,
1993             desc->buffer_view->general.state);
1994          assert(surface_state.alloc_size);
1995       } else {
1996          surface_state = anv_null_surface_state_for_binding_table(device);
1997       }
1998       break;
1999 
2000    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2001    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2002       surface_state =
2003          emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
2004                                                  binding, desc);
2005       break;
2006 
2007    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2008       if (desc->buffer_view) {
2009          surface_state = anv_bindless_state_for_binding_table(
2010             device, desc->buffer_view->storage.state);
2011          assert(surface_state.alloc_size);
2012       } else {
2013          surface_state = anv_null_surface_state_for_binding_table(device);
2014       }
2015       break;
2016 
2017    default:
2018       unreachable("Invalid descriptor type");
2019    }
2020 
2021    return surface_state.offset;
2022 }
2023 
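/* Return the offset to put in the binding table for a descriptor when the
 * pipeline layout uses direct descriptors: most types point straight into
 * the descriptor set's surface area, while dynamic buffers still need a
 * surface state built at record time.
 */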
2024 static uint32_t
2025 emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
2026                                            struct anv_cmd_pipeline_state *pipe_state,
2027                                            const struct anv_descriptor_set *set,
2028                                            struct anv_pipeline_binding *binding,
2029                                            const struct anv_descriptor *desc)
2030 {
2031    uint32_t desc_offset;
2032 
2033    /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
2034     * Depending on where the descriptor surface state is allocated, it can
2035     * come from either device->internal_surface_state_pool or
2036     * device->bindless_surface_state_pool.
2037     */
2038    switch (desc->type) {
2039    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2040    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2041    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2042    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
2043    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2044    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2045    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2046    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2047       desc_offset = set->desc_offset + binding->set_offset;
2048       break;
2049 
2050    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2051    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2052       struct anv_state state =
2053          emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
2054                                                  binding, desc);
2055       desc_offset = state.offset;
2056       break;
2057    }
2058 
2059    default:
2060       unreachable("Invalid descriptor type");
2061    }
2062 
2063    return desc_offset;
2064 }
2065 
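/* Allocate and fill the binding table for one shader: each entry is the
 * offset of a surface state (relative to the surface state base address)
 * for the corresponding entry in the shader's bind map.
 */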
2066 static VkResult
2067 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2068                    struct anv_cmd_pipeline_state *pipe_state,
2069                    struct anv_shader_bin *shader,
2070                    struct anv_state *bt_state)
2071 {
2072    uint32_t state_offset;
2073 
2074    struct anv_pipeline_bind_map *map = &shader->bind_map;
2075    if (map->surface_count == 0) {
2076       *bt_state = (struct anv_state) { 0, };
2077       return VK_SUCCESS;
2078    }
2079 
2080    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2081                                                   map->surface_count,
2082                                                   &state_offset);
2083    uint32_t *bt_map = bt_state->map;
2084 
2085    if (bt_state->map == NULL)
2086       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2087 
2088    for (uint32_t s = 0; s < map->surface_count; s++) {
2089       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2090 
2091       struct anv_state surface_state;
2092 
2093       switch (binding->set) {
2094       case ANV_DESCRIPTOR_SET_NULL:
2095          bt_map[s] = 0;
2096          break;
2097 
2098       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2099          /* Color attachment binding */
2100          assert(shader->stage == MESA_SHADER_FRAGMENT);
2101          if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2102             const struct anv_attachment *att =
2103                &cmd_buffer->state.gfx.color_att[binding->index];
2104             surface_state = att->surface_state.state;
2105          } else {
2106             surface_state = cmd_buffer->state.gfx.null_surface_state;
2107          }
2108          assert(surface_state.map);
2109          bt_map[s] = surface_state.offset + state_offset;
2110          break;
2111 
2112       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2113          /* This is always the first binding for compute shaders */
2114          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2115 
2116          struct anv_state surface_state =
2117             anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2118          if (surface_state.map == NULL)
2119             return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2120 
2121          const enum isl_format format =
2122             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2123                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2124          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
2125                                        format, ISL_SWIZZLE_IDENTITY,
2126                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2127                                        cmd_buffer->state.compute.num_workgroups,
2128                                        12, 1);
2129 
2130          assert(surface_state.map);
2131          bt_map[s] = surface_state.offset + state_offset;
2132          break;
2133       }
2134 
2135       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2136          struct anv_descriptor_set *set =
2137             pipe_state->descriptors[binding->index];
2138 
2139          /* If the shader doesn't access the set buffer, just put the null
2140           * surface.
2141           */
2142          if (set->is_push && !shader->push_desc_info.used_set_buffer) {
2143             bt_map[s] = 0;
2144             break;
2145          }
2146 
2147          /* This is a descriptor set buffer so the set index is actually
2148           * given by binding->binding.  (Yes, that's confusing.)
2149           */
2150          assert(set->desc_surface_mem.alloc_size);
2151          assert(set->desc_surface_state.alloc_size);
2152          bt_map[s] = set->desc_surface_state.offset + state_offset;
2153          add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
2154          break;
2155       }
2156 
2157       case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
2158          assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size);
2159          bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset +
2160                      state_offset;
2161          break;
2162       }
2163 
2164       default: {
2165          assert(binding->set < MAX_SETS);
2166          const struct anv_descriptor_set *set =
2167             pipe_state->descriptors[binding->set];
2168 
2169          if (binding->index >= set->descriptor_count) {
2170             /* From the Vulkan spec section entitled "DescriptorSet and
2171              * Binding Assignment":
2172              *
2173              *    "If the array is runtime-sized, then array elements greater
2174              *    than or equal to the size of that binding in the bound
2175              *    descriptor set must not be used."
2176              *
2177              * Unfortunately, the compiler isn't smart enough to figure out
2178              * when a dynamic binding isn't used so it may grab the whole
2179              * array and stick it in the binding table.  In this case, it's
2180              * safe to just skip those bindings that are OOB.
2181              */
2182             assert(binding->index < set->layout->descriptor_count);
2183             continue;
2184          }
2185 
2186          /* For push descriptors, if the binding is fully promoted to push
2187           * constants, just reference the null surface in the binding table.
2188           * It's unused and we didn't allocate/pack a surface state for it.
2189           */
2190          if (set->is_push) {
2191             uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
2192             assert(desc_idx < MAX_PUSH_DESCRIPTORS);
2193 
2194             if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
2195                surface_state =
2196                   anv_null_surface_state_for_binding_table(cmd_buffer->device);
2197                break;
2198             }
2199          }
2200 
2201          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2202          if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
2203              desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
2204             /* Nothing for us to do here */
2205             continue;
2206          }
2207 
2208          const struct anv_pipeline *pipeline = pipe_state->pipeline;
2209          uint32_t surface_state_offset;
2210          if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
2211             surface_state_offset =
2212                emit_indirect_descriptor_binding_table_entry(cmd_buffer,
2213                                                             pipe_state,
2214                                                             binding, desc);
2215          } else {
2216             assert(pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
2217                    pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
2218             surface_state_offset =
2219                emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
2220                                                           set, binding, desc);
2221          }
2222 
2223          bt_map[s] = surface_state_offset + state_offset;
2224          break;
2225       }
2226       }
2227    }
2228 
2229    return VK_SUCCESS;
2230 }
2231 
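/* Allocate dynamic state for the shader's sampler table and copy in the
 * packed SAMPLER_STATE dwords (16 bytes per sampler) from the bound
 * descriptors.
 */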
2232 static VkResult
2233 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2234               struct anv_cmd_pipeline_state *pipe_state,
2235               struct anv_shader_bin *shader,
2236               struct anv_state *state)
2237 {
2238    struct anv_pipeline_bind_map *map = &shader->bind_map;
2239    if (map->sampler_count == 0) {
2240       *state = (struct anv_state) { 0, };
2241       return VK_SUCCESS;
2242    }
2243 
2244    uint32_t size = map->sampler_count * 16;
2245    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2246 
2247    if (state->map == NULL)
2248       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2249 
2250    for (uint32_t s = 0; s < map->sampler_count; s++) {
2251       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2252       const struct anv_descriptor *desc =
2253          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2254 
2255       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2256           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2257          continue;
2258 
2259       struct anv_sampler *sampler = desc->sampler;
2260 
2261       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2262        * happens to be zero.
2263        */
2264       if (sampler == NULL)
2265          continue;
2266 
2267       memcpy(state->map + (s * 16), sampler->state[binding->plane],
2268              sizeof(sampler->state[0]));
2269    }
2270 
2271    return VK_SUCCESS;
2272 }
2273 
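/* (Re)emit the sampler and binding tables for every shader stage in `dirty`.
 * On VK_ERROR_OUT_OF_DEVICE_MEMORY, grab a new binding table block, re-emit
 * the BT pool base address, then re-emit the tables for all stages.
 * Returns the mask of stages whose tables were emitted.
 */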
2274 uint32_t
2275 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
2276                                        struct anv_cmd_pipeline_state *pipe_state,
2277                                        const VkShaderStageFlags dirty,
2278                                        struct anv_shader_bin **shaders,
2279                                        uint32_t num_shaders)
2280 {
2281    VkShaderStageFlags flushed = 0;
2282 
2283    VkResult result = VK_SUCCESS;
2284    for (uint32_t i = 0; i < num_shaders; i++) {
2285       if (!shaders[i])
2286          continue;
2287 
2288       gl_shader_stage stage = shaders[i]->stage;
2289       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2290       if ((vk_stage & dirty) == 0)
2291          continue;
2292 
2293       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2294       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2295                              &cmd_buffer->state.samplers[stage]);
2296       if (result != VK_SUCCESS)
2297          break;
2298 
2299       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2300       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2301                                   &cmd_buffer->state.binding_tables[stage]);
2302       if (result != VK_SUCCESS)
2303          break;
2304 
2305       flushed |= vk_stage;
2306    }
2307 
2308    if (result != VK_SUCCESS) {
2309       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2310 
2311       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2312       if (result != VK_SUCCESS)
2313          return 0;
2314 
2315       /* Re-emit the BT base address so we get the new surface state base
2316        * address before we start emitting binding tables etc.
2317        */
2318       genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
2319 
2320       /* Re-emit all active binding tables */
2321       flushed = 0;
2322 
2323       for (uint32_t i = 0; i < num_shaders; i++) {
2324          if (!shaders[i])
2325             continue;
2326 
2327          gl_shader_stage stage = shaders[i]->stage;
2328 
2329          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2330                                 &cmd_buffer->state.samplers[stage]);
2331          if (result != VK_SUCCESS) {
2332             anv_batch_set_error(&cmd_buffer->batch, result);
2333             return 0;
2334          }
2335          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2336                                      &cmd_buffer->state.binding_tables[stage]);
2337          if (result != VK_SUCCESS) {
2338             anv_batch_set_error(&cmd_buffer->batch, result);
2339             return 0;
2340          }
2341 
2342          flushed |= mesa_to_vk_shader_stage(stage);
2343       }
2344    }
2345 
2346    return flushed;
2347 }
2348 
2349 /* This function generates the surface state used to read the content of the
2350  * descriptor buffer.
2351  */
2352 void
2353 genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
2354                                                      struct anv_descriptor_set *set)
2355 {
2356    assert(set->desc_surface_state.map == NULL);
2357 
2358    struct anv_descriptor_set_layout *layout = set->layout;
2359    enum isl_format format =
2360       anv_isl_format_for_descriptor_type(cmd_buffer->device,
2361                                          VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2362 
2363    set->desc_surface_state =
2364       anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2365    if (set->desc_surface_state.map == NULL)
2366       return;
2367    anv_fill_buffer_surface_state(cmd_buffer->device,
2368                                  set->desc_surface_state.map,
2369                                  format, ISL_SWIZZLE_IDENTITY,
2370                                  ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2371                                  set->desc_surface_addr,
2372                                  layout->descriptor_buffer_surface_size, 1);
2373 }
2374 
2375 /* This function generates surface states used by a pipeline for push
2376  * descriptors. This is delayed to the draw/dispatch time to avoid allocation
2377  * and surface state generation when a pipeline is not going to use the
2378  * binding table to access any push descriptor data.
2379  */
2380 void
2381 genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
2382                                                struct anv_descriptor_set *set)
2383 {
2384    while (set->generate_surface_states) {
2385       int desc_idx = u_bit_scan(&set->generate_surface_states);
2386       struct anv_descriptor *desc = &set->descriptors[desc_idx];
2387       struct anv_buffer_view *bview = desc->set_buffer_view;
2388 
2389       if (bview != NULL && bview->general.state.map == NULL) {
2390          bview->general.state =
2391             anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2392          if (bview->general.state.map == NULL)
2393             return;
2394          anv_descriptor_write_surface_state(cmd_buffer->device, desc,
2395                                             bview->general.state);
2396       }
2397    }
2398 }
2399 
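/* Convenience wrapper around genX(batch_emit_pipe_control_write)() for
 * PIPE_CONTROLs without a post-sync write. Illustrative use (a sketch, not
 * taken from a real call site):
 *
 *    genX(batch_emit_pipe_control)(&cmd_buffer->batch,
 *                                  cmd_buffer->device->info,
 *                                  cmd_buffer->state.current_pipeline,
 *                                  ANV_PIPE_CS_STALL_BIT,
 *                                  "example stall");
 */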
2400 ALWAYS_INLINE void
2401 genX(batch_emit_pipe_control)(struct anv_batch *batch,
2402                               const struct intel_device_info *devinfo,
2403                               uint32_t current_pipeline,
2404                               enum anv_pipe_bits bits,
2405                               const char *reason)
2406 {
2407    genX(batch_emit_pipe_control_write)(batch,
2408                                        devinfo,
2409                                        current_pipeline,
2410                                        NoWrite,
2411                                        ANV_NULL_ADDRESS,
2412                                        0,
2413                                        bits,
2414                                        reason);
2415 }
2416 
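/* Emit a PIPE_CONTROL with an optional post-sync write. This is the central
 * place where PIPE_CONTROL-related workarounds are applied, so the bits
 * actually programmed may be a superset of the bits requested.
 */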
2417 ALWAYS_INLINE void
2418 genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
2419                                     const struct intel_device_info *devinfo,
2420                                     uint32_t current_pipeline,
2421                                     uint32_t post_sync_op,
2422                                     struct anv_address address,
2423                                     uint32_t imm_data,
2424                                     enum anv_pipe_bits bits,
2425                                     const char *reason)
2426 {
2427    if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
2428        (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
2429       unreachable("Trying to emit unsupported PIPE_CONTROL command.");
2430 
2431    const bool trace_flush =
2432       (bits & (ANV_PIPE_FLUSH_BITS |
2433                ANV_PIPE_STALL_BITS |
2434                ANV_PIPE_INVALIDATE_BITS |
2435                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
2436    if (trace_flush && batch->trace != NULL) {
2437       // Store pipe control reasons if there is enough space
2438       if (batch->pc_reasons_count < ARRAY_SIZE(batch->pc_reasons)) {
2439          batch->pc_reasons[batch->pc_reasons_count++] = reason;
2440       }
2441       trace_intel_begin_stall(batch->trace);
2442    }
2443 
2444 
2445    /* XXX - insert all workarounds and GFX specific things below. */
2446 
2447    /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
2448     * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
2449     * with CS_STALL Bit set (with No POST_SYNC ENABLED)
2450     */
2451    if (intel_device_info_is_adln(devinfo) &&
2452        current_pipeline == GPGPU &&
2453        post_sync_op != NoWrite) {
2454       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2455          pipe.CommandStreamerStallEnable = true;
2456          anv_debug_dump_pc(pipe, "Wa_14014966230");
2457       };
2458    }
2459 
2460    /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
2461     * PIPE_CONTROL, Flush Types:
2462     *   "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
2463     * For newer platforms this is documented in the PIPE_CONTROL instruction
2464     * page.
2465     */
2466    if (current_pipeline == GPGPU &&
2467        (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
2468       bits |= ANV_PIPE_CS_STALL_BIT;
2469 
2470 #if INTEL_NEEDS_WA_1409600907
2471    /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2472     * be set with any PIPE_CONTROL with Depth Flush Enable bit set."
2473     */
2474    if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
2475       bits |= ANV_PIPE_DEPTH_STALL_BIT;
2476 #endif
2477 
2478 #if GFX_VERx10 >= 125
2479    if (current_pipeline != GPGPU) {
2480       if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
2481          bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2482    } else {
2483       if (bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
2484                   ANV_PIPE_DATA_CACHE_FLUSH_BIT))
2485          bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2486    }
2487 
2488    /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
2489     *
2490     *    "'HDC Pipeline Flush' bit must be set for this bit to take
2491     *     effect."
2492     */
2493    if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
2494       bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2495 #endif
2496 
2497 #if GFX_VER < 12
2498    if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
2499       bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2500 #endif
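
   /* Worked example of the promotions above (illustrative only): on a
    * GFX_VERx10 == 125 part with current_pipeline == _3D, a caller asking for
    * just ANV_PIPE_HDC_PIPELINE_FLUSH_BIT ends up with both
    * ANV_PIPE_HDC_PIPELINE_FLUSH_BIT and
    * ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT set, while on GFX_VER < 12 the
    * same request additionally sets ANV_PIPE_DATA_CACHE_FLUSH_BIT (the HDC
    * flush field only exists on Gfx12+).
    */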
2501 
2502    /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2503     *
2504     *    "If the VF Cache Invalidation Enable is set to a 1 in a
2505     *    PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2506     *    0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2507     *    prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2508     *    a 1."
2509     *
2510     * This appears to hang Broadwell, so we restrict it to just gfx9.
2511     */
2512    if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2513       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2514 
2515 #if GFX_VER >= 9 && GFX_VER <= 11
2516    /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2517     *
2518     *    "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
2519     *     always set for GPGPU workloads when “Texture Cache
2520     *     Invalidation Enable” bit is set".
2521     *
2522     * Workaround stopped appearing in TGL PRMs.
2523     */
2524    if (current_pipeline == GPGPU &&
2525        (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
2526       bits |= ANV_PIPE_CS_STALL_BIT;
2527 #endif
2528 
2529    anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2530 #if GFX_VERx10 >= 125
2531       pipe.UntypedDataPortCacheFlushEnable =
2532          bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2533       pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
2534 #endif
2535 #if GFX_VER == 12
2536       pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2537 #endif
2538 #if GFX_VER > 11
2539       pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2540 #endif
2541       pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2542       pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2543       pipe.RenderTargetCacheFlushEnable =
2544          bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2545 
2546       pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2547 
2548       pipe.TLBInvalidate = bits & ANV_PIPE_TLB_INVALIDATE_BIT;
2549 
2550 #if GFX_VERx10 >= 125
2551       pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2552 #endif
2553       pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2554       pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2555 
2556       pipe.StateCacheInvalidationEnable =
2557          bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2558       pipe.ConstantCacheInvalidationEnable =
2559          bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2560 #if GFX_VER >= 12
2561       /* Invalidates the L3 cache part in which index & vertex data is loaded
2562        * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2563        */
2564       pipe.L3ReadOnlyCacheInvalidationEnable =
2565          bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2566 #endif
2567       pipe.VFCacheInvalidationEnable =
2568          bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2569       pipe.TextureCacheInvalidationEnable =
2570          bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2571       pipe.InstructionCacheInvalidateEnable =
2572          bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2573 
2574       pipe.PostSyncOperation = post_sync_op;
2575       pipe.Address = address;
2576       pipe.DestinationAddressType = DAT_PPGTT;
2577       pipe.ImmediateData = imm_data;
2578 
2579       anv_debug_dump_pc(pipe, reason);
2580    }
2581 
2582    if (trace_flush && batch->trace != NULL) {
2583          trace_intel_end_stall(batch->trace, bits,
2584                                anv_pipe_flush_bit_to_ds_stall_flag,
2585                                batch->pc_reasons[0],
2586                                batch->pc_reasons[1],
2587                                batch->pc_reasons[2],
2588                                batch->pc_reasons[3]);
2589          batch->pc_reasons[0] = NULL;
2590          batch->pc_reasons[1] = NULL;
2591          batch->pc_reasons[2] = NULL;
2592          batch->pc_reasons[3] = NULL;
2593          batch->pc_reasons_count = 0;
2594    }
2595 }
2596 
2597 /* Set preemption on/off. */
2598 void
2599 genX(batch_set_preemption)(struct anv_batch *batch,
2600                            const struct intel_device_info *devinfo,
2601                            uint32_t current_pipeline,
2602                            bool value)
2603 {
2604 #if INTEL_WA_16013994831_GFX_VER
2605    if (!intel_needs_workaround(devinfo, 16013994831))
2606       return;
2607 
2608    anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
2609       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
2610       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
2611    }
2612 
2613    /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
2614    genx_batch_emit_pipe_control(batch, devinfo, current_pipeline,
2615                                 ANV_PIPE_CS_STALL_BIT);
2616 
2617    for (unsigned i = 0; i < 250; i++)
2618       anv_batch_emit(batch, GENX(MI_NOOP), noop);
2619 #endif
2620 }
2621 
2622 void
2623 genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
2624 {
2625 #if GFX_VERx10 >= 120
2626    if (cmd_buffer->state.gfx.object_preemption == value)
2627       return;
2628 
2629    genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device->info,
2630                               cmd_buffer->state.current_pipeline,
2631                               value);
2632    cmd_buffer->state.gfx.object_preemption = value;
2633 #endif
2634 }
2635 
2636 ALWAYS_INLINE static void
2637 update_descriptor_set_surface_state(struct anv_cmd_buffer *cmd_buffer,
2638                                     struct anv_cmd_pipeline_state *pipe_state,
2639                                     uint32_t set_idx)
2640 {
2641    if (!pipe_state->descriptor_buffers[set_idx].bound)
2642       return;
2643 
2644    const struct anv_physical_device *device = cmd_buffer->device->physical;
2645    const int32_t buffer_index =
2646       pipe_state->descriptor_buffers[set_idx].buffer_index;
2647    const struct anv_va_range *push_va_range =
2648       GFX_VERx10 >= 125 ?
2649       &device->va.push_descriptor_buffer_pool :
2650       &device->va.internal_surface_state_pool;
2651    const struct anv_va_range *va_range =
2652       buffer_index == -1 ? push_va_range : &device->va.dynamic_visible_pool;
2653    const uint64_t descriptor_set_addr =
2654       (buffer_index == -1 ? va_range->addr :
2655        cmd_buffer->state.descriptor_buffers.address[buffer_index]) +
2656       pipe_state->descriptor_buffers[set_idx].buffer_offset;
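   /* The surface must not extend past the VA range it lives in, nor past the
    * bindless heap size limit. Illustrative numbers only: if the set starts
    * 4 KiB into a 1 MiB range and the bindless heap limit is 64 KiB, the
    * surface covers MIN2(1 MiB - 4 KiB, 64 KiB) = 64 KiB.
    */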
2657    const uint64_t set_size =
2658       MIN2(va_range->size - (descriptor_set_addr - va_range->addr),
2659            anv_physical_device_bindless_heap_size(device, true));
2660 
2661    if (descriptor_set_addr != pipe_state->descriptor_buffers[set_idx].address) {
2662       pipe_state->descriptor_buffers[set_idx].address = descriptor_set_addr;
2663 
2664       struct anv_state surface_state =
2665          anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2666       const enum isl_format format =
2667          anv_isl_format_for_descriptor_type(cmd_buffer->device,
2668                                             VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2669       anv_fill_buffer_surface_state(
2670          cmd_buffer->device, surface_state.map,
2671          format, ISL_SWIZZLE_IDENTITY,
2672          ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2673          anv_address_from_u64(pipe_state->descriptor_buffers[set_idx].address),
2674          set_size, 1);
2675 
2676       pipe_state->descriptor_buffers[set_idx].state = surface_state;
2677    }
2678 }
2679 
2680 ALWAYS_INLINE static uint32_t
2681 compute_descriptor_set_surface_offset(const struct anv_cmd_buffer *cmd_buffer,
2682                                       const struct anv_cmd_pipeline_state *pipe_state,
2683                                       const uint32_t set_idx)
2684 {
2685    const struct anv_physical_device *device = cmd_buffer->device->physical;
2686 
2687    if (device->uses_ex_bso) {
2688       int32_t buffer_index =
2689          pipe_state->descriptor_buffers[set_idx].buffer_index;
2690       uint64_t buffer_address =
2691          buffer_index == -1 ?
2692          device->va.push_descriptor_buffer_pool.addr :
2693          cmd_buffer->state.descriptor_buffers.address[buffer_index];
2694 
2695       return (buffer_address - device->va.dynamic_visible_pool.addr) +
2696               pipe_state->descriptor_buffers[set_idx].buffer_offset;
2697    }
2698 
2699    return pipe_state->descriptor_buffers[set_idx].buffer_offset << 6;
2700 }
2701 
2702 ALWAYS_INLINE static uint32_t
2703 compute_descriptor_set_sampler_offset(const struct anv_cmd_buffer *cmd_buffer,
2704                                       const struct anv_cmd_pipeline_state *pipe_state,
2705                                       const uint32_t set_idx)
2706 {
2707    const struct anv_physical_device *device = cmd_buffer->device->physical;
2708    int32_t buffer_index =
2709       pipe_state->descriptor_buffers[set_idx].buffer_index;
2710    uint64_t buffer_address =
2711       buffer_index == -1 ?
2712       device->va.push_descriptor_buffer_pool.addr :
2713       cmd_buffer->state.descriptor_buffers.address[buffer_index];
2714 
2715    return (buffer_address - device->va.dynamic_state_pool.addr) +
2716       pipe_state->descriptor_buffers[set_idx].buffer_offset;
2717 }
2718 
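/* Flushes descriptor buffer (VK_EXT_descriptor_buffer) state: re-emits
 * STATE_BASE_ADDRESS when the descriptor buffer mode/bindings changed, and,
 * in buffer mode, refreshes the per-set surface states and stores the
 * surface/sampler offsets computed by the helpers above into the push
 * constants.
 */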
2719 void
2720 genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
2721                                struct anv_cmd_pipeline_state *pipe_state)
2722 {
2723    /* On Gfx12.5+ the STATE_BASE_ADDRESS BindlessSurfaceStateBaseAddress &
2724     * DynamicStateBaseAddress are fixed. So as long as we stay in one
2725     * descriptor buffer mode, there is no need to switch.
2726     */
2727 #if GFX_VERx10 >= 125
2728    if (cmd_buffer->state.current_db_mode !=
2729        cmd_buffer->state.pending_db_mode)
2730       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2731 #else
2732    if (cmd_buffer->state.descriptor_buffers.dirty)
2733       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2734 #endif
2735 
2736    assert(cmd_buffer->state.current_db_mode !=
2737           ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
2738    if (cmd_buffer->state.current_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER &&
2739        (cmd_buffer->state.descriptor_buffers.dirty ||
2740         (pipe_state->pipeline->active_stages &
2741          cmd_buffer->state.descriptor_buffers.offsets_dirty) != 0)) {
2742       struct anv_push_constants *push_constants =
2743          &pipe_state->push_constants;
2744       for (uint32_t i = 0; i < ARRAY_SIZE(push_constants->desc_surface_offsets); i++) {
2745          update_descriptor_set_surface_state(cmd_buffer, pipe_state, i);
2746 
2747          push_constants->desc_surface_offsets[i] =
2748             compute_descriptor_set_surface_offset(cmd_buffer, pipe_state, i);
2749          push_constants->desc_sampler_offsets[i] =
2750             compute_descriptor_set_sampler_offset(cmd_buffer, pipe_state, i);
2751       }
2752 
2753 #if GFX_VERx10 < 125
2754       struct anv_device *device = cmd_buffer->device;
2755       push_constants->surfaces_base_offset =
2756          (cmd_buffer->state.descriptor_buffers.surfaces_address -
2757           device->physical->va.dynamic_visible_pool.addr);
2758 #endif
2759 
2760       cmd_buffer->state.push_constants_dirty |=
2761          (cmd_buffer->state.descriptor_buffers.offsets_dirty &
2762           pipe_state->pipeline->active_stages);
2763       pipe_state->push_constants_data_dirty = true;
2764       cmd_buffer->state.descriptor_buffers.offsets_dirty &=
2765          ~pipe_state->pipeline->active_stages;
2766    }
2767 
2768    cmd_buffer->state.descriptor_buffers.dirty = false;
2769 }
2770 
2771 void
2772 genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer,
2773                                  VkCommandBufferLevel level)
2774 {
2775    cmd_buffer->vk.level = level;
2776    cmd_buffer->is_companion_rcs_cmd_buffer = true;
2777 
2778    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
2779 
2780 #if GFX_VER >= 12
2781    /* Reenable prefetching at the beginning of secondary command buffers. We
2782     * do this so that the edited return instruction is not prefetched before
2783     * the edit has completed.
2784     */
2785    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2786       anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
2787          arb.PreParserDisableMask = true;
2788          arb.PreParserDisable = false;
2789       }
2790    }
2791 #endif
2792 
2793    /* A companion command buffer is only used for blorp commands at the
2794     * moment, so default to the legacy mode.
2795     */
2796    cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
2797    genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
2798 
2799    /* Invalidate the aux table in every primary command buffer. This ensures
2800     * the command buffer sees the last updates made by the host.
2801     */
2802    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2803        cmd_buffer->device->info->has_aux_map) {
2804       anv_add_pending_pipe_bits(cmd_buffer,
2805                                 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2806                                 "new cmd buffer with aux-tt");
2807    }
2808 }
2809 
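/* The three helpers below partition isl_aux_op values into the classes used
 * by genX(cmd_buffer_update_color_aux_op)(): clears (FAST_CLEAR, AMBIGUATE),
 * resolves (FULL_RESOLVE, PARTIAL_RESOLVE) and regular rendering (NONE).
 */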
2810 static bool
2811 aux_op_resolves(enum isl_aux_op aux_op)
2812 {
2813    return aux_op == ISL_AUX_OP_FULL_RESOLVE ||
2814           aux_op == ISL_AUX_OP_PARTIAL_RESOLVE;
2815 }
2816 
2817 static bool
2818 aux_op_clears(enum isl_aux_op aux_op)
2819 {
2820    return aux_op == ISL_AUX_OP_FAST_CLEAR ||
2821           aux_op == ISL_AUX_OP_AMBIGUATE;
2822 }
2823 
2824 static bool
2825 aux_op_renders(enum isl_aux_op aux_op)
2826 {
2827    return aux_op == ISL_AUX_OP_NONE;
2828 }
2829 
2830 static void
2831 add_pending_pipe_bits_for_color_aux_op(struct anv_cmd_buffer *cmd_buffer,
2832                                        enum isl_aux_op next_aux_op,
2833                                        enum anv_pipe_bits pipe_bits)
2834 {
2835    const enum isl_aux_op last_aux_op = cmd_buffer->state.color_aux_op;
2836    assert(next_aux_op != last_aux_op);
2837 
2838    char flush_reason[64] = {};
2839    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) ||
2840        u_trace_enabled(&cmd_buffer->device->ds.trace_context)) {
2841       int ret = snprintf(flush_reason, sizeof(flush_reason),
2842                          "color aux-op: %s -> %s",
2843                          isl_aux_op_to_name(last_aux_op),
2844                          isl_aux_op_to_name(next_aux_op));
2845       assert(ret < sizeof(flush_reason));
2846    }
2847 
2848    anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, flush_reason);
2849 }
2850 
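/* Tracks which class of color auxiliary operation the command buffer is about
 * to perform and queues (as pending pipe bits) the synchronization the
 * hardware documentation requires between classes. Illustrative sequence:
 * going from regular rendering (ISL_AUX_OP_NONE) to ISL_AUX_OP_FAST_CLEAR
 * queues the pre-clear flushes below, and switching back to ISL_AUX_OP_NONE
 * afterwards queues the post-clear flushes.
 */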
2851 void
2852 genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
2853                                      enum isl_aux_op next_aux_op)
2854 {
2855    const enum isl_aux_op last_aux_op = cmd_buffer->state.color_aux_op;
2856 
2857    if (!aux_op_clears(last_aux_op) && aux_op_clears(next_aux_op)) {
2858 #if GFX_VER >= 20
2859       /* From the Xe2 Bspec 57340 (r59562),
2860        * "MCS/CCS Buffers, Fast Clear for Render Target(s)":
2861        *
2862        *    Synchronization:
2863        *    Due to interaction of scaled clearing rectangle with pixel
2864        *    scoreboard, we require one of the following commands to be
2865        *    issued. [...]
2866        *
2867        *    PIPE_CONTROL
2868        *    PSS Stall Sync Enable            [...] 1b (Enable)
2869        *       Machine-wide Stall at Pixel Stage, wait for all Prior Pixel
2870        *       Work to Reach End of Pipe
2871        *    Render Target Cache Flush Enable [...] 1b (Enable)
2872        *       Post-Sync Op Flushes Render Cache before Unblocking Stall
2873        *
2874        *    This synchronization step is required before and after the fast
2875        *    clear pass, to ensure correct ordering between pixels.
2876        */
2877       add_pending_pipe_bits_for_color_aux_op(
2878             cmd_buffer, next_aux_op,
2879             ANV_PIPE_PSS_STALL_SYNC_BIT |
2880             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2881 
2882 #elif GFX_VERx10 == 125
2883       /* From the ACM Bspec 47704 (r52663), "Render Target Fast Clear":
2884        *
2885        *    Preamble pre fast clear synchronization
2886        *
2887        *    PIPE_CONTROL:
2888        *    PS sync stall = 1
2889        *    Tile Cache Flush = 1
2890        *    RT Write Flush = 1
2891        *    HDC Flush = 1
2892        *    DC Flush = 1
2893        *    Texture Invalidate = 1
2894        *
2895        *    [...]
2896        *
2897        *    Objective of the preamble flushes is to ensure all data is
2898        *    evicted from L1 caches prior to fast clear.
2899        *
2900        * From the ACM PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
2901        *
2902        *    Any transition from any value in {Clear, Render, Resolve} to a
2903        *    different value in {Clear, Render, Resolve} requires end of pipe
2904        *    synchronization.
2905        */
2906       add_pending_pipe_bits_for_color_aux_op(
2907             cmd_buffer, next_aux_op,
2908             ANV_PIPE_PSS_STALL_SYNC_BIT |
2909             ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2910             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2911             ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
2912             ANV_PIPE_DATA_CACHE_FLUSH_BIT |
2913             ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
2914             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2915 
2916 #elif GFX_VERx10 == 120
2917       /* From the TGL Bspec 47704 (r52663), "Render Target Fast Clear":
2918        *
2919        *    Preamble pre fast clear synchronization
2920        *
2921        *    PIPE_CONTROL:
2922        *    Depth Stall = 1
2923        *    Tile Cache Flush = 1
2924        *    RT Write Flush = 1
2925        *    Texture Invalidate = 1
2926        *
2927        *    [...]
2928        *
2929        *    Objective of the preamble flushes is to ensure all data is
2930        *    evicted from L1 caches prior to fast clear.
2931        *
2932        * From the TGL PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
2933        *
2934        *    Any transition from any value in {Clear, Render, Resolve} to a
2935        *    different value in {Clear, Render, Resolve} requires end of pipe
2936        *    synchronization.
2937        */
2938       add_pending_pipe_bits_for_color_aux_op(
2939             cmd_buffer, next_aux_op,
2940             ANV_PIPE_DEPTH_STALL_BIT  |
2941             ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2942             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2943             ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
2944             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2945 
2946 #else
2947       /* From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
2948        *
2949        *    Any transition from any value in {Clear, Render, Resolve} to a
2950        *    different value in {Clear, Render, Resolve} requires end of pipe
2951        *    synchronization.
2952        *
2953        * From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
2954        *
2955        *    After Render target fast clear, pipe-control with color cache
2956        *    write-flush must be issued before sending any DRAW commands on
2957        *    that render target.
2958        *
2959        * The last comment is a bit cryptic and doesn't really tell you what's
2960       * going on or what's really needed.  It appears that fast clear ops are
2961        * not properly synchronized with other drawing.  This means that we
2962        * cannot have a fast clear operation in the pipe at the same time as
2963        * other regular drawing operations.  We need to use a PIPE_CONTROL
2964        * to ensure that the contents of the previous draw hit the render
2965        * target before we resolve and then use a second PIPE_CONTROL after
2966        * the resolve to ensure that it is completed before any additional
2967        * drawing occurs.
2968        */
2969       add_pending_pipe_bits_for_color_aux_op(
2970             cmd_buffer, next_aux_op,
2971             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2972             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2973 #endif
2974 
2975    } else if (aux_op_clears(last_aux_op) && !aux_op_clears(next_aux_op)) {
2976 #if GFX_VER >= 20
2977       /* From the Xe2 Bspec 57340 (r59562),
2978        * "MCS/CCS Buffers, Fast Clear for Render Target(s)":
2979        *
2980        *    Synchronization:
2981        *    Due to interaction of scaled clearing rectangle with pixel
2982        *    scoreboard, we require one of the following commands to be
2983        *    issued. [...]
2984        *
2985        *    PIPE_CONTROL
2986        *    PSS Stall Sync Enable            [...] 1b (Enable)
2987        *       Machine-wide Stall at Pixel Stage, wait for all Prior Pixel
2988        *       Work to Reach End of Pipe
2989        *    Render Target Cache Flush Enable [...] 1b (Enable)
2990        *       Post-Sync Op Flushes Render Cache before Unblocking Stall
2991        *
2992        *    This synchronization step is required before and after the fast
2993        *    clear pass, to ensure correct ordering between pixels.
2994        */
2995       add_pending_pipe_bits_for_color_aux_op(
2996             cmd_buffer, next_aux_op,
2997             ANV_PIPE_PSS_STALL_SYNC_BIT |
2998             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2999 
3000 #elif GFX_VERx10 == 125
3001       /* From the ACM PRM Vol. 9, "Color Fast Clear Synchronization":
3002        *
3003        *    Postamble post fast clear synchronization
3004        *
3005        *    PIPE_CONTROL:
3006        *    PS sync stall = 1
3007        *    RT flush = 1
3008        *
3009        * From the ACM PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
3010        *
3011        *    Any transition from any value in {Clear, Render, Resolve} to a
3012        *    different value in {Clear, Render, Resolve} requires end of pipe
3013        *    synchronization.
3014        */
3015       add_pending_pipe_bits_for_color_aux_op(
3016             cmd_buffer, next_aux_op,
3017             ANV_PIPE_PSS_STALL_SYNC_BIT |
3018             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3019             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3020 
3021 #elif GFX_VERx10 == 120
3022       /* From the TGL PRM Vol. 9, "Color Fast Clear Synchronization":
3023        *
3024        *    Postamble post fast clear synchronization
3025        *
3026        *    PIPE_CONTROL:
3027        *    Depth Stall = 1
3028        *    Tile Cache Flush = 1
3029        *    RT Write Flush = 1
3030        *
3031        * From the TGL PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
3032        *
3033        *    Any transition from any value in {Clear, Render, Resolve} to a
3034        *    different value in {Clear, Render, Resolve} requires end of pipe
3035        *    synchronization.
3036        *
3037        */
3038       add_pending_pipe_bits_for_color_aux_op(
3039             cmd_buffer, next_aux_op,
3040             ANV_PIPE_DEPTH_STALL_BIT |
3041             ANV_PIPE_TILE_CACHE_FLUSH_BIT |
3042             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3043             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3044 
3045 #else
3046       /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
3047        *
3048        *    After Render target fast clear, pipe-control with color cache
3049        *    write-flush must be issued before sending any DRAW commands on
3050        *    that render target.
3051        *
3052        * From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
3053        *
3054        *    Any transition from any value in {Clear, Render, Resolve} to a
3055        *    different value in {Clear, Render, Resolve} requires end of pipe
3056        *    synchronization.
3057        */
3058       add_pending_pipe_bits_for_color_aux_op(
3059             cmd_buffer, next_aux_op,
3060             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3061             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3062 #endif
3063 
3064    } else if (aux_op_renders(last_aux_op) != aux_op_renders(next_aux_op)) {
3065       assert(aux_op_resolves(last_aux_op) != aux_op_resolves(next_aux_op));
3066       /* From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
3067        *
3068        *    Any transition from any value in {Clear, Render, Resolve} to a
3069        *    different value in {Clear, Render, Resolve} requires end of pipe
3070        *    synchronization.
3071        *
3072        * We perform a flush of the write cache before and after the clear and
3073        * resolve operations to meet this requirement.
3074        *
3075        * Unlike other drawing, fast clear operations are not properly
3076        * synchronized. The first PIPE_CONTROL here likely ensures that the
3077        * contents of the previous render or clear hit the render target before
3078        * we resolve and the second likely ensures that the resolve is complete
3079        * before we do any more rendering or clearing.
3080        */
3081       add_pending_pipe_bits_for_color_aux_op(
3082             cmd_buffer, next_aux_op,
3083             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3084             ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3085    }
3086 
3087    if (last_aux_op != ISL_AUX_OP_FAST_CLEAR &&
3088        next_aux_op == ISL_AUX_OP_FAST_CLEAR &&
3089        cmd_buffer->device->isl_dev.ss.clear_color_state_size > 0) {
3090       /* From the ICL PRM Vol. 9, "State Caching":
3091        *
3092        *    Any values referenced by pointers within the RENDER_SURFACE_STATE
3093        *    [...] (e.g. Clear Color Pointer, [...]) are considered to be part
3094        *    of that state and any changes to these referenced values requires
3095        *    an invalidation of the L1 state cache to ensure the new values are
3096        *    being used as part of the state. [...]
3097        *
3098        * We could alternatively perform this invalidation when we stop
3099        * fast-clearing. A benefit to doing it now, when transitioning to a
3100        * fast clear, is that we save a pipe control by combining the state
3101        * cache invalidation with the texture cache invalidation done on gfx12.
3102        */
3103       anv_add_pending_pipe_bits(cmd_buffer,
3104                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
3105                                 "Invalidate for new clear color");
3106    }
3107 
3108    /* Update the auxiliary surface operation, but with one exception. */
3109    if (last_aux_op == ISL_AUX_OP_FAST_CLEAR &&
3110        next_aux_op == ISL_AUX_OP_AMBIGUATE) {
3111       assert(aux_op_clears(last_aux_op) && aux_op_clears(next_aux_op));
3112       /* Fast clears and ambiguates are in the same class of operation, but
3113        * fast clears have more stringent synchronization requirements. For
3114        * better performance, don't replace the current fast clear operation
3115        * state with ambiguate. This allows us to perform one state cache
3116        * invalidation when leaving a sequence which alternates between
3117        * ambiguates and clears, instead of multiple such invalidations.
3118        */
3119    } else {
3120       cmd_buffer->state.color_aux_op = next_aux_op;
3121    }
3122 }
3123 
3124 static void
3125 genX(cmd_buffer_set_protected_memory)(struct anv_cmd_buffer *cmd_buffer,
3126                                       bool enabled)
3127 {
3128 #if GFX_VER >= 12
3129    if (enabled) {
3130       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
3131          /* Default value for single session. */
3132          appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
3133          appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
3134       }
3135    }
3136    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3137       pc.PipeControlFlushEnable = true;
3138       pc.DCFlushEnable = true;
3139       pc.RenderTargetCacheFlushEnable = true;
3140       pc.CommandStreamerStallEnable = true;
3141       if (enabled)
3142          pc.ProtectedMemoryEnable = true;
3143       else
3144          pc.ProtectedMemoryDisable = true;
3145    }
3146 #else
3147    unreachable("Protected content not supported");
3148 #endif
3149 }
3150 
3151 VkResult
3152 genX(BeginCommandBuffer)(
3153     VkCommandBuffer                             commandBuffer,
3154     const VkCommandBufferBeginInfo*             pBeginInfo)
3155 {
3156    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3157    VkResult result;
3158 
3159    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
3160     * command buffer's state. Otherwise, we must *reset* its state. In both
3161     * cases we reset it.
3162     *
3163     * From the Vulkan 1.0 spec:
3164     *
3165     *    If a command buffer is in the executable state and the command buffer
3166     *    was allocated from a command pool with the
3167     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
3168     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
3169     *    as if vkResetCommandBuffer had been called with
3170     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
3171     *    the command buffer in the recording state.
3172     */
3173    anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
3174    anv_cmd_buffer_reset_rendering(cmd_buffer);
3175 
3176    cmd_buffer->usage_flags = pBeginInfo->flags;
3177 
3178    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3179     * primary level command buffers.
3180     *
3181     * From the Vulkan 1.0 spec:
3182     *
3183     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3184     *    secondary command buffer is considered to be entirely inside a render
3185     *    pass. If this is a primary command buffer, then this bit is ignored.
3186     */
3187    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
3188       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3189 
3190 #if GFX_VER >= 12
3191    /* Reenable prefetching at the beginning of secondary command buffers. We
3192     * do this so that the edited return instruction is not prefetched before
3193     * the edit has completed.
3194     */
3195    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3196       anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
3197          arb.PreParserDisableMask = true;
3198          arb.PreParserDisable = false;
3199       }
3200    }
3201 #endif
3202 
3203    /* Assume the viewport has already been set in primary command buffers. */
3204    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
3205       cmd_buffer->state.gfx.viewport_set = true;
3206 
3207    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
3208 
3209    if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
3210        anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3211       /* Invalidate the aux table in every primary command buffer. This
3212        * ensures the command buffer sees the last updates made by the host.
3213        */
3214       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3215           cmd_buffer->device->info->has_aux_map) {
3216          anv_add_pending_pipe_bits(cmd_buffer,
3217                                    ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
3218                                    "new cmd buffer with aux-tt");
3219       }
3220       return VK_SUCCESS;
3221    }
3222 
3223 #if GFX_VER >= 12
3224    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3225        cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3226       genX(cmd_buffer_set_protected_memory)(cmd_buffer, true);
3227 #endif
3228 
3229    if (cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer) {
3230       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
3231    } else {
3232       cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
3233       genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
3234    }
3235 
3236    /* We sometimes store vertex data in the dynamic state buffer for blorp
3237     * operations and our dynamic state stream may re-use data from previous
3238     * command buffers.  In order to prevent stale cache data, we flush the VF
3239     * cache.  We could do this on every blorp call but that's not really
3240     * needed as all of the data will get written by the CPU prior to the GPU
3241     * executing anything.  The chances are fairly high that they will use
3242     * blorp at least once per primary command buffer so it shouldn't be
3243     * wasted.
3244     *
3245     * There is also a workaround on gfx8 which requires us to invalidate the
3246     * VF cache occasionally.  It's easier if we can assume we start with a
3247     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
3248     */
3249    anv_add_pending_pipe_bits(cmd_buffer,
3250                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
3251                              "new cmd buffer");
3252 
3253    /* Invalidate the aux table in every primary command buffer. This ensures
3254     * the command buffer sees the last updates made by the host.
3255     */
3256    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3257        cmd_buffer->device->info->has_aux_map) {
3258       anv_add_pending_pipe_bits(cmd_buffer,
3259                                 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
3260                                 "new cmd buffer with aux-tt");
3261    }
3262 
3263    /* We send an "Indirect State Pointers Disable" packet at
3264     * EndCommandBuffer, so all push constant packets are ignored during a
3265     * context restore. Documentation says after that command, we need to
3266     * emit push constants again before any rendering operation. So we
3267     * flag them dirty here to make sure they get emitted.
3268     */
3269    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
3270    cmd_buffer->state.gfx.base.push_constants_data_dirty = true;
3271 
3272    if (cmd_buffer->usage_flags &
3273        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3274       struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3275 
3276       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
3277       const VkRenderingInfo *resume_info =
3278          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
3279                                                                pBeginInfo,
3280                                                                gcbiar_data);
3281       if (resume_info != NULL) {
3282          genX(CmdBeginRendering)(commandBuffer, resume_info);
3283       } else {
3284          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
3285             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
3286                                                              pBeginInfo);
3287          assert(inheritance_info);
3288 
3289          gfx->rendering_flags = inheritance_info->flags;
3290          gfx->render_area = (VkRect2D) { };
3291          gfx->layer_count = 0;
3292          gfx->samples = inheritance_info->rasterizationSamples;
3293          gfx->view_mask = inheritance_info->viewMask;
3294 
3295          uint32_t color_att_count = inheritance_info->colorAttachmentCount;
3296          result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
3297          if (result != VK_SUCCESS)
3298             return result;
3299 
3300          for (uint32_t i = 0; i < color_att_count; i++) {
3301             gfx->color_att[i].vk_format =
3302                inheritance_info->pColorAttachmentFormats[i];
3303          }
3304          gfx->depth_att.vk_format =
3305             inheritance_info->depthAttachmentFormat;
3306          gfx->stencil_att.vk_format =
3307             inheritance_info->stencilAttachmentFormat;
3308 
3309          anv_cmd_graphic_state_update_has_uint_rt(gfx);
3310 
3311          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
3312                                         ANV_CMD_DIRTY_RENDER_TARGETS;
3313       }
3314    }
3315 
3316    /* Emit the sample pattern at the beginning of the batch because the
3317     * default locations emitted at the device initialization might have been
3318     * changed by a previous command buffer.
3319     *
3320     * Do not change that when we're continuing a previous renderpass.
3321     */
3322    if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
3323        !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
3324       genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
3325 
3326    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3327       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
3328          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
3329 
3330       /* If secondary buffer supports conditional rendering
3331        * we should emit commands as if conditional rendering is enabled.
3332        */
3333       cmd_buffer->state.conditional_render_enabled =
3334          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
3335 
3336       if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
3337          cmd_buffer->state.gfx.n_occlusion_queries = 1;
3338          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
3339       }
3340    }
3341 
3342    return VK_SUCCESS;
3343 }
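
/* Application-side sketch (illustrative only, not driver code) of the
 * inheritance path handled above: beginning a secondary command buffer that
 * continues dynamic rendering with one color attachment. Formats and sample
 * count are made up for the example.
 *
 *    VkCommandBufferInheritanceRenderingInfo rendering_info = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDERING_INFO,
 *       .colorAttachmentCount = 1,
 *       .pColorAttachmentFormats = (VkFormat[]) { VK_FORMAT_B8G8R8A8_UNORM },
 *       .depthAttachmentFormat = VK_FORMAT_UNDEFINED,
 *       .stencilAttachmentFormat = VK_FORMAT_UNDEFINED,
 *       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
 *    };
 *    VkCommandBufferInheritanceInfo inheritance = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
 *       .pNext = &rendering_info,
 *    };
 *    VkCommandBufferBeginInfo begin_info = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
 *       .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
 *       .pInheritanceInfo = &inheritance,
 *    };
 *    vkBeginCommandBuffer(secondary_cmd_buffer, &begin_info);
 */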
3344 
3345 /* From the PRM, Volume 2a:
3346  *
3347  *    "Indirect State Pointers Disable
3348  *
3349  *    At the completion of the post-sync operation associated with this pipe
3350  *    control packet, the indirect state pointers in the hardware are
3351  *    considered invalid; the indirect pointers are not saved in the context.
3352  *    If any new indirect state commands are executed in the command stream
3353  *    while the pipe control is pending, the new indirect state commands are
3354  *    preserved.
3355  *
3356  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
3357  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
3358  *    commands are only considered as Indirect State Pointers. Once ISP is
3359  *    issued in a context, SW must initialize by programming push constant
3360  *    commands for all the shaders (at least to zero length) before attempting
3361  *    any rendering operation for the same context."
3362  *
3363  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
3364  * even though they point to a BO that has been already unreferenced at
3365  * the end of the previous batch buffer. This has been fine so far since
3366  * we are protected by the scratch page (every address not covered by
3367  * a BO should be pointing to the scratch page). But on CNL, it is
3368  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
3369  * instruction.
3370  *
3371  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
3372  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
3373  * context restore, so the mentioned hang doesn't happen. However,
3374  * software must program push constant commands for all stages prior to
3375  * rendering anything. So we flag them dirty in BeginCommandBuffer.
3376  *
3377  * Finally, we also make sure to stall at pixel scoreboard to make sure the
3378  * constants have been loaded into the EUs prior to disabling the push constants
3379  * so that it doesn't hang a previous 3DPRIMITIVE.
3380  */
3381 static void
3382 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
3383 {
3384    genx_batch_emit_pipe_control(&cmd_buffer->batch,
3385                                 cmd_buffer->device->info,
3386                                 cmd_buffer->state.current_pipeline,
3387                                 ANV_PIPE_CS_STALL_BIT |
3388                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
3389    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3390          pc.IndirectStatePointersDisable = true;
3391          pc.CommandStreamerStallEnable = true;
3392          anv_debug_dump_pc(pc, __func__);
3393    }
3394 }
3395 
3396 static VkResult
3397 end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
3398 {
3399    if (anv_batch_has_error(&cmd_buffer->batch))
3400       return cmd_buffer->batch.status;
3401 
3402    anv_measure_endcommandbuffer(cmd_buffer);
3403 
3404    if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
3405        anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3406       trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
3407       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3408       anv_cmd_buffer_end_batch_buffer(cmd_buffer);
3409       return VK_SUCCESS;
3410    }
3411 
3412    /* Flush query clears using blorp so that secondary query writes do not
3413     * race with the clear.
3414     */
3415    if (cmd_buffer->state.queries.clear_bits) {
3416       anv_add_pending_pipe_bits(cmd_buffer,
3417                                 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
3418                                 "query clear flush prior command buffer end");
3419    }
3420 
3421    /* Flush any in-progress CCS/MCS operations in preparation for chaining. */
3422    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
3423 
3424    genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
3425 
3426    /* Turn on object level preemption if it is disabled, to have it in a
3427     * known state at the beginning of the next command buffer.
3428     */
3429    if (!cmd_buffer->state.gfx.object_preemption)
3430       genX(cmd_buffer_set_preemption)(cmd_buffer, true);
3431 
3432    /* We want every command buffer to start with the PMA fix in a known state,
3433     * so we disable it at the end of the command buffer.
3434     */
3435    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
3436 
3437    /* Wa_14015814527
3438     *
3439     * Apply task URB workaround at the end of primary or secondary cmd_buffer.
3440     */
3441    genX(apply_task_urb_workaround)(cmd_buffer);
3442 
3443    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3444 
3445    emit_isp_disable(cmd_buffer);
3446 
3447 #if GFX_VER >= 12
3448    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3449        cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3450       genX(cmd_buffer_set_protected_memory)(cmd_buffer, false);
3451 #endif
3452 
3453    trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
3454 
3455    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
3456 
3457    return VK_SUCCESS;
3458 }
3459 
3460 VkResult
3461 genX(EndCommandBuffer)(
3462     VkCommandBuffer                             commandBuffer)
3463 {
3464    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3465 
3466    VkResult status = end_command_buffer(cmd_buffer);
3467    if (status != VK_SUCCESS)
3468       return status;
3469 
3470    /* If there was MSAA access over the compute/transfer queue, a companion
3471     * RCS command buffer was used and must also be ended properly.
3472     */
3473    if (cmd_buffer->companion_rcs_cmd_buffer) {
3474        assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
3475               anv_cmd_buffer_is_blitter_queue(cmd_buffer));
3476        status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
3477    }
3478 
3479    ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
3480 
3481    return status;
3482 }
3483 
3484 void
3485 genX(CmdExecuteCommands)(
3486     VkCommandBuffer                             commandBuffer,
3487     uint32_t                                    commandBufferCount,
3488     const VkCommandBuffer*                      pCmdBuffers)
3489 {
3490    ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
3491 
3492    struct anv_device *device = container->device;
3493 
3494    if (anv_batch_has_error(&container->batch))
3495       return;
3496 
3497    /* The secondary command buffers will assume that the PMA fix is disabled
3498     * when they begin executing.  Make sure this is true.
3499     */
3500    genX(cmd_buffer_enable_pma_fix)(container, false);
3501 
3502    /* Turn on preemption in case it was toggled off. */
3503    if (!container->state.gfx.object_preemption)
3504       genX(cmd_buffer_set_preemption)(container, true);
3505 
3506    /* Wa_14015814527
3507     *
3508     * Apply task URB workaround before secondary cmd buffers.
3509     */
3510    genX(apply_task_urb_workaround)(container);
3511 
3512    /* Flush query clears using blorp so that secondary query writes do not
3513     * race with the clear.
3514     */
3515    if (container->state.queries.clear_bits) {
3516       anv_add_pending_pipe_bits(container,
3517                                 ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
3518                                 "query clear flush prior to secondary buffer");
3519    }
3520 
3521    /* Ensure we're in a regular drawing cache mode (an assumption made by
3522     * all secondaries).
3523     */
3524    genX(cmd_buffer_update_color_aux_op(container, ISL_AUX_OP_NONE));
3525 
3526    /* The secondary command buffers don't know which textures etc. have been
3527     * flushed prior to their execution.  Apply those flushes now.
3528     */
3529    genX(cmd_buffer_apply_pipe_flushes)(container);
3530 
3531    genX(cmd_buffer_flush_generated_draws)(container);
3532 
3533    UNUSED enum anv_cmd_descriptor_buffer_mode db_mode =
3534       container->state.current_db_mode;
3535 
3536    /* Do a first pass to copy the surface state content of the render targets
3537     * if needed.
3538     */
3539    bool need_surface_state_copy = false;
3540    for (uint32_t i = 0; i < commandBufferCount; i++) {
3541       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3542 
3543       if (secondary->usage_flags &
3544           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3545          need_surface_state_copy = true;
3546          break;
3547       }
3548    }
3549 
3550    if (need_surface_state_copy) {
3551       if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3552          genX(cmd_buffer_set_protected_memory)(container, false);
3553 
3554       /* The memcpy will take care of the 3D preemption requirements. */
3555       struct anv_memcpy_state memcpy_state;
3556       genX(emit_so_memcpy_init)(&memcpy_state, device,
3557                                 container, &container->batch);
3558 
3559       for (uint32_t i = 0; i < commandBufferCount; i++) {
3560          ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3561 
3562          assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
3563          assert(!anv_batch_has_error(&secondary->batch));
3564 
3565          if (secondary->usage_flags &
3566              VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3567             /* If we're continuing a render pass from the container, we need
3568              * to copy the surface states for the current subpass into the
3569              * storage we allocated for them in BeginCommandBuffer.
3570              */
3571             struct anv_state src_state = container->state.gfx.att_states;
3572             struct anv_state dst_state = secondary->state.gfx.att_states;
3573             assert(src_state.alloc_size == dst_state.alloc_size);
3574 
3575             genX(emit_so_memcpy)(
3576                &memcpy_state,
3577                anv_state_pool_state_address(&device->internal_surface_state_pool,
3578                                             dst_state),
3579                anv_state_pool_state_address(&device->internal_surface_state_pool,
3580                                             src_state),
3581                src_state.alloc_size);
3582          }
3583       }
3584       genX(emit_so_memcpy_fini)(&memcpy_state);
3585 
3586       anv_add_pending_pipe_bits(container,
3587                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
3588                                 "Wait for primary->secondary RP surface state copies");
3589       genX(cmd_buffer_apply_pipe_flushes)(container);
3590 
3591       if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3592          genX(cmd_buffer_set_protected_memory)(container, true);
3593    }
3594 
3595    /* Ensure preemption is enabled (assumption for all secondary) */
3596    genX(cmd_buffer_set_preemption)(container, true);
3597 
3598    for (uint32_t i = 0; i < commandBufferCount; i++) {
3599       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3600 
3601       assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
3602       assert(!anv_batch_has_error(&secondary->batch));
3603 
3604       if (secondary->state.conditional_render_enabled) {
3605          if (!container->state.conditional_render_enabled) {
3606             /* Secondary buffer is constructed as if it will be executed
3607              * with conditional rendering, so we must satisfy this dependency
3608              * regardless of conditional rendering being enabled in the container.
3609              */
3610             struct mi_builder b;
3611             mi_builder_init(&b, device->info, &container->batch);
3612             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
3613                          mi_imm(UINT64_MAX));
3614          }
3615       }
3616 
3617       anv_cmd_buffer_add_secondary(container, secondary);
3618 
3619       /* Add secondary buffer's RCS command buffer to container buffer's RCS
3620        * command buffer for execution if secondary RCS is valid.
3621        */
3622       if (secondary->companion_rcs_cmd_buffer != NULL) {
3623          VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
3624          if (result != VK_SUCCESS) {
3625             anv_batch_set_error(&container->batch, result);
3626             return;
3627          }
3628 
3629          anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
3630                                       secondary->companion_rcs_cmd_buffer);
3631       }
3632 
3633       assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
3634              secondary->perf_query_pool == container->perf_query_pool);
3635       if (secondary->perf_query_pool)
3636          container->perf_query_pool = secondary->perf_query_pool;
3637 
3638 #if INTEL_NEEDS_WA_1808121037
3639       if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
3640          container->state.depth_reg_mode = secondary->state.depth_reg_mode;
3641 #endif
3642 
3643       container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
3644 
3645       db_mode = secondary->state.current_db_mode;
3646    }
3647 
3648    /* The secondary isn't counted in our VF cache tracking so we need to
3649     * invalidate the whole thing.
3650     */
3651    if (GFX_VER == 9) {
3652       anv_add_pending_pipe_bits(container,
3653                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
3654                                 "Secondary cmd buffer not tracked in VF cache");
3655    }
3656 
3657 #if INTEL_WA_16014538804_GFX_VER
3658    if (anv_cmd_buffer_is_render_queue(container) &&
3659        intel_needs_workaround(device->info, 16014538804))
3660       anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
3661 #endif
3662 
3663    /* The secondary may have selected a different pipeline (3D or compute) and
3664     * may have changed the current L3$ configuration.  Reset our tracking
3665     * variables to invalid values to ensure that we re-emit these in the case
3666     * where we do any draws or compute dispatches from the container after the
3667     * secondary has returned.
3668     */
3669    container->state.current_pipeline = UINT32_MAX;
3670    container->state.current_l3_config = NULL;
3671    container->state.current_hash_scale = 0;
3672    container->state.gfx.push_constant_stages = 0;
3673 
3674    memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
3675 
3676    /* Reemit all GFX instructions in container */
3677    memcpy(container->state.gfx.dyn_state.dirty,
3678           device->gfx_dirty_state,
3679           sizeof(container->state.gfx.dyn_state.dirty));
3680    if (container->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
3681       /* Also recompute the CPS_STATE offset */
3682       struct vk_dynamic_graphics_state *dyn =
3683          &container->vk.dynamic_graphics_state;
3684       BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
3685    }
3686 
3687    /* Each of the secondary command buffers will use its own state base
3688     * address.  We need to re-emit state base address for the container after
3689     * all of the secondaries are done.
3690     */
3691    if (container->device->vk.enabled_extensions.EXT_descriptor_buffer) {
3692 #if GFX_VERx10 >= 125
3693       /* If the last secondary had a different mode, reemit the last pending
3694        * mode. Otherwise, we can do a lighter binding table pool update.
3695        */
3696       if (db_mode != container->state.current_db_mode) {
3697          container->state.current_db_mode = db_mode;
3698          genX(cmd_buffer_emit_state_base_address)(container);
3699       } else {
3700          genX(cmd_buffer_emit_bt_pool_base_address)(container);
3701       }
3702 #else
3703       genX(cmd_buffer_emit_state_base_address)(container);
3704 #endif
3705    } else {
3706       genX(cmd_buffer_emit_bt_pool_base_address)(container);
3707    }
3708 
3709    /* Copy utrace timestamp buffers from the secondaries into the container */
3710    if (u_trace_enabled(&device->ds.trace_context)) {
3711       trace_intel_begin_trace_copy(&container->trace);
3712 
3713       struct anv_memcpy_state memcpy_state;
3714       genX(emit_so_memcpy_init)(&memcpy_state, device,
3715                                 container, &container->batch);
3716       uint32_t num_traces = 0;
3717       for (uint32_t i = 0; i < commandBufferCount; i++) {
3718          ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3719 
3720          num_traces += secondary->trace.num_traces;
3721          u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
3722                               u_trace_end_iterator(&secondary->trace),
3723                               &container->trace,
3724                               &memcpy_state,
3725                               anv_device_utrace_emit_gfx_copy_buffer);
3726       }
3727       genX(emit_so_memcpy_fini)(&memcpy_state);
3728 
3729       trace_intel_end_trace_copy(&container->trace, num_traces);
3730 
3731       /* Memcpy is done using the 3D pipeline. */
3732       container->state.current_pipeline = _3D;
3733    }
3734 }
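/* Editorial note (an addition, not part of the original source): the state
 * resets above reflect that, once the secondaries have executed, the
 * container cannot assume any previously programmed GPU state survived. A
 * typical host-side sequence exercising this path (names are placeholders):
 *
 *    vkCmdExecuteCommands(container_cmd, count, secondaries);
 *    vkCmdBeginRendering(container_cmd, &rendering_info);
 *    vkCmdDraw(container_cmd, ...);
 *
 * The first draw after the secondaries re-emits the pipeline, state base
 * address / binding table pool base and all dirty dynamic state.
 */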
3735 
3736 static inline enum anv_pipe_bits
anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer * cmd_buffer,VkAccessFlags2 flags)3737 anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3738                                      VkAccessFlags2 flags)
3739 {
3740    enum anv_pipe_bits pipe_bits = 0;
3741 
3742    u_foreach_bit64(b, flags) {
3743       switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3744       case VK_ACCESS_2_SHADER_WRITE_BIT:
3745       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
3746       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3747          /* We're transitioning a buffer that was previously used as write
3748           * destination through the data port. To make its content available
3749           * to future operations, flush the hdc pipeline.
3750           */
3751          pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3752          pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3753          break;
3754       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
3755          /* We're transitioning a buffer that was previously used as render
3756           * target. To make its content available to future operations, flush
3757           * the render target cache.
3758           */
3759          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3760          break;
3761       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3762          /* We're transitioning a buffer that was previously used as depth
3763           * buffer. To make its content available to future operations, flush
3764           * the depth cache.
3765           */
3766          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3767          break;
3768       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
3769          /* We're transitioning a buffer that was previously used as a
3770           * transfer write destination. Generic write operations include color
3771           * & depth operations as well as buffer operations like:
3772           *     - vkCmdClearColorImage()
3773           *     - vkCmdClearDepthStencilImage()
3774           *     - vkCmdBlitImage()
3775           *     - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
3776           *
3777           * Most of these operations are implemented using Blorp which writes
3778           * through the render target cache or the depth cache on the graphics
3779           * queue. On the compute queue, the writes are done through the data
3780           * port.
3781           */
3782          if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
3783             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3784             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3785          } else {
3786             /* We can use the data port when trying to stay in compute mode on
3787              * the RCS.
3788              */
3789             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3790             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3791             /* Most operations are done through RT/depth writes */
3792             pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3793             pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3794          }
3795          break;
3796       case VK_ACCESS_2_MEMORY_WRITE_BIT:
3797          /* We're transitioning a buffer for generic write operations. Flush
3798           * all the caches.
3799           */
3800          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3801          break;
3802       case VK_ACCESS_2_HOST_WRITE_BIT:
3803          /* We're transitioning a buffer for access by CPU. Invalidate
3804           * all the caches. Since data and tile caches don't have invalidate,
3805           * we are forced to flush those as well.
3806           */
3807          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3808          pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3809          break;
3810       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3811       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3812          /* We're transitioning a buffer written either from the VS stage or
3813           * from the command streamer (see CmdEndTransformFeedbackEXT), so we
3814           * just need to stall the CS.
3815           *
3816           * Streamout writes apparently bypass L3; in order to make them
3817           * visible to the destination, we need to invalidate the other
3818           * caches.
3819           */
3820          pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
3821          break;
3822       default:
3823          break; /* Nothing to do */
3824       }
3825    }
3826 
3827    return pipe_bits;
3828 }
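/* Editorial example (hedged sketch, not from the original source): a memory
 * barrier whose srcAccessMask is VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT maps,
 * through the switch above, to ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT only:
 *
 *    const VkMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
 *    };
 *
 * The srcAccessMask half is resolved here; the dstAccessMask half is handled
 * by anv_pipe_invalidate_bits_for_access_flags() below.
 */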
3829 
3830 static inline enum anv_pipe_bits
anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer * cmd_buffer,VkAccessFlags2 flags)3831 anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3832                                           VkAccessFlags2 flags)
3833 {
3834    struct anv_device *device = cmd_buffer->device;
3835    enum anv_pipe_bits pipe_bits = 0;
3836 
3837    u_foreach_bit64(b, flags) {
3838       switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3839       case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
3840          /* Indirect draw commands take a buffer as input that we're going to
3841           * read from the command streamer to load some of the HW registers
3842           * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
3843           * command streamer stall so that all the cache flushes have
3844           * completed before the command streamer loads from memory.
3845           */
3846          pipe_bits |=  ANV_PIPE_CS_STALL_BIT;
3847          if (device->info->ver == 9) {
3848             /* Indirect draw commands on Gfx9 also set gl_BaseVertex &
3849              * gl_BaseIndex through a vertex buffer, so invalidate that cache.
3850              */
3851             pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3852          }
3853          /* For CmdDispatchIndirect, we also load gl_NumWorkGroups through a
3854           * UBO from the buffer, so we need to invalidate constant cache.
3855           */
3856          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3857          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3858          /* Tile cache flush needed for CmdDispatchIndirect since the command
3859           * streamer and vertex fetch aren't L3 coherent.
3860           */
3861          pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3862          break;
3863       case VK_ACCESS_2_INDEX_READ_BIT:
3864       case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
3865          /* We're transitioning a buffer to be used as input for vkCmdDraw*
3866           * commands, so we invalidate the VF cache to make sure there is no
3867           * stale data when we start rendering.
3868           */
3869          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3870          break;
3871       case VK_ACCESS_2_UNIFORM_READ_BIT:
3872       case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
3873          /* We're transitioning a buffer to be used as uniform data. Because
3874           * uniforms are accessed through the data port & sampler, we need to
3875           * invalidate the texture cache (sampler) & constant cache (data
3876           * port) to avoid stale data.
3877           */
3878          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3879          if (device->physical->compiler->indirect_ubos_use_sampler) {
3880             pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3881          } else {
3882             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3883             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3884          }
3885          break;
3886       case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
3887       case VK_ACCESS_2_TRANSFER_READ_BIT:
3888       case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
3889          /* Transitioning a buffer to be read through the sampler, so
3890           * invalidate the texture cache; we don't want any stale data.
3891           */
3892          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3893          break;
3894       case VK_ACCESS_2_SHADER_READ_BIT:
3895          /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
3896           * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
3897           */
3898          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3899                       ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3900          if (!device->physical->compiler->indirect_ubos_use_sampler) {
3901             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3902             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3903          }
3904          break;
3905       case VK_ACCESS_2_MEMORY_READ_BIT:
3906          /* Transitioning a buffer for generic read, invalidate all the
3907           * caches.
3908           */
3909          pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3910          break;
3911       case VK_ACCESS_2_MEMORY_WRITE_BIT:
3912          /* Generic write, make sure all previously written things land in
3913           * memory.
3914           */
3915          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3916          break;
3917       case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
3918       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
3919          /* Transitioning a buffer for conditional rendering or transform
3920           * feedback. We'll load the content of this buffer into HW registers
3921           * using the command streamer, so we need to stall the command
3922           * streamer to make sure any in-flight flush operations have
3923           * completed.
3924           */
3925          pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3926          pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3927          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3928          break;
3929       case VK_ACCESS_2_HOST_READ_BIT:
3930          /* We're transitioning a buffer that was written by CPU.  Flush
3931           * all the caches.
3932           */
3933          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3934          break;
3935       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3936          /* We're transitioning a buffer to be written by the streamout fixed
3937           * function. This one is apparently not L3 coherent, so we need a
3938           * tile cache flush to make sure any previous write is not going to
3939           * create WaW hazards.
3940           */
3941          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3942          pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3943          break;
3944       case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
3945          /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
3946           * storage buffer, physical storage buffer, storage texel buffer, or
3947           * storage image in any shader pipeline stage.
3948           *
3949           * Any storage buffers or images written to must be invalidated and
3950           * flushed before the shader can access them.
3951           *
3952           * Both HDC & Untyped flushes also do invalidation. This is why we
3953           * use them here on Gfx12+.
3954           *
3955           * Gfx11 and prior don't have HDC. Only Data cache flush is available
3956           * and it only operates on the written cache lines.
3957           */
3958          if (device->info->ver >= 12) {
3959             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3960             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3961          }
3962          break;
3963       case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
3964          pipe_bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
3965          break;
3966       default:
3967          break; /* Nothing to do */
3968       }
3969    }
3970 
3971    return pipe_bits;
3972 }
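/* Editorial example (hedged sketch, not from the original source): with the
 * dstAccessMask below, the switch above accumulates a CS stall, a constant
 * cache invalidate and data/tile cache flushes (plus a VF cache invalidate on
 * Gfx9) before an indirect draw consumes a buffer written by a compute
 * shader; the srcAccessMask half resolves to HDC/untyped dataport flushes in
 * anv_pipe_flush_bits_for_access_flags() further up. The buffer handle is a
 * placeholder:
 *
 *    const VkBufferMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
 *       .srcAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT,
 *       .dstAccessMask = VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT,
 *       .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *       .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *       .buffer = indirect_args_buffer,
 *       .offset = 0,
 *       .size = VK_WHOLE_SIZE,
 *    };
 */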
3973 
3974 static inline bool
stage_is_shader(const VkPipelineStageFlags2 stage)3975 stage_is_shader(const VkPipelineStageFlags2 stage)
3976 {
3977    return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3978                     VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3979                     VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3980                     VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3981                     VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
3982                     VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
3983                     VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3984                     VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3985                     VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
3986                     VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
3987                     VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
3988 }
3989 
3990 static inline bool
stage_is_transfer(const VkPipelineStageFlags2 stage)3991 stage_is_transfer(const VkPipelineStageFlags2 stage)
3992 {
3993    return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3994                     VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
3995 }
3996 
3997 static inline bool
stage_is_video(const VkPipelineStageFlags2 stage)3998 stage_is_video(const VkPipelineStageFlags2 stage)
3999 {
4000    return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
4001 #ifdef VK_ENABLE_BETA_EXTENSIONS
4002                     VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
4003 #endif
4004                     VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
4005 }
4006 
4007 static inline bool
mask_is_shader_write(const VkAccessFlags2 access)4008 mask_is_shader_write(const VkAccessFlags2 access)
4009 {
4010    return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
4011                      VK_ACCESS_2_MEMORY_WRITE_BIT |
4012                      VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
4013 }
4014 
4015 static inline bool
mask_is_write(const VkAccessFlags2 access)4016 mask_is_write(const VkAccessFlags2 access)
4017 {
4018    return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
4019                     VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
4020                     VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
4021                     VK_ACCESS_2_TRANSFER_WRITE_BIT |
4022                     VK_ACCESS_2_HOST_WRITE_BIT |
4023                     VK_ACCESS_2_MEMORY_WRITE_BIT |
4024                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
4025                     VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
4026 #ifdef VK_ENABLE_BETA_EXTENSIONS
4027                     VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
4028 #endif
4029                     VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
4030                     VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
4031                     VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
4032                     VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
4033                     VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
4034                     VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
4035 }
4036 
4037 static inline bool
mask_is_transfer_write(const VkAccessFlags2 access)4038 mask_is_transfer_write(const VkAccessFlags2 access)
4039 {
4040    return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
4041                     VK_ACCESS_2_MEMORY_WRITE_BIT);
4042 }
4043 
4044 static void
cmd_buffer_barrier_video(struct anv_cmd_buffer * cmd_buffer,uint32_t n_dep_infos,const VkDependencyInfo * dep_infos)4045 cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
4046                          uint32_t n_dep_infos,
4047                          const VkDependencyInfo *dep_infos)
4048 {
4049    assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
4050 
4051    bool flush_llc = false;
4052    bool flush_ccs = false;
4053 
4054    for (uint32_t d = 0; d < n_dep_infos; d++) {
4055       const VkDependencyInfo *dep_info = &dep_infos[d];
4056 
4057 
4058       for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4059          const VkImageMemoryBarrier2 *img_barrier =
4060             &dep_info->pImageMemoryBarriers[i];
4061 
4062          ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4063          const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4064 
4065          /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
4066           * memory barrier defines a queue family ownership transfer.
4067           */
4068          if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
4069             flush_llc = true;
4070 
4071          VkImageAspectFlags img_aspects =
4072             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4073          anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
4074             const uint32_t plane =
4075                anv_image_aspect_to_plane(image, 1UL << aspect_bit);
4076             if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
4077                flush_ccs = true;
4078             }
4079          }
4080       }
4081 
4082       for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4083          /* Flush the cache if something written by the video operations is
4084           * used by any stage other than the video encode/decode stages, or if
4085           * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
4086           * case this memory barrier defines a queue family ownership transfer).
4087           */
4088          if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
4089               mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
4090               !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
4091              (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
4092               dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
4093             flush_llc = true;
4094             break;
4095          }
4096       }
4097 
4098       for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4099          /* Flush the cache if something written by the video operations is
4100           * used by any stage other than the video encode/decode stages.
4101           */
4102          if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
4103              mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4104              !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
4105             flush_llc = true;
4106             break;
4107          }
4108       }
4109 
4110       /* We cannot gather more information than that. */
4111       if (flush_ccs && flush_llc)
4112          break;
4113    }
4114 
4115    if (flush_ccs || flush_llc) {
4116       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
4117 #if GFX_VERx10 >= 125
4118          fd.FlushCCS = flush_ccs;
4119 #endif
4120 #if GFX_VER >= 12
4121          /* Using this bit on Gfx9 triggers a GPU hang.
4122           * This is undocumented behavior. Gfx12 seems fine.
4123           * TODO: check Gfx11
4124           */
4125          fd.FlushLLC = flush_llc;
4126 #endif
4127       }
4128    }
4129 }
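/* Editorial example (hedged sketch, not from the original source): a queue
 * family ownership transfer releasing a decoded image from the video queue
 * to the graphics queue is the kind of barrier that sets flush_llc above
 * (handles and queue family indices are placeholders):
 *
 *    const VkImageMemoryBarrier2 release = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR,
 *       .srcAccessMask = VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR,
 *       .oldLayout = VK_IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR,
 *       .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
 *       .srcQueueFamilyIndex = video_queue_family,
 *       .dstQueueFamilyIndex = graphics_queue_family,
 *       .image = decoded_image,
 *       .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
 *    };
 */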
4130 
4131 static void
cmd_buffer_barrier_blitter(struct anv_cmd_buffer * cmd_buffer,uint32_t n_dep_infos,const VkDependencyInfo * dep_infos)4132 cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
4133                            uint32_t n_dep_infos,
4134                            const VkDependencyInfo *dep_infos)
4135 {
4136 #if GFX_VERx10 >= 125
4137    assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
4138 
4139    /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
4140     * from being a destination to a source.
4141     */
4142    bool flush_llc = false;
4143    bool flush_ccs = false;
4144 
4145    for (uint32_t d = 0; d < n_dep_infos; d++) {
4146       const VkDependencyInfo *dep_info = &dep_infos[d];
4147 
4148       for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4149          const VkImageMemoryBarrier2 *img_barrier =
4150             &dep_info->pImageMemoryBarriers[i];
4151 
4152          ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4153          const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4154 
4155          /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
4156           * memory barrier defines a queue family transfer operation.
4157           */
4158          if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
4159             flush_llc = true;
4160 
4161          /* Flush the cache if a transfer command reads the output of the
4162           * previous transfer command. Ideally we would just wait for
4163           * completion, but for now flush the cache to make the data visible.
4164           */
4165          if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
4166               img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
4167              (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
4168               img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
4169             flush_llc = true;
4170          }
4171 
4172          VkImageAspectFlags img_aspects =
4173             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4174          anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
4175             const uint32_t plane =
4176                anv_image_aspect_to_plane(image, 1UL << aspect_bit);
4177             if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
4178                flush_ccs = true;
4179             }
4180          }
4181       }
4182 
4183       for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4184          /* Flush the cache if something written by the transfer command is
4185           * used by any stage other than the transfer stage, or if
4186           * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
4187           * case this memory barrier defines a queue family ownership transfer).
4188           */
4189          if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
4190               mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
4191              (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
4192               dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
4193             flush_llc = true;
4194             break;
4195          }
4196       }
4197 
4198       for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4199          /* Flush the cache if something written by the transfer command is
4200           * used by any stage other than the transfer stage.
4201           */
4202          if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
4203              mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
4204             flush_llc = true;
4205             break;
4206          }
4207       }
4208 
4209       /* We cannot gather more information than that. */
4210       if (flush_ccs && flush_llc)
4211          break;
4212    }
4213 
4214    if (flush_ccs || flush_llc) {
4215       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
4216       if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
4217          genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
4218                                                 cmd_buffer->device);
4219       }
4220       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
4221          fd.FlushCCS = flush_ccs;
4222          fd.FlushLLC = flush_llc;
4223       }
4224    }
4225 #endif
4226 }
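/* Editorial example (hedged sketch, not from the original source): the
 * TRANSFER_DST -> TRANSFER_SRC transition handled above typically comes from
 * chained copies on the transfer/blitter queue (names are placeholders):
 *
 *    vkCmdCopyImage2(cmd, &copy_into_tmp);    // tmp written as TRANSFER_DST
 *    // image barrier on tmp: oldLayout = TRANSFER_DST_OPTIMAL,
 *    //                       newLayout = TRANSFER_SRC_OPTIMAL
 *    vkCmdCopyImage2(cmd, &copy_from_tmp);    // tmp read as TRANSFER_SRC
 *
 * The MI_FLUSH_DW emitted above makes the first copy's data visible before
 * the second copy reads it.
 */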
4227 
4228 static inline bool
cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer * cmd_buffer)4229 cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
4230 {
4231    /* Query copies are only written with dataport, so we only need to check
4232     * that flag.
4233     */
4234    return (cmd_buffer->state.queries.buffer_write_bits &
4235            ANV_QUERY_WRITES_DATA_FLUSH) != 0;
4236 }
4237 
4238 static void
cmd_buffer_barrier(struct anv_cmd_buffer * cmd_buffer,uint32_t n_dep_infos,const VkDependencyInfo * dep_infos,const char * reason)4239 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
4240                    uint32_t n_dep_infos,
4241                    const VkDependencyInfo *dep_infos,
4242                    const char *reason)
4243 {
4244    if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
4245       cmd_buffer_barrier_video(cmd_buffer, n_dep_infos, dep_infos);
4246       return;
4247    }
4248 
4249    if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
4250       cmd_buffer_barrier_blitter(cmd_buffer, n_dep_infos, dep_infos);
4251       return;
4252    }
4253 
4254    /* XXX: Right now, we're really dumb and just flush whatever categories
4255     * the app asks for. One of these days we may make this a bit better but
4256     * right now that's all the hardware allows for in most areas.
4257     */
4258    VkAccessFlags2 src_flags = 0;
4259    VkAccessFlags2 dst_flags = 0;
4260 
4261 #if GFX_VER < 20
4262    bool apply_sparse_flushes = false;
4263    struct anv_device *device = cmd_buffer->device;
4264 #endif
4265    bool flush_query_copies = false;
4266 
4267    for (uint32_t d = 0; d < n_dep_infos; d++) {
4268       const VkDependencyInfo *dep_info = &dep_infos[d];
4269 
4270       for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4271          src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
4272          dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
4273 
4274          /* Shader writes to buffers that could then be written by a transfer
4275           * command (including queries).
4276           */
4277          if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
4278              mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4279              stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
4280             cmd_buffer->state.queries.buffer_write_bits |=
4281                ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
4282          }
4283 
4284          if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
4285              mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4286              cmd_buffer_has_pending_copy_query(cmd_buffer))
4287             flush_query_copies = true;
4288 
4289 #if GFX_VER < 20
4290          /* There's no way of knowing if this memory barrier is related to
4291           * sparse buffers! This is pretty horrible.
4292           */
4293          if (mask_is_write(src_flags) &&
4294              p_atomic_read(&device->num_sparse_resources) > 0)
4295             apply_sparse_flushes = true;
4296 #endif
4297       }
4298 
4299       for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4300          const VkBufferMemoryBarrier2 *buf_barrier =
4301             &dep_info->pBufferMemoryBarriers[i];
4302 
4303          src_flags |= buf_barrier->srcAccessMask;
4304          dst_flags |= buf_barrier->dstAccessMask;
4305 
4306          /* Shader writes to buffers that could then be written by a transfer
4307           * command (including queries).
4308           */
4309          if (stage_is_shader(buf_barrier->srcStageMask) &&
4310              mask_is_shader_write(buf_barrier->srcAccessMask) &&
4311              stage_is_transfer(buf_barrier->dstStageMask)) {
4312             cmd_buffer->state.queries.buffer_write_bits |=
4313                ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
4314          }
4315 
4316          if (stage_is_transfer(buf_barrier->srcStageMask) &&
4317              mask_is_transfer_write(buf_barrier->srcAccessMask) &&
4318              cmd_buffer_has_pending_copy_query(cmd_buffer))
4319             flush_query_copies = true;
4320 
4321 #if GFX_VER < 20
4322          ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
4323 
4324          if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
4325             apply_sparse_flushes = true;
4326 #endif
4327       }
4328 
4329       for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4330          const VkImageMemoryBarrier2 *img_barrier =
4331             &dep_info->pImageMemoryBarriers[i];
4332 
4333          src_flags |= img_barrier->srcAccessMask;
4334          dst_flags |= img_barrier->dstAccessMask;
4335 
4336          ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4337          const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4338 
4339          uint32_t base_layer, layer_count;
4340          if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
4341             base_layer = 0;
4342             layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
4343          } else {
4344             base_layer = range->baseArrayLayer;
4345             layer_count = vk_image_subresource_layer_count(&image->vk, range);
4346          }
4347          const uint32_t level_count =
4348             vk_image_subresource_level_count(&image->vk, range);
4349 
4350          VkImageLayout old_layout = img_barrier->oldLayout;
4351          VkImageLayout new_layout = img_barrier->newLayout;
4352 
4353          /* If we're inside a render pass, the runtime might have converted
4354           * some layouts from GENERAL to FEEDBACK_LOOP. Check if that's the
4355           * case and reconvert back to the original layout so that application
4356           * barriers within renderpass are operating with consistent layouts.
4357           */
4358          if (!cmd_buffer->vk.runtime_rp_barrier &&
4359              cmd_buffer->vk.render_pass != NULL) {
4360             assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
4361                                                                   image));
4362             VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
4363 
4364             vk_command_buffer_get_attachment_layout(
4365                &cmd_buffer->vk, &image->vk,
4366                &subpass_att_layout, &subpass_stencil_att_layout);
4367 
4368             old_layout = subpass_att_layout;
4369             new_layout = subpass_att_layout;
4370          }
4371 
4372          if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4373             transition_depth_buffer(cmd_buffer, image,
4374                                     range->baseMipLevel, level_count,
4375                                     base_layer, layer_count,
4376                                     old_layout, new_layout,
4377                                     false /* will_full_fast_clear */);
4378          }
4379 
4380          if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4381             transition_stencil_buffer(cmd_buffer, image,
4382                                       range->baseMipLevel, level_count,
4383                                       base_layer, layer_count,
4384                                       old_layout, new_layout,
4385                                       false /* will_full_fast_clear */);
4386          }
4387 
4388          if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
4389             VkImageAspectFlags color_aspects =
4390                vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4391             anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
4392                transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
4393                                        range->baseMipLevel, level_count,
4394                                        base_layer, layer_count,
4395                                        old_layout, new_layout,
4396                                        img_barrier->srcQueueFamilyIndex,
4397                                        img_barrier->dstQueueFamilyIndex,
4398                                        false /* will_full_fast_clear */);
4399             }
4400          }
4401 #if GFX_VER < 20
4402          /* Mark image as compressed if the destination layout has untracked
4403           * writes to the aux surface.
4404           */
4405          VkImageAspectFlags aspects =
4406             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4407          anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
4408             VkImageAspectFlagBits aspect = 1UL << aspect_bit;
4409             if (anv_layout_has_untracked_aux_writes(
4410                    device->info,
4411                    image, aspect,
4412                    img_barrier->newLayout,
4413                    cmd_buffer->queue_family->queueFlags)) {
4414                for (uint32_t l = 0; l < level_count; l++) {
4415                   const uint32_t level = range->baseMipLevel + l;
4416                   const uint32_t aux_layers =
4417                      anv_image_aux_layers(image, aspect, level);
4418 
4419                   if (base_layer >= aux_layers)
4420                      break; /* We will only get fewer layers as level increases */
4421 
4422                   uint32_t level_layer_count =
4423                      MIN2(layer_count, aux_layers - base_layer);
4424 
4425                   set_image_compressed_bit(cmd_buffer, image, aspect,
4426                                            level,
4427                                            base_layer, level_layer_count,
4428                                            true);
4429                }
4430             }
4431          }
4432 
4433          if (anv_image_is_sparse(image) && mask_is_write(src_flags))
4434             apply_sparse_flushes = true;
4435 #endif
4436       }
4437    }
4438 
4439    enum anv_pipe_bits bits =
4440       anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
4441       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
4442 
4443 #if GFX_VER < 20
4444    /* Our HW implementation of the sparse feature lives in the GAM unit
4445     * (interface between all the GPU caches and external memory). As a result
4446     * writes to NULL bound images & buffers that should be ignored are
4447     * actually still visible in the caches. The only way for us to get correct
4448     * NULL bound regions to return 0s is to evict the caches to force the
4449     * caches to be repopulated with 0s.
4450     */
4451    if (apply_sparse_flushes)
4452       bits |= ANV_PIPE_FLUSH_BITS;
4453 #endif
4454 
4455    /* Copies from query pools are executed with a shader writing through the
4456     * dataport.
4457     */
4458    if (flush_query_copies) {
4459       bits |= (GFX_VER >= 12 ?
4460                ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
4461    }
4462 
4463    if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
4464       genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
4465 
4466    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
4467 }
4468 
genX(CmdPipelineBarrier2)4469 void genX(CmdPipelineBarrier2)(
4470     VkCommandBuffer                             commandBuffer,
4471     const VkDependencyInfo*                     pDependencyInfo)
4472 {
4473    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4474 
4475    cmd_buffer_barrier(cmd_buffer, 1, pDependencyInfo, "pipe barrier");
4476 }
4477 
4478 void
genX(batch_emit_breakpoint)4479 genX(batch_emit_breakpoint)(struct anv_batch *batch,
4480                             struct anv_device *device,
4481                             bool emit_before_draw)
4482 {
4483    /* Update draw call count once */
4484    uint32_t draw_count = emit_before_draw ?
4485                          p_atomic_inc_return(&device->draw_call_count) :
4486                          p_atomic_read(&device->draw_call_count);
4487 
4488    if (((draw_count == intel_debug_bkp_before_draw_count &&
4489         emit_before_draw) ||
4490        (draw_count == intel_debug_bkp_after_draw_count &&
4491         !emit_before_draw))) {
4492       struct anv_address wait_addr =
4493          anv_state_pool_state_address(&device->dynamic_state_pool,
4494                                       device->breakpoint);
4495 
4496       anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
4497          sem.WaitMode            = PollingMode;
4498          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
4499          sem.SemaphoreDataDword  = 0x1;
4500          sem.SemaphoreAddress    = wait_addr;
4501       };
4502    }
4503 }
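/* Editorial note (an assumption inferred from the code above, not from the
 * original source): the MI_SEMAPHORE_WAIT polls the breakpoint state until it
 * reads 0x1, so the batch stalls at the Nth draw. Execution can presumably be
 * resumed from the CPU by writing 1 through the CPU mapping of that state,
 * along the lines of:
 *
 *    *(uint32_t *)device->breakpoint.map = 1;   // assumes the anv_state is
 *                                               // CPU-visible
 */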
4504 
4505 /* Only emit PIPELINE_SELECT, for the whole mode switch and flushing use
4506  * flush_pipeline_select()
4507  */
4508 void
genX(emit_pipeline_select)4509 genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
4510                            const struct anv_device *device)
4511 {
4512    /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
4513 #if GFX_VER < 20
4514    anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
4515       ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
4516 #if GFX_VER == 12
4517       ps.MediaSamplerDOPClockGateEnable = true;
4518 #endif
4519       ps.PipelineSelection = pipeline;
4520 #if GFX_VERx10 == 125
4521       /* It might still be better to only enable this when the compute
4522        * pipeline will have DPAS instructions.
4523        */
4524       ps.SystolicModeEnable = pipeline == GPGPU &&
4525          device->vk.enabled_extensions.KHR_cooperative_matrix &&
4526          device->vk.enabled_features.cooperativeMatrix;
4527 #endif
4528    }
4529 #endif /* if GFX_VER < 20 */
4530 }
4531 
4532 static void
genX(flush_pipeline_select)4533 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4534                             uint32_t pipeline)
4535 {
4536    UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4537 
4538    if (cmd_buffer->state.current_pipeline == pipeline)
4539       return;
4540 
4541 #if GFX_VER >= 20
4542    /* While PIPELINE_SELECT is not needed on Xe2+, our current assumption
4543     * is that the pipelined flushes in the 3D pipeline are not getting
4544     * synchronized with the compute dispatches (and vice versa). So we need
4545     * a CS_STALL prior to the next set of commands to ensure the flushes have
4546     * completed.
4547     *
4548     * The new RESOURCE_BARRIER instruction has support for synchronizing
4549     * 3D/Compute and once we switch to that we should be able to get rid of
4550     * this CS_STALL.
4551     */
4552    anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, "pipeline switch stall");
4553 
4554    /* Since we are not stalling/flushing caches explicitly while switching
4555     * between the pipelines, we need to apply data dependency flushes recorded
4556     * previously on the resource.
4557     */
4558    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4559 #else
4560 
4561 #if GFX_VER == 9
4562    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4563     *
4564     *   Software must clear the COLOR_CALC_STATE Valid field in
4565     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4566     *   with Pipeline Select set to GPGPU.
4567     *
4568     * The internal hardware docs recommend the same workaround for Gfx9
4569     * hardware too.
4570     */
4571    if (pipeline == GPGPU)
4572       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4573 #endif
4574 
4575 #if GFX_VERx10 == 120
4576    /* Undocumented workaround to force the re-emission of
4577     * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
4578     * pipeline without rebinding a pipeline :
4579     *    vkCmdBindPipeline(COMPUTE, cs_pipeline);
4580     *    vkCmdDispatch(...);
4581     *    vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
4582     *    vkCmdDraw(...);
4583     *    vkCmdDispatch(...);
4584     */
4585    if (pipeline == _3D)
4586       cmd_buffer->state.compute.pipeline_dirty = true;
4587 #endif
4588 
4589    /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
4590     * pipeline. That means query clears will not be visible to query
4591     * copy/write. So we need to flush it before going to GPGPU mode.
4592     */
4593    if (cmd_buffer->state.current_pipeline == _3D &&
4594        cmd_buffer->state.queries.clear_bits) {
4595       anv_add_pending_pipe_bits(cmd_buffer,
4596                                 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
4597                                 "query clear flush prior to GPGPU");
4598    }
4599 
4600    /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
4601    enum anv_pipe_bits bits = 0;
4602 
4603 #if GFX_VER >= 12
4604    /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
4605     *
4606     *   "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
4607     *   are flushed through a stalling PIPE_CONTROL command prior to
4608     *   programming of PIPELINE_SELECT command transitioning Pipeline Select
4609     *   from 3D to GPGPU/Media.
4610     *   Software must ensure HDC Pipeline flush and Generic Media State Clear
4611     *   is issued through a stalling PIPE_CONTROL command prior to programming
4612     *   of PIPELINE_SELECT command transitioning Pipeline Select from
4613     *   GPGPU/Media to 3D."
4614     *
4615     * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
4616     * because PIPE was not in MEDIA mode?!
4617     */
4618    bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
4619 
4620    if (cmd_buffer->state.current_pipeline == _3D) {
4621       bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4622               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
4623    } else {
4624       bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
4625    }
4626 #else
4627    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4628     * PIPELINE_SELECT [DevBWR+]":
4629     *
4630     *   Project: DEVSNB+
4631     *
4632     *   Software must ensure all the write caches are flushed through a
4633     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4634     *   command to invalidate read only caches prior to programming
4635     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4636     *
4637     * Note the cmd_buffer_apply_pipe_flushes will split this into two
4638     * PIPE_CONTROLs.
4639     */
4640    bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4641            ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4642            ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4643            ANV_PIPE_CS_STALL_BIT |
4644            ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4645            ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4646            ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4647            ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4648            ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
4649 #endif
4650 
4651    /* Wa_16013063087 -  State Cache Invalidate must be issued prior to
4652     * PIPELINE_SELECT when switching from 3D to Compute.
4653     *
4654     * SW must do this by programming of PIPECONTROL with “CS Stall” followed by
4655     * a PIPECONTROL with State Cache Invalidate bit set.
4656     *
4657     */
4658    if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
4659        intel_needs_workaround(cmd_buffer->device->info, 16013063087))
4660       bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
4661 
4662    anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
4663    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4664 
4665 #if GFX_VER == 9
4666    if (pipeline == _3D) {
4667       /* There is a mid-object preemption workaround which requires you to
4668        * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
4669        * even without preemption, we have issues with geometry flickering when
4670        * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
4671        * really know why.
4672        *
4673        * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4674        *
4675        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4676        *    the only bits that are changed are scoreboard related ..."
4677        *
4678        * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
4679        */
4680       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
4681          vfe.MaximumNumberofThreads =
4682             devinfo->max_cs_threads * devinfo->subslice_total - 1;
4683          vfe.NumberofURBEntries     = 2;
4684          vfe.URBEntryAllocationSize = 2;
4685       }
4686 
4687       /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
4688        * invalid. Set the compute pipeline to dirty to force a re-emit of the
4689        * pipeline in case we get back-to-back dispatch calls with the same
4690        * pipeline and a PIPELINE_SELECT in between.
4691        */
4692       cmd_buffer->state.compute.pipeline_dirty = true;
4693    }
4694 #endif
4695 
4696    genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
4697 
4698 #if GFX_VER == 9
4699    if (devinfo->platform == INTEL_PLATFORM_GLK) {
4700       /* Project: DevGLK
4701        *
4702        * "This chicken bit works around a hardware issue with barrier logic
4703        *  encountered when switching between GPGPU and 3D pipelines.  To
4704        *  workaround the issue, this mode bit should be set after a pipeline
4705        *  is selected."
4706        */
4707       anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
4708          scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
4709                                                   : GLK_BARRIER_MODE_3D_HULL;
4710          scec1.GLKBarrierModeMask = 1;
4711       }
4712    }
4713 #endif
4714 #endif /* else of if GFX_VER >= 20 */
4715    cmd_buffer->state.current_pipeline = pipeline;
4716 }
4717 
4718 void
genX(flush_pipeline_select_3d)4719 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4720 {
4721    genX(flush_pipeline_select)(cmd_buffer, _3D);
4722 }
4723 
4724 void
genX(flush_pipeline_select_gpgpu)4725 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4726 {
4727    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4728 }
4729 
4730 void
genX(cmd_buffer_emit_gfx12_depth_wa)4731 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
4732                                      const struct isl_surf *surf)
4733 {
4734 #if INTEL_NEEDS_WA_1808121037
4735    const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
4736                                surf->samples == 1;
4737 
4738    switch (cmd_buffer->state.depth_reg_mode) {
4739    case ANV_DEPTH_REG_MODE_HW_DEFAULT:
4740       if (!is_d16_1x_msaa)
4741          return;
4742       break;
4743    case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
4744       if (is_d16_1x_msaa)
4745          return;
4746       break;
4747    case ANV_DEPTH_REG_MODE_UNKNOWN:
4748       break;
4749    }
4750 
4751    /* We'll change some CHICKEN registers depending on the depth surface
4752     * format. Do a depth flush and stall so the pipeline is not using these
4753     * settings while we change the registers.
4754     */
4755    anv_add_pending_pipe_bits(cmd_buffer,
4756                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4757                              ANV_PIPE_DEPTH_STALL_BIT |
4758                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
4759                              "Workaround: Stop pipeline for 1808121037");
4760    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4761 
4762    /* Wa_1808121037
4763     *
4764     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
4765     * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
4766     */
4767    anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
4768       reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
4769       reg.HIZPlaneOptimizationdisablebitMask = true;
4770    }
4771 
4772    cmd_buffer->state.depth_reg_mode =
4773       is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
4774                        ANV_DEPTH_REG_MODE_HW_DEFAULT;
4775 #endif
4776 }
4777 
4778 #if GFX_VER == 9
4779 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4780  *
4781  *    "The VF cache needs to be invalidated before binding and then using
4782  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
4783  *    (at a 64B granularity) since the last invalidation.  A VF cache
4784  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
4785  *    bit in PIPE_CONTROL."
4786  *
4787  * This is implemented by carefully tracking all vertex and index buffer
4788  * bindings and flushing if the cache ever ends up with a range in the cache
4789  * that would exceed 4 GiB.  This is implemented in three parts:
4790  *
4791  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4792  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4793  *       tracking code of the new binding.  If this new binding would cause
4794  *       the cache to have a too-large range on the next draw call, a pipeline
4795  *       stall and VF cache invalidate are added to pending_pipeline_bits.
4796  *
4797  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4798  *       empty whenever we emit a VF invalidate.
4799  *
4800  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4801  *       after every 3DPRIMITIVE and copies the bound range into the dirty
4802  *       range for each used buffer.  This has to be a separate step because
4803  *       we don't always re-bind all buffers and so 1. can't know which
4804  *       buffers are actually bound.
4805  */
4806 void
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)4807 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4808                                                int vb_index,
4809                                                struct anv_address vb_address,
4810                                                uint32_t vb_size)
4811 {
4812    if (GFX_VER > 9)
4813       return;
4814 
4815    struct anv_vb_cache_range *bound, *dirty;
4816    if (vb_index == -1) {
4817       bound = &cmd_buffer->state.gfx.ib_bound_range;
4818       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4819    } else {
4820       assert(vb_index >= 0);
4821       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4822       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4823       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4824       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4825    }
4826 
4827    if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4828                                                   vb_address,
4829                                                   vb_size)) {
4830       anv_add_pending_pipe_bits(cmd_buffer,
4831                                 ANV_PIPE_CS_STALL_BIT |
4832                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4833                                 "vb > 32b range");
4834    }
4835 }
4836 
4837 void
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)4838 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4839                                                     uint32_t access_type,
4840                                                     uint64_t vb_used)
4841 {
4842    if (access_type == RANDOM) {
4843       /* We have an index buffer */
4844       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4845       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4846 
4847       anv_merge_vb_cache_range(dirty, bound);
4848    }
4849 
4850    uint64_t mask = vb_used;
4851    while (mask) {
4852       int i = u_bit_scan64(&mask);
4853       assert(i >= 0);
4854       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4855       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4856 
4857       struct anv_vb_cache_range *bound, *dirty;
4858       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4859       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4860 
4861       anv_merge_vb_cache_range(dirty, bound);
4862    }
4863 }
4864 #endif /* GFX_VER == 9 */
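/* Editorial sketch (an illustration of the three tracking steps described in
 * the comment above, not from the original source):
 *
 *    // 1. while emitting 3DSTATE_VERTEX_BUFFERS / 3DSTATE_INDEX_BUFFER:
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb_index,
 *                                                   vb_address, vb_size);
 *    // 2. genX(cmd_buffer_apply_pipe_flushes)() resets the tracking whenever
 *    //    a VF cache invalidate actually lands in the batch.
 *    // 3. after the 3DPRIMITIVE:
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
 *                                                        access_type, vb_used);
 */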
4865 
4866 /**
4867  * Update the pixel hashing modes that determine the balancing of PS threads
4868  * across subslices and slices.
4869  *
4870  * \param width Width bound of the rendering area (already scaled down if \p
4871  *              scale is greater than 1).
4872  * \param height Height bound of the rendering area (already scaled down if \p
4873  *               scale is greater than 1).
4874  * \param scale The number of framebuffer samples that could potentially be
4875  *              affected by an individual channel of the PS thread.  This is
4876  *              typically one for single-sampled rendering, but for operations
4877  *              like CCS resolves and fast clears a single PS invocation may
4878  *              update a huge number of pixels, in which case a finer
4879  *              balancing is desirable in order to maximally utilize the
4880  *              bandwidth available.  UINT_MAX can be used as shorthand for
4881  *              "finest hashing mode available".
4882  */
4883 void
genX(cmd_buffer_emit_hashing_mode)4884 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
4885                                    unsigned width, unsigned height,
4886                                    unsigned scale)
4887 {
4888 #if GFX_VER == 9
4889    const struct intel_device_info *devinfo = cmd_buffer->device->info;
4890    const unsigned slice_hashing[] = {
4891       /* Because all Gfx9 platforms with more than one slice require
4892        * three-way subslice hashing, a single "normal" 16x16 slice hashing
4893        * block is guaranteed to suffer from substantial imbalance, with one
4894        * subslice receiving twice as much work as the other two in the
4895        * slice.
4896        *
4897        * The performance impact of that would be particularly severe when
4898        * three-way hashing is also in use for slice balancing (which is the
4899        * case for all Gfx9 GT4 platforms), because one of the slices
4900        * receives one every three 16x16 blocks in either direction, which
4901        * is roughly the periodicity of the underlying subslice imbalance
4902        * pattern ("roughly" because in reality the hardware's
4903        * implementation of three-way hashing doesn't do exact modulo 3
4904        * arithmetic, which somewhat decreases the magnitude of this effect
4905        * in practice).  This leads to a systematic subslice imbalance
4906        * within that slice regardless of the size of the primitive.  The
4907        * 32x32 hashing mode guarantees that the subslice imbalance within a
4908        * single slice hashing block is minimal, largely eliminating this
4909        * effect.
4910        */
4911       _32x32,
4912       /* Finest slice hashing mode available. */
4913       NORMAL
4914    };
4915    const unsigned subslice_hashing[] = {
4916       /* 16x16 would provide a slight cache locality benefit especially
4917        * visible in the sampler L1 cache efficiency of low-bandwidth
4918        * non-LLC platforms, but it comes at the cost of greater subslice
4919        * imbalance for primitives of dimensions approximately intermediate
4920        * between 16x4 and 16x16.
4921        */
4922       _16x4,
4923       /* Finest subslice hashing mode available. */
4924       _8x4
4925    };
4926    /* Dimensions of the smallest hashing block of a given hashing mode.  If
4927     * the rendering area is smaller than this there can't possibly be any
4928     * benefit from switching to this mode, so we optimize out the
4929     * transition.
4930     */
4931    const unsigned min_size[][2] = {
4932          { 16, 4 },
4933          { 8, 4 }
4934    };
4935    const unsigned idx = scale > 1;
4936 
4937    if (cmd_buffer->state.current_hash_scale != scale &&
4938        (width > min_size[idx][0] || height > min_size[idx][1])) {
4939       anv_add_pending_pipe_bits(cmd_buffer,
4940                                 ANV_PIPE_CS_STALL_BIT |
4941                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
4942                                 "change pixel hash mode");
4943       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4944 
4945       anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
4946          gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
4947          gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
4948          gt.SubsliceHashing = subslice_hashing[idx];
4949          gt.SubsliceHashingMask = -1;
4950       }
4951 
4952       cmd_buffer->state.current_hash_scale = scale;
4953    }
4954 #endif
4955 }
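/* Illustrative only, not code from this file: callers pick the coarse
 * defaults for ordinary rendering and the finest hashing for operations
 * where one PS invocation touches many samples, e.g. (hypothetical call
 * sites and arguments):
 *
 *    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
 *    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, width, height, UINT_MAX);
 *
 * On anything other than Gfx9 the function compiles to a no-op.
 */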
4956 
4957 static void
4958 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4959 {
4960    struct anv_device *device = cmd_buffer->device;
4961    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4962 
4963    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4964                                         device->isl_dev.ds.size / 4);
4965    if (dw == NULL)
4966       return;
4967 
4968    struct isl_view isl_view = {};
4969    struct isl_depth_stencil_hiz_emit_info info = {
4970       .view = &isl_view,
4971       .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4972    };
4973 
4974    if (gfx->depth_att.iview != NULL) {
4975       isl_view = gfx->depth_att.iview->planes[0].isl;
4976    } else if (gfx->stencil_att.iview != NULL) {
4977       isl_view = gfx->stencil_att.iview->planes[0].isl;
4978    }
4979 
4980    if (gfx->view_mask) {
4981       assert(isl_view.array_len == 0 ||
4982              isl_view.array_len >= util_last_bit(gfx->view_mask));
4983       isl_view.array_len = util_last_bit(gfx->view_mask);
4984    } else {
4985       assert(isl_view.array_len == 0 ||
4986              isl_view.array_len >= util_last_bit(gfx->layer_count));
4987       isl_view.array_len = gfx->layer_count;
4988    }
4989 
4990    if (gfx->depth_att.iview != NULL) {
4991       const struct anv_image_view *iview = gfx->depth_att.iview;
4992       const struct anv_image *image = iview->image;
4993 
4994       const uint32_t depth_plane =
4995          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4996       const struct anv_surface *depth_surface =
4997          &image->planes[depth_plane].primary_surface;
4998       const struct anv_address depth_address =
4999          anv_image_address(image, &depth_surface->memory_range);
5000 
5001       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
5002 
5003       info.depth_surf = &depth_surface->isl;
5004       info.depth_address = anv_address_physical(depth_address);
5005       info.mocs =
5006          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
5007 
5008       info.hiz_usage = gfx->depth_att.aux_usage;
5009       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
5010          assert(isl_aux_usage_has_hiz(info.hiz_usage));
5011 
5012          const struct anv_surface *hiz_surface =
5013             &image->planes[depth_plane].aux_surface;
5014          const struct anv_address hiz_address =
5015             anv_image_address(image, &hiz_surface->memory_range);
5016 
5017          anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
5018 
5019          info.hiz_surf = &hiz_surface->isl;
5020          info.hiz_address = anv_address_physical(hiz_address);
5021 
5022          info.depth_clear_value = anv_image_hiz_clear_value(image).f32[0];
5023       }
5024    }
5025 
5026    if (gfx->stencil_att.iview != NULL) {
5027       const struct anv_image_view *iview = gfx->stencil_att.iview;
5028       const struct anv_image *image = iview->image;
5029 
5030       const uint32_t stencil_plane =
5031          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5032       const struct anv_surface *stencil_surface =
5033          &image->planes[stencil_plane].primary_surface;
5034       const struct anv_address stencil_address =
5035          anv_image_address(image, &stencil_surface->memory_range);
5036 
5037       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
5038 
5039       info.stencil_surf = &stencil_surface->isl;
5040 
5041       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
5042       info.stencil_address = anv_address_physical(stencil_address);
5043       info.mocs =
5044          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
5045    }
5046 
5047    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5048 
5049    if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
5050        intel_needs_workaround(cmd_buffer->device->info, 14014097488) ||
5051        intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
5052       /* Wa_1408224581
5053        *
5054        * Workaround (Gfx12LP A-step only): an additional pipe control with
5055        * post-sync = store dword operation is required, i.e. emit an extra
5056        * pipe control after the stencil state whenever the surface state
5057        * bits of this state change.
5058        *
5059        * This also seems sufficient to handle Wa_14014097488 and
5060        * Wa_14016712196.
5061        */
5062       genx_batch_emit_pipe_control_write(&cmd_buffer->batch, device->info,
5063                                          cmd_buffer->state.current_pipeline,
5064                                          WriteImmediateData,
5065                                          device->workaround_address, 0, 0);
5066    }
5067 
5068    if (info.depth_surf)
5069       genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
5070 
5071    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5072 }
5073 
5074 static void
5075 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
5076                                    const struct anv_image_view *fsr_iview)
5077 {
5078 #if GFX_VERx10 >= 125
5079    struct anv_device *device = cmd_buffer->device;
5080 
5081    if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
5082       return;
5083 
5084    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
5085                                         device->isl_dev.cpb.size / 4);
5086    if (dw == NULL)
5087       return;
5088 
5089    struct isl_cpb_emit_info info = { };
5090 
5091    if (fsr_iview) {
5092       const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
5093 
5094       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
5095 
5096       struct anv_address addr =
5097          anv_address_add(binding->address, binding->memory_range.offset);
5098 
5099       info.view = &fsr_iview->planes[0].isl;
5100       info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
5101       info.address = anv_address_physical(addr);
5102       info.mocs =
5103          anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
5104                   ISL_SURF_USAGE_CPB_BIT);
5105    }
5106 
5107    isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
5108 
5109    /* Wa_14016712196:
5110     * Emit dummy pipe control after state that sends implicit depth flush.
5111     */
5112    if (intel_needs_workaround(device->info, 14016712196)) {
5113       genx_batch_emit_pipe_control_write(&cmd_buffer->batch, device->info,
5114                                          cmd_buffer->state.current_pipeline,
5115                                          WriteImmediateData,
5116                                          device->workaround_address, 0, 0);
5117    }
5118 
5119 #endif /* GFX_VERx10 >= 125 */
5120 }
5121 
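/* Returns the layout the attachment is in when rendering begins.  The
 * VkRenderingAttachmentInitialLayoutInfoMESA struct (intended for the common
 * Vulkan runtime's render-pass emulation) lets the caller report a different
 * initial layout; CmdBeginRendering below compares it against
 * att->imageLayout and performs the layout transition itself when they
 * differ.
 */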
5122 static VkImageLayout
5123 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5124 {
5125    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5126       vk_find_struct_const(att->pNext,
5127                            RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5128    if (layout_info != NULL)
5129       return layout_info->initialLayout;
5130 
5131    return att->imageLayout;
5132 }
5133 
5134 void genX(CmdBeginRendering)(
5135     VkCommandBuffer                             commandBuffer,
5136     const VkRenderingInfo*                      pRenderingInfo)
5137 {
5138    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5139    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5140    VkResult result;
5141 
5142    if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
5143       assert(!"Trying to start a render pass on non-render queue!");
5144       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5145       return;
5146    }
5147 
5148    anv_measure_beginrenderpass(cmd_buffer);
5149    trace_intel_begin_render_pass(&cmd_buffer->trace);
5150 
5151    gfx->rendering_flags = pRenderingInfo->flags;
5152    gfx->view_mask = pRenderingInfo->viewMask;
5153    gfx->layer_count = pRenderingInfo->layerCount;
5154    gfx->samples = 0;
5155 
5156    if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
5157        gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
5158        gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
5159        gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
5160       gfx->render_area = pRenderingInfo->renderArea;
5161       gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
5162    }
5163 
5164    const bool is_multiview = gfx->view_mask != 0;
5165    const VkRect2D render_area = gfx->render_area;
5166    const uint32_t layers =
5167       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5168 
5169    /* The framebuffer size is at least large enough to contain the render
5170     * area.  Because a zero renderArea is possible, we MAX with 1.
5171     */
5172    struct isl_extent3d fb_size = {
5173       .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5174       .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5175       .d = layers,
5176    };
5177 
5178    const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5179    result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5180    if (result != VK_SUCCESS)
5181       return;
5182 
5183    genX(flush_pipeline_select_3d)(cmd_buffer);
5184 
5185    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5186       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
5187          continue;
5188 
5189       const VkRenderingAttachmentInfo *att =
5190          &pRenderingInfo->pColorAttachments[i];
5191       ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5192       const VkImageLayout initial_layout = attachment_initial_layout(att);
5193 
5194       assert(render_area.offset.x + render_area.extent.width <=
5195              iview->vk.extent.width);
5196       assert(render_area.offset.y + render_area.extent.height <=
5197              iview->vk.extent.height);
5198       assert(layers <= iview->vk.layer_count);
5199 
5200       fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5201       fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5202 
5203       assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5204       gfx->samples |= iview->vk.image->samples;
5205 
5206       enum isl_aux_usage aux_usage =
5207          anv_layout_to_aux_usage(cmd_buffer->device->info,
5208                                  iview->image,
5209                                  VK_IMAGE_ASPECT_COLOR_BIT,
5210                                  VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5211                                  att->imageLayout,
5212                                  cmd_buffer->queue_family->queueFlags);
5213 
5214       union isl_color_value fast_clear_color = { .u32 = { 0, } };
5215 
5216       if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5217           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5218          const union isl_color_value clear_color =
5219             vk_to_isl_color_with_format(att->clearValue.color,
5220                                         iview->planes[0].isl.format);
5221 
5222          /* We only support fast-clears on the first layer */
5223          const bool fast_clear =
5224             (!is_multiview || (gfx->view_mask & 1)) &&
5225             anv_can_fast_clear_color_view(cmd_buffer->device, iview,
5226                                           att->imageLayout, clear_color,
5227                                           layers, render_area,
5228                                           cmd_buffer->queue_family->queueFlags);
5229 
5230          if (att->imageLayout != initial_layout) {
5231             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5232                    render_area.extent.width == iview->vk.extent.width &&
5233                    render_area.extent.height == iview->vk.extent.height);
5234             if (is_multiview) {
5235                u_foreach_bit(view, gfx->view_mask) {
5236                   transition_color_buffer(cmd_buffer, iview->image,
5237                                           VK_IMAGE_ASPECT_COLOR_BIT,
5238                                           iview->vk.base_mip_level, 1,
5239                                           iview->vk.base_array_layer + view,
5240                                           1, /* layer_count */
5241                                           initial_layout, att->imageLayout,
5242                                           VK_QUEUE_FAMILY_IGNORED,
5243                                           VK_QUEUE_FAMILY_IGNORED,
5244                                           fast_clear);
5245                }
5246             } else {
5247                transition_color_buffer(cmd_buffer, iview->image,
5248                                        VK_IMAGE_ASPECT_COLOR_BIT,
5249                                        iview->vk.base_mip_level, 1,
5250                                        iview->vk.base_array_layer,
5251                                        gfx->layer_count,
5252                                        initial_layout, att->imageLayout,
5253                                        VK_QUEUE_FAMILY_IGNORED,
5254                                        VK_QUEUE_FAMILY_IGNORED,
5255                                        fast_clear);
5256             }
5257          }
5258 
5259          uint32_t clear_view_mask = pRenderingInfo->viewMask;
5260          uint32_t base_clear_layer = iview->vk.base_array_layer;
5261          uint32_t clear_layer_count = gfx->layer_count;
5262          if (fast_clear) {
5263             /* We only support fast-clears on the first layer */
5264             assert(iview->vk.base_mip_level == 0 &&
5265                    iview->vk.base_array_layer == 0);
5266 
5267             fast_clear_color = clear_color;
5268 
5269             if (iview->image->vk.samples == 1) {
5270                anv_image_ccs_op(cmd_buffer, iview->image,
5271                                 iview->planes[0].isl.format,
5272                                 iview->planes[0].isl.swizzle,
5273                                 VK_IMAGE_ASPECT_COLOR_BIT,
5274                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5275                                 &fast_clear_color,
5276                                 false);
5277             } else {
5278                anv_image_mcs_op(cmd_buffer, iview->image,
5279                                 iview->planes[0].isl.format,
5280                                 iview->planes[0].isl.swizzle,
5281                                 VK_IMAGE_ASPECT_COLOR_BIT,
5282                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
5283                                 &fast_clear_color,
5284                                 false);
5285             }
5286             clear_view_mask &= ~1u;
5287             base_clear_layer++;
5288             clear_layer_count--;
5289 #if GFX_VER < 20
5290             genX(set_fast_clear_state)(cmd_buffer, iview->image,
5291                                        iview->planes[0].isl.format,
5292                                        clear_color);
5293 #endif
5294          }
5295 
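         /* Any views/layers not covered by the fast clear above get a
          * regular slow clear through the attachment's aux usage.
          */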
5296          if (is_multiview) {
5297             u_foreach_bit(view, clear_view_mask) {
5298                anv_image_clear_color(cmd_buffer, iview->image,
5299                                      VK_IMAGE_ASPECT_COLOR_BIT,
5300                                      aux_usage,
5301                                      iview->planes[0].isl.format,
5302                                      iview->planes[0].isl.swizzle,
5303                                      iview->vk.base_mip_level,
5304                                      iview->vk.base_array_layer + view, 1,
5305                                      render_area, clear_color);
5306             }
5307          } else {
5308             anv_image_clear_color(cmd_buffer, iview->image,
5309                                   VK_IMAGE_ASPECT_COLOR_BIT,
5310                                   aux_usage,
5311                                   iview->planes[0].isl.format,
5312                                   iview->planes[0].isl.swizzle,
5313                                   iview->vk.base_mip_level,
5314                                   base_clear_layer, clear_layer_count,
5315                                   render_area, clear_color);
5316          }
5317       } else {
5318          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5319          assert(att->imageLayout == initial_layout);
5320       }
5321 
5322       gfx->color_att[i].vk_format = iview->vk.format;
5323       gfx->color_att[i].iview = iview;
5324       gfx->color_att[i].layout = att->imageLayout;
5325       gfx->color_att[i].aux_usage = aux_usage;
5326 
5327       struct isl_view isl_view = iview->planes[0].isl;
5328       if (pRenderingInfo->viewMask) {
5329          assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5330          isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5331       } else {
5332          assert(isl_view.array_len >= pRenderingInfo->layerCount);
5333          isl_view.array_len = pRenderingInfo->layerCount;
5334       }
5335 
5336       anv_image_fill_surface_state(cmd_buffer->device,
5337                                    iview->image,
5338                                    VK_IMAGE_ASPECT_COLOR_BIT,
5339                                    &isl_view,
5340                                    ISL_SURF_USAGE_RENDER_TARGET_BIT,
5341                                    aux_usage, &fast_clear_color,
5342                                    0, /* anv_image_view_state_flags */
5343                                    &gfx->color_att[i].surface_state);
5344 
5345       add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
5346 
5347       if (GFX_VER < 10 &&
5348           (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5349            (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5350           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5351           iview->planes[0].isl.base_level == 0 &&
5352           iview->planes[0].isl.base_array_layer == 0) {
5353          genX(load_image_clear_color)(cmd_buffer,
5354                                       gfx->color_att[i].surface_state.state,
5355                                       iview->image);
5356       }
5357 
5358       if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5359          gfx->color_att[i].resolve_mode = att->resolveMode;
5360          gfx->color_att[i].resolve_iview =
5361             anv_image_view_from_handle(att->resolveImageView);
5362          gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5363       }
5364    }
5365 
5366    anv_cmd_graphic_state_update_has_uint_rt(gfx);
5367 
5368    const struct anv_image_view *fsr_iview = NULL;
5369    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
5370       vk_find_struct_const(pRenderingInfo->pNext,
5371                            RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
5372    if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
5373       fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
5374       /* imageLayout and shadingRateAttachmentTexelSize are ignored */
5375    }
5376 
5377    const struct anv_image_view *ds_iview = NULL;
5378    const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5379    const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5380    if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5381        (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5382       const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5383       VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5384       VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5385       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5386       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5387       enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5388       enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5389       VkClearDepthStencilValue clear_value = {};
5390 
5391       if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5392          d_iview = anv_image_view_from_handle(d_att->imageView);
5393          initial_depth_layout = attachment_initial_layout(d_att);
5394          depth_layout = d_att->imageLayout;
5395          depth_aux_usage =
5396             anv_layout_to_aux_usage(cmd_buffer->device->info,
5397                                     d_iview->image,
5398                                     VK_IMAGE_ASPECT_DEPTH_BIT,
5399                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5400                                     depth_layout,
5401                                     cmd_buffer->queue_family->queueFlags);
5402          clear_value.depth = d_att->clearValue.depthStencil.depth;
5403       }
5404 
5405       if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5406          s_iview = anv_image_view_from_handle(s_att->imageView);
5407          initial_stencil_layout = attachment_initial_layout(s_att);
5408          stencil_layout = s_att->imageLayout;
5409          stencil_aux_usage =
5410             anv_layout_to_aux_usage(cmd_buffer->device->info,
5411                                     s_iview->image,
5412                                     VK_IMAGE_ASPECT_STENCIL_BIT,
5413                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5414                                     stencil_layout,
5415                                     cmd_buffer->queue_family->queueFlags);
5416          clear_value.stencil = s_att->clearValue.depthStencil.stencil;
5417       }
5418 
5419       assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5420       ds_iview = d_iview != NULL ? d_iview : s_iview;
5421       assert(ds_iview != NULL);
5422 
5423       assert(render_area.offset.x + render_area.extent.width <=
5424              ds_iview->vk.extent.width);
5425       assert(render_area.offset.y + render_area.extent.height <=
5426              ds_iview->vk.extent.height);
5427       assert(layers <= ds_iview->vk.layer_count);
5428 
5429       fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5430       fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5431 
5432       assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5433       gfx->samples |= ds_iview->vk.image->samples;
5434 
5435       VkImageAspectFlags clear_aspects = 0;
5436       if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5437           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5438          clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5439       if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5440           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5441          clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5442 
5443       if (clear_aspects != 0) {
5444          const bool hiz_clear =
5445             anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5446                                       depth_layout, clear_aspects,
5447                                       clear_value.depth,
5448                                       render_area,
5449                                       cmd_buffer->queue_family->queueFlags);
5450 
5451          if (depth_layout != initial_depth_layout) {
5452             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5453                    render_area.extent.width == d_iview->vk.extent.width &&
5454                    render_area.extent.height == d_iview->vk.extent.height);
5455 
5456             if (is_multiview) {
5457                u_foreach_bit(view, gfx->view_mask) {
5458                   transition_depth_buffer(cmd_buffer, d_iview->image,
5459                                           d_iview->vk.base_mip_level, 1,
5460                                           d_iview->vk.base_array_layer + view,
5461                                           1 /* layer_count */,
5462                                           initial_depth_layout, depth_layout,
5463                                           hiz_clear);
5464                }
5465             } else {
5466                transition_depth_buffer(cmd_buffer, d_iview->image,
5467                                        d_iview->vk.base_mip_level, 1,
5468                                        d_iview->vk.base_array_layer,
5469                                        gfx->layer_count,
5470                                        initial_depth_layout, depth_layout,
5471                                        hiz_clear);
5472             }
5473          }
5474 
5475          if (stencil_layout != initial_stencil_layout) {
5476             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5477                    render_area.extent.width == s_iview->vk.extent.width &&
5478                    render_area.extent.height == s_iview->vk.extent.height);
5479 
5480             if (is_multiview) {
5481                u_foreach_bit(view, gfx->view_mask) {
5482                   transition_stencil_buffer(cmd_buffer, s_iview->image,
5483                                             s_iview->vk.base_mip_level, 1,
5484                                             s_iview->vk.base_array_layer + view,
5485                                             1 /* layer_count */,
5486                                             initial_stencil_layout,
5487                                             stencil_layout,
5488                                             hiz_clear);
5489                }
5490             } else {
5491                transition_stencil_buffer(cmd_buffer, s_iview->image,
5492                                          s_iview->vk.base_mip_level, 1,
5493                                          s_iview->vk.base_array_layer,
5494                                          gfx->layer_count,
5495                                          initial_stencil_layout,
5496                                          stencil_layout,
5497                                          hiz_clear);
5498             }
5499          }
5500 
5501          if (is_multiview) {
5502             u_foreach_bit(view, gfx->view_mask) {
5503                uint32_t level = ds_iview->vk.base_mip_level;
5504                uint32_t layer = ds_iview->vk.base_array_layer + view;
5505 
5506                if (hiz_clear) {
5507                   anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5508                                       clear_aspects,
5509                                       level, layer, 1,
5510                                       render_area, &clear_value);
5511                } else {
5512                   anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5513                                                 clear_aspects,
5514                                                 depth_aux_usage,
5515                                                 level, layer, 1,
5516                                                 render_area, &clear_value);
5517                }
5518             }
5519          } else {
5520             uint32_t level = ds_iview->vk.base_mip_level;
5521             uint32_t base_layer = ds_iview->vk.base_array_layer;
5522             uint32_t layer_count = gfx->layer_count;
5523 
5524             if (hiz_clear) {
5525                anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5526                                    clear_aspects,
5527                                    level, base_layer, layer_count,
5528                                    render_area, &clear_value);
5529             } else {
5530                anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5531                                              clear_aspects,
5532                                              depth_aux_usage,
5533                                              level, base_layer, layer_count,
5534                                              render_area, &clear_value);
5535             }
5536          }
5537       } else {
5538          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5539          assert(depth_layout == initial_depth_layout);
5540          assert(stencil_layout == initial_stencil_layout);
5541       }
5542 
5543       if (d_iview != NULL) {
5544          gfx->depth_att.vk_format = d_iview->vk.format;
5545          gfx->depth_att.iview = d_iview;
5546          gfx->depth_att.layout = depth_layout;
5547          gfx->depth_att.aux_usage = depth_aux_usage;
5548          if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5549             assert(d_att->resolveImageView != VK_NULL_HANDLE);
5550             gfx->depth_att.resolve_mode = d_att->resolveMode;
5551             gfx->depth_att.resolve_iview =
5552                anv_image_view_from_handle(d_att->resolveImageView);
5553             gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5554          }
5555       }
5556 
5557       if (s_iview != NULL) {
5558          gfx->stencil_att.vk_format = s_iview->vk.format;
5559          gfx->stencil_att.iview = s_iview;
5560          gfx->stencil_att.layout = stencil_layout;
5561          gfx->stencil_att.aux_usage = stencil_aux_usage;
5562          if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5563             assert(s_att->resolveImageView != VK_NULL_HANDLE);
5564             gfx->stencil_att.resolve_mode = s_att->resolveMode;
5565             gfx->stencil_att.resolve_iview =
5566                anv_image_view_from_handle(s_att->resolveImageView);
5567             gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5568          }
5569       }
5570    }
5571 
5572    /* Finally, now that we know the right size, set up the null surface */
5573    assert(util_bitcount(gfx->samples) <= 1);
5574    isl_null_fill_state(&cmd_buffer->device->isl_dev,
5575                        gfx->null_surface_state.map,
5576                        .size = fb_size);
5577 
5578    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5579       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5580          continue;
5581 
5582       isl_null_fill_state(&cmd_buffer->device->isl_dev,
5583                           gfx->color_att[i].surface_state.state.map,
5584                           .size = fb_size);
5585    }
5586 
5587    /****** We can now start emitting code to begin the render pass ******/
5588 
5589    gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5590 
5591    /* It is possible to start a render pass with an old pipeline.  Because the
5592     * render pass and subpass index are both baked into the pipeline, this is
5593     * highly unlikely.  In order to do so, it requires that you have a render
5594     * pass with a single subpass and that you use that render pass twice
5595     * back-to-back and use the same pipeline at the start of the second render
5596     * pass as at the end of the first.  In order to avoid unpredictable issues
5597     * with this edge case, we just dirty the pipeline at the start of every
5598     * subpass.
5599     */
5600    gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5601 
5602 #if GFX_VER >= 11
5603    bool has_color_att = false;
5604    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5605       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) {
5606          has_color_att = true;
5607          break;
5608       }
5609    }
5610    if (has_color_att) {
5611       /* The PIPE_CONTROL command description says:
5612        *
5613        *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
5614        *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
5615        *     Target Cache Flush by enabling this bit. When render target flush
5616        *     is set due to new association of BTI, PS Scoreboard Stall bit must
5617        *     be set in this packet."
5618        *
5619        * We assume that a new BeginRendering is always changing the RTs, which
5620        * may not be true and cause excessive flushing.  We can trivially skip it
5621        * in the case that there are no RTs (depth-only rendering), though.
5622        */
5623       anv_add_pending_pipe_bits(cmd_buffer,
5624                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5625                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
5626                                 "change RT");
5627    }
5628 #endif
5629 
5630    cmd_buffer_emit_depth_stencil(cmd_buffer);
5631 
5632    cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
5633 }
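/* Illustrative only, not driver code: an application reaches
 * genX(CmdBeginRendering) above through vkCmdBeginRendering() with a
 * VkRenderingInfo along these lines (names and values are made up for the
 * example):
 *
 *    VkRenderingAttachmentInfo color_att = {
 *       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
 *       .imageView = color_view,
 *       .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
 *       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
 *       .clearValue = { .color = { .float32 = { 0, 0, 0, 1 } } },
 *    };
 *    VkRenderingInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
 *       .renderArea = { .offset = { 0, 0 }, .extent = { 1920, 1080 } },
 *       .layerCount = 1,
 *       .colorAttachmentCount = 1,
 *       .pColorAttachments = &color_att,
 *    };
 *    vkCmdBeginRendering(cmd, &info);
 */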
5634 
5635 static void
5636 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5637                                    struct anv_attachment *att,
5638                                    VkImageAspectFlagBits aspect)
5639 {
5640 #if GFX_VER < 20
5641    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5642    const struct anv_image_view *iview = att->iview;
5643 
5644    if (iview == NULL)
5645       return;
5646 
5647    if (gfx->view_mask == 0) {
5648       genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5649                                           aspect, att->aux_usage,
5650                                           iview->planes[0].isl.base_level,
5651                                           iview->planes[0].isl.base_array_layer,
5652                                           gfx->layer_count);
5653    } else {
5654       uint32_t res_view_mask = gfx->view_mask;
5655       while (res_view_mask) {
5656          int i = u_bit_scan(&res_view_mask);
5657 
5658          const uint32_t level = iview->planes[0].isl.base_level;
5659          const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5660 
5661          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5662                                              aspect, att->aux_usage,
5663                                              level, layer, 1);
5664       }
5665    }
5666 #endif
5667 }
5668 
5669 void genX(CmdEndRendering)(
5670     VkCommandBuffer                             commandBuffer)
5671 {
5672    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5673    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5674 
5675    if (anv_batch_has_error(&cmd_buffer->batch))
5676       return;
5677 
5678    const bool is_multiview = gfx->view_mask != 0;
5679    const uint32_t layers =
5680       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5681 
5682    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5683       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5684                                          VK_IMAGE_ASPECT_COLOR_BIT);
5685    }
5686 
5687    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5688                                        VK_IMAGE_ASPECT_DEPTH_BIT);
5689 
5690    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5691                                        VK_IMAGE_ASPECT_STENCIL_BIT);
5692 
5693 
5694    if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5695       bool has_color_resolve = false;
5696       bool has_sparse_color_resolve = false;
5697 
5698       for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5699          if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE) {
5700             has_color_resolve = true;
5701             if (anv_image_is_sparse(gfx->color_att[i].iview->image))
5702                has_sparse_color_resolve = true;
5703          }
5704       }
5705 
5706       if (has_color_resolve) {
5707          /* We are about to do some MSAA resolves.  We need to flush so that
5708           * the result of writes to the MSAA color attachments show up in the
5709           * sampler when we blit to the single-sampled resolve target.
5710           */
5711          anv_add_pending_pipe_bits(cmd_buffer,
5712                                    ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5713                                    ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5714                                    "MSAA resolve");
5715       }
5716 
5717       const bool has_depth_resolve =
5718          gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE;
5719       const bool has_stencil_resolve =
5720          gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE;
5721       const bool has_sparse_depth_resolve =
5722          has_depth_resolve &&
5723          anv_image_is_sparse(gfx->depth_att.iview->image);
5724       const bool has_sparse_stencil_resolve =
5725          has_stencil_resolve &&
5726          anv_image_is_sparse(gfx->stencil_att.iview->image);
5727 
5728       if (has_depth_resolve || has_stencil_resolve) {
5729          /* We are about to do some MSAA resolves.  We need to flush so that
5730           * the result of writes to the MSAA depth attachments show up in the
5731           * sampler when we blit to the single-sampled resolve target.
5732           */
5733          anv_add_pending_pipe_bits(cmd_buffer,
5734                                    ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5735                                    ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5736                                    "MSAA resolve");
5737       }
5738 
5739       if (has_sparse_color_resolve || has_sparse_depth_resolve ||
5740           has_sparse_stencil_resolve) {
5741          /* If the resolve image is sparse we need some extra bits to make
5742           * sure unbound regions read 0, as residencyNonResidentStrict
5743           * mandates.
5744           */
5745          anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT,
5746                                    "sparse MSAA resolve");
5747       }
5748 
5749       for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5750          const struct anv_attachment *att = &gfx->color_att[i];
5751          if (att->resolve_mode == VK_RESOLVE_MODE_NONE)
5752             continue;
5753 
5754          anv_attachment_msaa_resolve(cmd_buffer, att, att->layout,
5755                                      VK_IMAGE_ASPECT_COLOR_BIT);
5756       }
5757 
5758       if (has_depth_resolve) {
5759          const struct anv_image_view *src_iview = gfx->depth_att.iview;
5760 
5761          /* MSAA resolves sample from the source attachment.  Transition the
5762           * depth attachment first to get rid of any HiZ that we may not be
5763           * able to handle.
5764           */
5765          transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
5766                                  src_iview->planes[0].isl.base_array_layer,
5767                                  layers,
5768                                  gfx->depth_att.layout,
5769                                  VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5770                                  false /* will_full_fast_clear */);
5771 
5772          anv_attachment_msaa_resolve(cmd_buffer, &gfx->depth_att,
5773                                      VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5774                                      VK_IMAGE_ASPECT_DEPTH_BIT);
5775 
5776          /* Transition the source back to the original layout.  This seems a
5777           * bit inefficient but, since HiZ resolves aren't destructive, going
5778           * from less HiZ to more is generally a no-op.
5779           */
5780          transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
5781                                  src_iview->planes[0].isl.base_array_layer,
5782                                  layers,
5783                                  VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5784                                  gfx->depth_att.layout,
5785                                  false /* will_full_fast_clear */);
5786       }
5787 
5788       if (has_stencil_resolve) {
5789          anv_attachment_msaa_resolve(cmd_buffer, &gfx->stencil_att,
5790                                      gfx->stencil_att.layout,
5791                                      VK_IMAGE_ASPECT_STENCIL_BIT);
5792       }
5793    }
5794 
5795    trace_intel_end_render_pass(&cmd_buffer->trace,
5796                                gfx->render_area.extent.width,
5797                                gfx->render_area.extent.height,
5798                                gfx->color_att_count,
5799                                gfx->samples);
5800 
5801    anv_cmd_buffer_reset_rendering(cmd_buffer);
5802 }
5803 
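/* Arm the MI predicate from the result precomputed by
 * CmdBeginConditionalRenderingEXT below: ANV_PREDICATE_RESULT_REG is copied
 * into MI_PREDICATE_SRC0 and compared against zero in MI_PREDICATE_SRC1, and
 * that comparison drives whether subsequently predicated rendering commands
 * are executed or skipped.
 */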
5804 void
5805 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5806 {
5807    struct mi_builder b;
5808    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5809 
5810    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5811                 mi_reg32(ANV_PREDICATE_RESULT_REG));
5812    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5813 
5814    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5815       mip.LoadOperation    = LOAD_LOADINV;
5816       mip.CombineOperation = COMBINE_SET;
5817       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5818    }
5819 }
5820 
5821 void genX(CmdBeginConditionalRenderingEXT)(
5822    VkCommandBuffer                             commandBuffer,
5823    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
5824 {
5825    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5826    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5827    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5828    struct anv_address value_address =
5829       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5830 
5831    const bool isInverted = pConditionalRenderingBegin->flags &
5832                            VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5833 
5834    cmd_state->conditional_render_enabled = true;
5835 
5836    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5837 
5838    struct mi_builder b;
5839    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5840    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
5841    mi_builder_set_mocs(&b, mocs);
5842 
5843    /* Section 19.4 of the Vulkan 1.1.85 spec says:
5844     *
5845     *    If the value of the predicate in buffer memory changes
5846     *    while conditional rendering is active, the rendering commands
5847     *    may be discarded in an implementation-dependent way.
5848     *    Some implementations may latch the value of the predicate
5849     *    upon beginning conditional rendering while others
5850     *    may read it before every rendering command.
5851     *
5852     * So it's perfectly fine to read a value from the buffer once.
5853     */
5854    struct mi_value value = mi_mem32(value_address);
5855 
5856    /* Precompute the predicate result.  This is necessary to support
5857     * secondary command buffers, since it is unknown whether conditional
5858     * rendering is inverted when they are populated.
5859     */
5860    mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5861                 isInverted ? mi_uge(&b, mi_imm(0), value) :
5862                              mi_ult(&b, mi_imm(0), value));
5863 }
5864 
5865 void genX(CmdEndConditionalRenderingEXT)(
5866    VkCommandBuffer                             commandBuffer)
5867 {
5868    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5869    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5870 
5871    cmd_state->conditional_render_enabled = false;
5872 }
5873 
5874 /* Set of stage bits that are pipelined, i.e. they get queued
5875  * by the command streamer for later execution.
5876  */
5877 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5878    ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5879      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5880      VK_PIPELINE_STAGE_2_HOST_BIT | \
5881      VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
5882 
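/* Signal the event from the GPU.  On the video queue this is a MI_FLUSH_DW
 * with a post-sync immediate write of VK_EVENT_SET; on other queues it is a
 * post-sync PIPE_CONTROL write, with a CS stall and pixel scoreboard stall
 * added when the source stage mask contains pipelined stages so the write
 * lands only after that work has drained.
 */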
5883 void genX(CmdSetEvent2)(
5884     VkCommandBuffer                             commandBuffer,
5885     VkEvent                                     _event,
5886     const VkDependencyInfo*                     pDependencyInfo)
5887 {
5888    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5889    ANV_FROM_HANDLE(anv_event, event, _event);
5890 
5891    if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5892       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5893          flush.PostSyncOperation = WriteImmediateData;
5894          flush.Address = anv_state_pool_state_address(
5895             &cmd_buffer->device->dynamic_state_pool,
5896             event->state);
5897          flush.ImmediateData = VK_EVENT_SET;
5898       }
5899       return;
5900    }
5901 
5902    VkPipelineStageFlags2 src_stages = 0;
5903 
5904    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5905       src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5906    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5907       src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5908    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5909       src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5910 
5911    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5912    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5913 
5914    enum anv_pipe_bits pc_bits = 0;
5915    if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5916       pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5917       pc_bits |= ANV_PIPE_CS_STALL_BIT;
5918    }
5919 
5920    genx_batch_emit_pipe_control_write
5921       (&cmd_buffer->batch, cmd_buffer->device->info,
5922        cmd_buffer->state.current_pipeline, WriteImmediateData,
5923        anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5924                                     event->state),
5925        VK_EVENT_SET, pc_bits);
5926 }
5927 
5928 void genX(CmdResetEvent2)(
5929     VkCommandBuffer                             commandBuffer,
5930     VkEvent                                     _event,
5931     VkPipelineStageFlags2                       stageMask)
5932 {
5933    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5934    ANV_FROM_HANDLE(anv_event, event, _event);
5935 
5936    if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5937       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5938          flush.PostSyncOperation = WriteImmediateData;
5939          flush.Address = anv_state_pool_state_address(
5940             &cmd_buffer->device->dynamic_state_pool,
5941             event->state);
5942          flush.ImmediateData = VK_EVENT_RESET;
5943       }
5944       return;
5945    }
5946 
5947    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5948    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5949 
5950    enum anv_pipe_bits pc_bits = 0;
5951    if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5952       pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5953       pc_bits |= ANV_PIPE_CS_STALL_BIT;
5954    }
5955 
5956    genx_batch_emit_pipe_control_write
5957       (&cmd_buffer->batch, cmd_buffer->device->info,
5958        cmd_buffer->state.current_pipeline, WriteImmediateData,
5959        anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5960                                     event->state),
5961        VK_EVENT_RESET,
5962        pc_bits);
5963 }
5964 
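/* Waiting on events is implemented with MI_SEMAPHORE_WAIT in polling mode:
 * the command streamer spins until the event's dword in the dynamic state
 * pool equals VK_EVENT_SET, after which the accompanying dependency info is
 * applied as a regular barrier.
 */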
5965 void genX(CmdWaitEvents2)(
5966     VkCommandBuffer                             commandBuffer,
5967     uint32_t                                    eventCount,
5968     const VkEvent*                              pEvents,
5969     const VkDependencyInfo*                     pDependencyInfos)
5970 {
5971    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5972 
5973    for (uint32_t i = 0; i < eventCount; i++) {
5974       ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
5975 
5976       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5977          sem.WaitMode            = PollingMode;
5978          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
5979          sem.SemaphoreDataDword  = VK_EVENT_SET;
5980          sem.SemaphoreAddress    = anv_state_pool_state_address(
5981             &cmd_buffer->device->dynamic_state_pool,
5982             event->state);
5983       }
5984    }
5985 
5986    cmd_buffer_barrier(cmd_buffer, eventCount, pDependencyInfos, "wait event");
5987 }
5988 
5989 static uint32_t vk_to_intel_index_type(VkIndexType type)
5990 {
5991    switch (type) {
5992    case VK_INDEX_TYPE_UINT8_KHR:
5993       return INDEX_BYTE;
5994    case VK_INDEX_TYPE_UINT16:
5995       return INDEX_WORD;
5996    case VK_INDEX_TYPE_UINT32:
5997       return INDEX_DWORD;
5998    default:
5999       unreachable("invalid index type");
6000    }
6001 }
6002 
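/* Bind an index buffer.  The primitive restart index and the index buffer
 * binding are cached in anv_cmd_buffer state, and the corresponding dirty
 * bits, which trigger re-emission of the relevant state on the next draw,
 * are only raised when something actually changed.
 */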
6003 void genX(CmdBindIndexBuffer2KHR)(
6004     VkCommandBuffer                             commandBuffer,
6005     VkBuffer                                    _buffer,
6006     VkDeviceSize                                offset,
6007     VkDeviceSize                                size,
6008     VkIndexType                                 indexType)
6009 {
6010    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6011    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
6012 
6013    uint32_t restart_index = vk_index_to_restart(indexType);
6014    if (cmd_buffer->state.gfx.restart_index != restart_index) {
6015       cmd_buffer->state.gfx.restart_index = restart_index;
6016       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
6017    }
6018 
6019    uint32_t index_type = vk_to_intel_index_type(indexType);
6020    if (cmd_buffer->state.gfx.index_buffer != buffer ||
6021        cmd_buffer->state.gfx.index_type != index_type ||
6022        cmd_buffer->state.gfx.index_offset != offset) {
6023       cmd_buffer->state.gfx.index_buffer = buffer;
6024       cmd_buffer->state.gfx.index_type = index_type;
6025       cmd_buffer->state.gfx.index_offset = offset;
6026       cmd_buffer->state.gfx.index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
6027       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
6028    }
6029 }
6030 
6031 VkResult genX(CmdSetPerformanceOverrideINTEL)(
6032     VkCommandBuffer                             commandBuffer,
6033     const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
6034 {
6035    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6036 
6037    switch (pOverrideInfo->type) {
6038    case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
6039       anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
6040          csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
6041          csdm2.MediaInstructionDisable = pOverrideInfo->enable;
6042          csdm2._3DRenderingInstructionDisableMask = true;
6043          csdm2.MediaInstructionDisableMask = true;
6044       }
6045       break;
6046    }
6047 
6048    case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
6049       if (pOverrideInfo->enable) {
6050          /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
6051          anv_add_pending_pipe_bits(cmd_buffer,
6052                                    ANV_PIPE_FLUSH_BITS |
6053                                    ANV_PIPE_INVALIDATE_BITS,
6054                                    "perf counter isolation");
6055          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6056       }
6057       break;
6058 
6059    default:
6060       unreachable("Invalid override");
6061    }
6062 
6063    return VK_SUCCESS;
6064 }
6065 
6066 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
6067     VkCommandBuffer                             commandBuffer,
6068     const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
6069 {
6070    /* TODO: Waiting on the register to write, might depend on generation. */
6071 
6072    return VK_SUCCESS;
6073 }
6074 
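/* MMIO offset of the command streamer TIMESTAMP register, read below with a
 * register-to-memory store for top-of-pipe timestamp captures.
 */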
6075 #define TIMESTAMP 0x2358
6076 
6077 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
6078                               struct anv_device *device,
6079                               struct anv_address addr,
6080                               enum anv_timestamp_capture_type type,
6081                               void *data) {
6082    /* Make sure the ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
6083     * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on
6084     * the copy/video engines.
6085     */
6086    if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
6087        (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
6088       assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
6089              type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
6090    }
6091 
6092    switch (type) {
6093    case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
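      /* Read the TIMESTAMP register with MI_STORE_REGISTER_MEM; the value is
       * captured when the command streamer parses the command, i.e. without
       * waiting for previous work to complete.
       */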
6094       struct mi_builder b;
6095       mi_builder_init(&b, device->info, batch);
6096       mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
6097       break;
6098    }
6099 
6100    case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
6101       if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
6102           (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
6103          /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6104          if (intel_needs_workaround(device->info, 16018063123))
6105             genX(batch_emit_fast_color_dummy_blit)(batch, device);
6106          anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
6107             fd.PostSyncOperation = WriteTimestamp;
6108             fd.Address = addr;
6109          }
6110       } else {
6111          genx_batch_emit_pipe_control_write(batch, device->info, 0,
6112                                             WriteTimestamp, addr, 0, 0);
6113       }
6114       break;
6115    }
6116 
6117    case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
6118       genx_batch_emit_pipe_control_write
6119            (batch, device->info, 0, WriteTimestamp, addr, 0,
6120             ANV_PIPE_CS_STALL_BIT);
6121       break;
6122 
6123 #if GFX_VERx10 >= 125
6124    case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
6125       uint32_t dwords[GENX(COMPUTE_WALKER_length)];
6126 
6127       GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
6128             .PostSync = (struct GENX(POSTSYNC_DATA)) {
6129                .Operation = WriteTimestamp,
6130                .DestinationAddress = addr,
6131                .MOCS = anv_mocs(device, NULL, 0),
6132             },
6133          });
6134 
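      /* OR the non-zero dwords into the caller-provided COMPUTE_WALKER
       * packet, adding the post-sync operation without disturbing the fields
       * already packed there.
       */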
6135       for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) {
6136          if (dwords[i])
6137             ((uint32_t *)data)[i] |= dwords[i];
6138       }
6139       break;
6140    }
6141 
6142    case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
6143       uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
6144 
6145       GENX(EXECUTE_INDIRECT_DISPATCH_pack)
6146       (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
6147             .MOCS = anv_mocs(device, NULL, 0),
6148             .COMPUTE_WALKER_BODY = {
6149                .PostSync = (struct GENX(POSTSYNC_DATA)) {
6150                   .Operation = WriteTimestamp,
6151                   .DestinationAddress = addr,
6152                   .MOCS = anv_mocs(device, NULL, 0),
6153                },
6154             }
6155       });
6156 
6157       for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) {
6158          if (dwords[i])
6159             ((uint32_t *)data)[i] |= dwords[i];
6160       }
6161       break;
6162    }
6163 #endif
6164 
6165    default:
6166       unreachable("invalid");
6167    }
6168 }
6169 
6170 void genX(cmd_capture_data)(struct anv_batch *batch,
6171                             struct anv_device *device,
6172                             struct anv_address dst_addr,
6173                             struct anv_address src_addr,
6174                             uint32_t size_B) {
6175    struct mi_builder b;
6176    mi_builder_init(&b, device->info, batch);
6177    mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
6178    mi_memcpy(&b, dst_addr, src_addr, size_B);
6179 }
6180 
6181 void genX(batch_emit_secondary_call)(struct anv_batch *batch,
6182                                      struct anv_device *device,
6183                                      struct anv_address secondary_addr,
6184                                      struct anv_address secondary_return_addr)
6185 {
6186    struct mi_builder b;
6187    mi_builder_init(&b, device->info, batch);
6188    mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
6189    /* Make sure the write into the batch buffer lands before we execute
6190     * the jump.
6191     */
6192    mi_builder_set_write_check(&b, true);
6193 
6194    /* Emit a write to change the return address of the secondary */
6195    struct mi_reloc_imm_token reloc =
6196       mi_store_relocated_imm(&b, mi_mem64(secondary_return_addr));
6197 
6198    /* Ensure the write has landed before the CS reads the address written
6199     * above.
6200     */
6201    mi_ensure_write_fence(&b);
6202 
6203 #if GFX_VER >= 12
6204    /* Disable prefetcher before jumping into a secondary */
6205    anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
6206       arb.PreParserDisableMask = true;
6207       arb.PreParserDisable = true;
6208    }
6209 #endif
6210 
6211    /* Jump into the secondary */
6212    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
6213       bbs.AddressSpaceIndicator = ASI_PPGTT;
6214       bbs.SecondLevelBatchBuffer = Firstlevelbatch;
6215       bbs.BatchBufferStartAddress = secondary_addr;
6216    }
6217 
6218    /* Replace the return address written by the MI_STORE_DATA_IMM above with
6219     * the primary's current batch address (immediately after the jump).
6220     */
6221    mi_relocate_store_imm(reloc,
6222                          anv_address_physical(
6223                             anv_batch_current_address(batch)));
6224 }
6225 
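/* Emits an MI_BATCH_BUFFER_START whose BatchBufferStartAddress is left
 * unprogrammed and returns a pointer to the packet, presumably so that the
 * caller can patch the jump target in afterwards.
 */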
6226 void *
6227 genX(batch_emit_return)(struct anv_batch *batch)
6228 {
6229    return anv_batch_emitn(batch,
6230                           GENX(MI_BATCH_BUFFER_START_length),
6231                           GENX(MI_BATCH_BUFFER_START),
6232                           .AddressSpaceIndicator = ASI_PPGTT,
6233                           .SecondLevelBatchBuffer = Firstlevelbatch);
6234 }
6235 
6236 void
6237 genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
6238                                       const struct anv_device *device,
6239                                       uint32_t primitive_topology,
6240                                       uint32_t vertex_count)
6241 {
6242 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
6243    if (intel_needs_workaround(device->info, 22014412737) &&
6244        (primitive_topology == _3DPRIM_POINTLIST ||
6245         primitive_topology == _3DPRIM_LINELIST ||
6246         primitive_topology == _3DPRIM_LINESTRIP ||
6247         primitive_topology == _3DPRIM_LINELIST_ADJ ||
6248         primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
6249         primitive_topology == _3DPRIM_LINELOOP ||
6250         primitive_topology == _3DPRIM_POINTLIST_BF ||
6251         primitive_topology == _3DPRIM_LINESTRIP_CONT ||
6252         primitive_topology == _3DPRIM_LINESTRIP_BF ||
6253         primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
6254        (vertex_count == 1 || vertex_count == 2)) {
6255       genx_batch_emit_pipe_control_write
6256          (batch, device->info, 0, WriteImmediateData,
6257           device->workaround_address, 0, 0);
6258 
6259       /* Reset counter because we just emitted a PC */
6260       batch->num_3d_primitives_emitted = 0;
6261    } else if (intel_needs_workaround(device->info, 16014538804)) {
6262       batch->num_3d_primitives_emitted++;
6263       /* Wa_16014538804:
6264        *    After every three 3DPRIMITIVE commands, at least one
6265        *    PIPE_CONTROL must be inserted.
6266        */
6267       if (batch->num_3d_primitives_emitted == 3) {
6268          anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
6269          batch->num_3d_primitives_emitted = 0;
6270       }
6271    }
6272 #endif
6273 }
6274 
6275 /* Wa_16018063123 */
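/* Emits a minimal (1x4 pixel, linear) XY_FAST_COLOR_BLT targeting the
 * workaround BO; callers use it before MI_FLUSH_DW on affected platforms.
 */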
6276 ALWAYS_INLINE void
6277 genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
6278                                       struct anv_device *device)
6279 {
6280 #if GFX_VERx10 >= 125
6281    anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6282       blt.DestinationBaseAddress = device->workaround_address;
6283       blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
6284       blt.DestinationPitch = 63;
6285       blt.DestinationX2 = 1;
6286       blt.DestinationY2 = 4;
6287       blt.DestinationSurfaceWidth = 1;
6288       blt.DestinationSurfaceHeight = 4;
6289       blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6290       blt.DestinationSurfaceQPitch = 4;
6291       blt.DestinationTiling = XY_TILE_LINEAR;
6292    }
6293 #endif
6294 }
6295 
6296 void
6297 genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
6298                      const struct intel_urb_config *urb_cfg)
6299 {
6300 #if INTEL_NEEDS_WA_16014912113
6301    const struct intel_urb_config *current =
6302       &cmd_buffer->state.gfx.urb_cfg;
6303    if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
6304        current->size[0] != 0) {
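      /* Re-emit the previous URB configuration (256 entries for the VS, none
       * for the other stages), then flush the HDC before the new
       * configuration is programmed.
       */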
6305       for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
6306          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
6307             urb._3DCommandSubOpcode      += i;
6308             urb.VSURBStartingAddress      = current->start[i];
6309             urb.VSURBEntryAllocationSize  = current->size[i] - 1;
6310             urb.VSNumberofURBEntries      = i == 0 ? 256 : 0;
6311          }
6312       }
6313       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6314          pc.HDCPipelineFlushEnable = true;
6315       }
6316    }
6317 #endif
6318 }
6319 
6320 struct anv_state
6321 genX(cmd_buffer_begin_companion_rcs_syncpoint)(
6322       struct anv_cmd_buffer   *cmd_buffer)
6323 {
6324 #if GFX_VERx10 >= 125
6325    const struct intel_device_info *info = cmd_buffer->device->info;
6326    struct anv_state syncpoint =
6327       anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
6328    struct anv_address xcs_wait_addr =
6329       anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
6330    struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
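   /* Syncpoint layout: dword 0 is the semaphore the main (compute/blitter)
    * engine waits on, dword 1 is the semaphore the companion RCS waits on.
    */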
6331 
6332    /* Reset the sync point */
6333    memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
6334 
6335    struct mi_builder b;
6336 
6337    /* On CCS:
6338     *    - flush all caches & invalidate
6339     *    - unblock RCS
6340     *    - wait on RCS to complete
6341     *    - clear the value we waited on
6342     */
6343 
6344    if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
6345       anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_FLUSH_BITS |
6346                                             ANV_PIPE_INVALIDATE_BITS |
6347                                             ANV_PIPE_STALL_BITS,
6348                                 "post main cmd buffer invalidate");
6349       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6350    } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
6351       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6352       if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
6353          genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
6354                                                 cmd_buffer->device);
6355       }
6356       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
6357          fd.FlushCCS = true; /* Maybe handle Flush LLC */
6358       }
6359    }
6360 
6361    {
6362       mi_builder_init(&b, info, &cmd_buffer->batch);
6363       mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
6364       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6365          sem.WaitMode            = PollingMode;
6366          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
6367          sem.SemaphoreDataDword  = 0x1;
6368          sem.SemaphoreAddress    = xcs_wait_addr;
6369       }
6370       /* Make sure to reset the semaphore in case the command buffer is run
6371        * multiple times.
6372        */
6373       mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
6374    }
6375 
6376    /* On RCS:
6377     *    - wait on CCS signal
6378     *    - clear the value we waited on
6379     */
6380    {
6381       mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
6382       anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
6383                      GENX(MI_SEMAPHORE_WAIT),
6384                      sem) {
6385          sem.WaitMode            = PollingMode;
6386          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
6387          sem.SemaphoreDataDword  = 0x1;
6388          sem.SemaphoreAddress    = rcs_wait_addr;
6389       }
6390       /* Make sure to reset the semaphore in case the command buffer is run
6391        * multiple times.
6392        */
6393       mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
6394    }
6395 
6396    return syncpoint;
6397 #else
6398    unreachable("Not implemented");
6399 #endif
6400 }
6401 
6402 void
6403 genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
6404                                              struct anv_state syncpoint)
6405 {
6406 #if GFX_VERx10 >= 125
6407    struct anv_address xcs_wait_addr =
6408       anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
6409 
6410    struct mi_builder b;
6411 
6412    /* On RCS:
6413     *    - flush all caches & invalidate
6414     *    - unblock the CCS
6415     */
6416    anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
6417                              ANV_PIPE_FLUSH_BITS |
6418                              ANV_PIPE_INVALIDATE_BITS |
6419                              ANV_PIPE_STALL_BITS,
6420                              "post rcs flush");
6421    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
6422 
6423    mi_builder_init(&b, cmd_buffer->device->info,
6424                    &cmd_buffer->companion_rcs_cmd_buffer->batch);
6425    mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
6426 #else
6427    unreachable("Not implemented");
6428 #endif
6429 }
6430 
6431 void
6432 genX(write_trtt_entries)(struct anv_async_submit *submit,
6433                          struct anv_trtt_bind *l3l2_binds,
6434                          uint32_t n_l3l2_binds,
6435                          struct anv_trtt_bind *l1_binds,
6436                          uint32_t n_l1_binds)
6437 {
6438 #if GFX_VER >= 12
6439    const struct intel_device_info *devinfo =
6440       submit->queue->device->info;
6441    struct anv_batch *batch = &submit->batch;
6442 
6443    /* BSpec says:
6444     *   "DWord Length programmed must not exceed 0x3FE."
6445     * For a single dword write the programmed length is 2, and for a single
6446     * qword it's 3. This is the value actually written to the command's
6447     * DWord Length field, so it does not include the length bias.
6448     */
6449    uint32_t dword_write_len = 2;
6450    uint32_t qword_write_len = 3;
6451    uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
6452    uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
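   /* For example, with qword_write_len = 3, max_qword_extra_writes is
    * (0x3FE - 3) / 2 = 509, so a single MI_STORE_DATA_IMM can update up to
    * 510 contiguous qword entries (or 1021 dword entries in the dword case).
    */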
6453 
6454    /* What makes the code below quite complicated is the fact that we can
6455     * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
6456     * contiguous addresses.
6457     */
6458 
6459    for (uint32_t i = 0; i < n_l3l2_binds; i++) {
6460       int extra_writes = 0;
6461       for (uint32_t j = i + 1;
6462            j < n_l3l2_binds && extra_writes <= max_qword_extra_writes;
6463            j++) {
6464          if (l3l2_binds[i].pte_addr + (j - i) * 8 == l3l2_binds[j].pte_addr) {
6465             extra_writes++;
6466          } else {
6467             break;
6468          }
6469       }
6470       bool is_last_write = n_l1_binds == 0 &&
6471                            i + extra_writes + 1 == n_l3l2_binds;
6472 
6473       uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
6474                            qword_write_len + (extra_writes * 2);
6475       uint32_t *dw;
6476       dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
6477          .ForceWriteCompletionCheck = is_last_write,
6478          .StoreQword = true,
6479          .Address = anv_address_from_u64(l3l2_binds[i].pte_addr),
6480       );
6481       dw += 3;
6482       for (uint32_t j = 0; j < extra_writes + 1; j++) {
6483          uint64_t entry_addr_64b = l3l2_binds[i + j].entry_addr;
6484          *dw = entry_addr_64b & 0xFFFFFFFF;
6485          dw++;
6486          *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
6487          dw++;
6488       }
6489       assert(dw == batch->next);
6490 
6491       i += extra_writes;
6492    }
6493 
6494    for (uint32_t i = 0; i < n_l1_binds; i++) {
6495       int extra_writes = 0;
6496       for (uint32_t j = i + 1;
6497            j < n_l1_binds && extra_writes <= max_dword_extra_writes;
6498            j++) {
6499          if (l1_binds[i].pte_addr + (j - i) * 4 ==
6500              l1_binds[j].pte_addr) {
6501             extra_writes++;
6502          } else {
6503             break;
6504          }
6505       }
6506 
6507       bool is_last_write = i + extra_writes + 1 == n_l1_binds;
6508 
6509       uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
6510                            dword_write_len + extra_writes;
6511       uint32_t *dw;
6512       dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
6513          .ForceWriteCompletionCheck = is_last_write,
6514          .Address = anv_address_from_u64(l1_binds[i].pte_addr),
6515       );
6516       dw += 3;
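      /* TR-TT L1 entries are 32-bit and (as we understand it) store the
       * target address in 64KB units, hence the >> 16 below.
       */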
6517       for (uint32_t j = 0; j < extra_writes + 1; j++) {
6518          *dw = (l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
6519          dw++;
6520       }
6521       assert(dw == batch->next);
6522 
6523       i += extra_writes;
6524    }
6525 
6526    genx_batch_emit_pipe_control(batch, devinfo, _3D,
6527                                 ANV_PIPE_CS_STALL_BIT |
6528                                 ANV_PIPE_TLB_INVALIDATE_BIT);
6529 #else
6530    unreachable("Not implemented");
6531 #endif
6532 }
6533 
6534 void
6535 genX(async_submit_end)(struct anv_async_submit *submit)
6536 {
6537    struct anv_batch *batch = &submit->batch;
6538    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
6539 }
6540 
6541 void
6542 genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
6543                                VkPipelineStageFlags2 stage,
6544                                VkBuffer dstBuffer,
6545                                VkDeviceSize dstOffset,
6546                                uint32_t marker)
6547 {
6548    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6549    ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
6550 
6551    /* The barriers inserted by the application to make dstBuffer writable
6552     * should already have the L1/L2 cache flushes. On platforms where the
6553     * command streamer is not coherent with L3, we need an additional set of
6554     * cache flushes.
6555     */
6556    enum anv_pipe_bits bits =
6557       (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
6558        (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
6559       ANV_PIPE_END_OF_PIPE_SYNC_BIT;
6560 
6561    trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
6562 
6563    anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
6564    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6565 
6566    struct mi_builder b;
6567    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
6568 
6569    /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
6570     * would be the logical way to implement this extension, as it could
6571     * do a pipelined marker write.  Unfortunately, it requires writing
6572     * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
6573     * 32-bit value.  MI_STORE_DATA_IMM is the only good way to do that,
6574     * and unfortunately it requires stalling.
6575     */
6576    mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
6577                 mi_imm(marker));
6578 
6579    trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
6580 }
6581