1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 #include "ds/intel_tracepoints.h"
36
37 #include "genX_mi_builder.h"
38 #include "genX_cmd_draw_generated_flush.h"
39
40 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
41 uint32_t pipeline);
42
43 static enum anv_pipe_bits
44 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
45 enum anv_pipe_bits bits = 0;
46 bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
47 bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
48 #if GFX_VERx10 >= 125
49 bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
50 #endif
51 #if GFX_VER == 12
52 bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
53 #endif
54 #if GFX_VER >= 12
55 bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
56 #endif
57 bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
58 bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
59 bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
60 bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
61 bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
62 bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
63 bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
64 bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
65 bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
66 #if GFX_VERx10 == 125
67 bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
68 bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
69 #endif
70 return bits;
71 }
72
73 #define anv_debug_dump_pc(pc, reason) \
74 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
75 fputs("pc: emit PC=( ", stdout); \
76 anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout); \
77 fprintf(stdout, ") reason: %s\n", reason); \
78 }
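
/* Illustrative sketch (not part of the driver): how the helper and macro
 * above fit together when a PIPE_CONTROL is filled by hand.  The local
 * variable and the reason string are hypothetical; the real emission paths
 * in this file go through genx_batch_emit_pipe_control().
 *
 *    struct GENX(PIPE_CONTROL) pc = {
 *       GENX(PIPE_CONTROL_header),
 *       .CommandStreamerStallEnable = true,
 *       .RenderTargetCacheFlushEnable = true,
 *    };
 *    anv_debug_dump_pc(pc, "example: RT flush + CS stall");
 *    assert(convert_pc_to_bits(&pc) == (ANV_PIPE_CS_STALL_BIT |
 *                                       ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT));
 */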
79
80 static inline void
81 fill_state_base_addr(struct anv_cmd_buffer *cmd_buffer,
82 struct GENX(STATE_BASE_ADDRESS) *sba)
83 {
84 struct anv_device *device = cmd_buffer->device;
85 const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
86
87 /* If no API entry point has selected the current mode (this can happen if
88 * the first operation recorded in the command buffer does not select one),
89 * use BUFFER if EXT_descriptor_buffer is enabled, otherwise LEGACY.
90 */
91 if (cmd_buffer->state.pending_db_mode ==
92 ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN) {
93 cmd_buffer->state.pending_db_mode =
94 cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer ?
95 ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
96 ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
97 }
98
99 *sba = (struct GENX(STATE_BASE_ADDRESS)) { GENX(STATE_BASE_ADDRESS_header), };
100
101 sba->GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
102 sba->GeneralStateMOCS = mocs;
103 sba->GeneralStateBufferSize = 0xfffff;
104 sba->GeneralStateBaseAddressModifyEnable = true;
105 sba->GeneralStateBufferSizeModifyEnable = true;
106
107 #if GFX_VERx10 == 120
108 /* Since DG2, scratch surfaces have their own surface state with its own
109 * MOCS setting, but prior to that, the MOCS for scratch accesses is
110 * governed by SBA.StatelessDataPortAccessMOCS.
111 */
112 const isl_surf_usage_flags_t protected_usage =
113 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT ?
114 ISL_SURF_USAGE_PROTECTED_BIT : 0;
115 const uint32_t stateless_mocs = isl_mocs(&device->isl_dev, protected_usage, false);
116 #else
117 const uint32_t stateless_mocs = mocs;
118 #endif
119
120 sba->StatelessDataPortAccessMOCS = stateless_mocs;
121
122 #if GFX_VERx10 >= 125
123 sba->SurfaceStateBaseAddress =
124 (struct anv_address) { .offset =
125 device->physical->va.internal_surface_state_pool.addr,
126 };
127 #else
128 sba->SurfaceStateBaseAddress =
129 anv_cmd_buffer_surface_base_address(cmd_buffer);
130 #endif
131 sba->SurfaceStateMOCS = mocs;
132 sba->SurfaceStateBaseAddressModifyEnable = true;
133
134 sba->IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
135 sba->IndirectObjectMOCS = mocs;
136 sba->IndirectObjectBufferSize = 0xfffff;
137 sba->IndirectObjectBaseAddressModifyEnable = true;
138 sba->IndirectObjectBufferSizeModifyEnable = true;
139
140 sba->InstructionBaseAddress =
141 (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
142 sba->InstructionMOCS = mocs;
143 sba->InstructionBufferSize =
144 device->physical->va.instruction_state_pool.size / 4096;
145 sba->InstructionBaseAddressModifyEnable = true;
146 sba->InstructionBuffersizeModifyEnable = true;
147
148 #if GFX_VER >= 11
149 sba->BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
150 sba->BindlessSamplerStateBufferSize = 0;
151 sba->BindlessSamplerStateMOCS = mocs;
152 sba->BindlessSamplerStateBaseAddressModifyEnable = true;
153 #endif
154
155 sba->DynamicStateBaseAddress = (struct anv_address) {
156 .offset = device->physical->va.dynamic_state_pool.addr,
157 };
158 sba->DynamicStateBufferSize =
159 (device->physical->va.dynamic_state_pool.size +
160 device->physical->va.dynamic_visible_pool.size +
161 device->physical->va.push_descriptor_buffer_pool.size) / 4096;
162 sba->DynamicStateMOCS = mocs;
163 sba->DynamicStateBaseAddressModifyEnable = true;
164 sba->DynamicStateBufferSizeModifyEnable = true;
165
166 if (cmd_buffer->state.pending_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
167 #if GFX_VERx10 >= 125
168 sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
169 .offset = device->physical->va.dynamic_visible_pool.addr,
170 };
171 sba->BindlessSurfaceStateSize =
172 (device->physical->va.dynamic_visible_pool.size +
173 device->physical->va.push_descriptor_buffer_pool.size) - 1;
174 sba->BindlessSurfaceStateMOCS = mocs;
175 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
176 #else
177 const uint64_t surfaces_addr =
178 cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
179 cmd_buffer->state.descriptor_buffers.surfaces_address :
180 anv_address_physical(device->workaround_address);
181 const uint64_t surfaces_size =
182 cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
183 MIN2(device->physical->va.dynamic_visible_pool.size -
184 (cmd_buffer->state.descriptor_buffers.surfaces_address -
185 device->physical->va.dynamic_visible_pool.addr),
186 anv_physical_device_bindless_heap_size(device->physical, true)) :
187 (device->workaround_bo->size - device->workaround_address.offset);
188 sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
189 .offset = surfaces_addr,
190 };
191 sba->BindlessSurfaceStateSize = surfaces_size / ANV_SURFACE_STATE_SIZE - 1;
192 sba->BindlessSurfaceStateMOCS = mocs;
193 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
194 #endif /* GFX_VERx10 < 125 */
195 } else if (!device->physical->indirect_descriptors) {
196 #if GFX_VERx10 >= 125
197 sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
198 .offset = device->physical->va.internal_surface_state_pool.addr,
199 };
200 sba->BindlessSurfaceStateSize =
201 (device->physical->va.internal_surface_state_pool.size +
202 device->physical->va.bindless_surface_state_pool.size) - 1;
203 sba->BindlessSurfaceStateMOCS = mocs;
204 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
205 #else
206 unreachable("Direct descriptor not supported");
207 #endif
208 } else {
209 sba->BindlessSurfaceStateBaseAddress =
210 (struct anv_address) { .offset =
211 device->physical->va.bindless_surface_state_pool.addr,
212 };
213 sba->BindlessSurfaceStateSize =
214 anv_physical_device_bindless_heap_size(device->physical, false) /
215 ANV_SURFACE_STATE_SIZE - 1;
216 sba->BindlessSurfaceStateMOCS = mocs;
217 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
218 }
219
220 #if GFX_VERx10 >= 125
221 sba->L1CacheControl = L1CC_WB;
222 #endif
223 }
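
/* Minimal usage sketch (assumes a valid cmd_buffer; error handling omitted):
 * fill_state_base_addr() only computes the packet, the caller still has to
 * emit it, exactly as genX(cmd_buffer_emit_state_base_address)() does below.
 *
 *    struct GENX(STATE_BASE_ADDRESS) sba = {};
 *    fill_state_base_addr(cmd_buffer, &sba);
 *    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), _sba) {
 *       _sba = sba;
 *    }
 */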
224
225 void
226 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
227 {
228 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
229 anv_cmd_buffer_is_video_queue(cmd_buffer))
230 return;
231
232 struct anv_device *device = cmd_buffer->device;
233
234 struct GENX(STATE_BASE_ADDRESS) sba = {};
235 fill_state_base_addr(cmd_buffer, &sba);
236
237 #if GFX_VERx10 >= 125
238 struct mi_builder b;
239 mi_builder_init(&b, device->info, &cmd_buffer->batch);
240 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
241 struct mi_goto_target t = MI_GOTO_TARGET_INIT;
242 mi_goto_if(&b,
243 mi_ieq(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
244 mi_imm(sba.BindlessSurfaceStateBaseAddress.offset)),
245 &t);
246 #endif
247
248 /* Emit a render target cache flush.
249 *
250 * This isn't documented anywhere in the PRM. However, it seems to be
251 * necessary prior to changing the surface state base address. Without
252 * this, we get GPU hangs when using multi-level command buffers which
253 * clear depth, reset state base address, and then go render stuff.
254 */
255 genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
256 cmd_buffer->state.current_pipeline,
257 #if GFX_VER >= 12
258 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
259 #else
260 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
261 #endif
262 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
263 ANV_PIPE_CS_STALL_BIT);
264
265 #if INTEL_NEEDS_WA_1607854226
266 /* Wa_1607854226:
267 *
268 * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
269 * mode by putting the pipeline temporarily in 3D mode.
270 */
271 uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
272 genX(flush_pipeline_select_3d)(cmd_buffer);
273 #endif
274
275 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), _sba) {
276 _sba = sba;
277 }
278
279 if (cmd_buffer->state.current_db_mode != cmd_buffer->state.pending_db_mode)
280 cmd_buffer->state.current_db_mode = cmd_buffer->state.pending_db_mode;
281
282 #if INTEL_NEEDS_WA_1607854226
283 /* Wa_1607854226:
284 *
285 * Put the pipeline back into its current mode.
286 */
287 if (gfx12_wa_pipeline != UINT32_MAX)
288 genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
289 #endif
290
291 /* After re-setting the surface state base address, we have to do some
292 * cache flushing so that the sampler engine will pick up the new
293 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
294 * Shared Function > 3D Sampler > State > State Caching (page 96):
295 *
296 * Coherency with system memory in the state cache, like the texture
297 * cache is handled partially by software. It is expected that the
298 * command stream or shader will issue Cache Flush operation or
299 * Cache_Flush sampler message to ensure that the L1 cache remains
300 * coherent with system memory.
301 *
302 * [...]
303 *
304 * Whenever the value of the Dynamic_State_Base_Addr,
305 * Surface_State_Base_Addr are altered, the L1 state cache must be
306 * invalidated to ensure the new surface or sampler state is fetched
307 * from system memory.
308 *
309 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
310 * which, according to the PIPE_CONTROL instruction documentation in the
311 * Broadwell PRM:
312 *
313 * Setting this bit is independent of any other bit in this packet.
314 * This bit controls the invalidation of the L1 and L2 state caches
315 * at the top of the pipe i.e. at the parsing time.
316 *
317 * Unfortunately, experimentation seems to indicate that state cache
318 * invalidation through a PIPE_CONTROL does nothing whatsoever with
319 * regard to surface state and binding tables. Instead, it seems that
320 * invalidating the texture cache is what is actually needed.
321 *
322 * XXX: As far as we have been able to determine through
323 * experimentation, flushing the texture cache appears to be
324 * sufficient. The theory here is that all of the sampling/rendering
325 * units cache the binding table in the texture cache. However, we have
326 * yet to be able to actually confirm this.
327 *
328 * Wa_14013910100:
329 *
330 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
331 * or program pipe control with Instruction cache invalidate post
332 * STATE_BASE_ADDRESS command"
333 */
334 enum anv_pipe_bits bits =
335 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
336 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
337 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
338 (intel_needs_workaround(device->info, 16013000631) ?
339 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0);
340
341 #if GFX_VER >= 9 && GFX_VER <= 11
342 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
343 *
344 * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
345 * always set for GPGPU workloads when “Texture Cache Invalidation
346 * Enable” bit is set".
347 *
348 * Workaround stopped appearing in TGL PRMs.
349 */
350 if (cmd_buffer->state.current_pipeline == GPGPU)
351 bits |= ANV_PIPE_CS_STALL_BIT;
352 #endif
353 genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
354 cmd_buffer->state.current_pipeline,
355 bits);
356
357 assert(cmd_buffer->state.current_db_mode !=
358 ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
359
360 #if GFX_VERx10 >= 125
361 assert(sba.BindlessSurfaceStateBaseAddress.offset != 0);
362 mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
363 mi_imm(sba.BindlessSurfaceStateBaseAddress.offset));
364
365 mi_goto_target(&b, &t);
366 #endif
367
368 #if GFX_VERx10 >= 125
369 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
370 #endif
371
372 /* If we have emitted a new state base address we probably need to re-emit
373 * binding tables.
374 */
375 cmd_buffer->state.descriptors_dirty |= ~0;
376 }
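
/* Sketch of the conditional-skip idiom used above on GFX_VERx10 >= 125
 * (illustrative only; "b" is an initialized mi_builder and "new_value" is a
 * stand-in): a scratch register caches the last programmed bindless base so
 * the whole STATE_BASE_ADDRESS sequence can be jumped over when nothing
 * changed.
 *
 *    struct mi_goto_target t = MI_GOTO_TARGET_INIT;
 *    mi_goto_if(&b, mi_ieq(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
 *                          mi_imm(new_value)), &t);
 *    ...emit the expensive state...
 *    mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
 *             mi_imm(new_value));
 *    mi_goto_target(&b, &t);
 */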
377
378 void
379 genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer)
380 {
381 if (!anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer))
382 return;
383
384 /* If we are emitting a new state base address we probably need to re-emit
385 * binding tables.
386 */
387 cmd_buffer->state.descriptors_dirty |= ~0;
388
389 #if GFX_VERx10 >= 125
390 struct anv_device *device = cmd_buffer->device;
391 const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
392
393 genx_batch_emit_pipe_control(&cmd_buffer->batch,
394 cmd_buffer->device->info,
395 cmd_buffer->state.current_pipeline,
396 ANV_PIPE_CS_STALL_BIT);
397 anv_batch_emit(
398 &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
399 btpa.BindingTablePoolBaseAddress =
400 anv_cmd_buffer_surface_base_address(cmd_buffer);
401 btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
402 btpa.MOCS = mocs;
403 }
404
405 genx_batch_emit_pipe_control(&cmd_buffer->batch,
406 cmd_buffer->device->info,
407 cmd_buffer->state.current_pipeline,
408 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
409 #else /* GFX_VERx10 < 125 */
410 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
411 #endif
412 }
413
414 static void
415 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
416 struct anv_address addr)
417 {
418 VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
419 addr.bo);
420
421 if (unlikely(result != VK_SUCCESS))
422 anv_batch_set_error(&cmd_buffer->batch, result);
423 }
424
425 static void
426 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
427 const struct anv_surface_state *state)
428 {
429 assert(!anv_address_is_null(state->address));
430 add_surface_reloc(cmd_buffer, state->address);
431
432 if (!anv_address_is_null(state->aux_address)) {
433 VkResult result =
434 anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
435 state->aux_address.bo);
436 if (result != VK_SUCCESS)
437 anv_batch_set_error(&cmd_buffer->batch, result);
438 }
439
440 if (!anv_address_is_null(state->clear_address)) {
441 VkResult result =
442 anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
443 state->clear_address.bo);
444 if (result != VK_SUCCESS)
445 anv_batch_set_error(&cmd_buffer->batch, result);
446 }
447 }
448
449 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
450 * the initial layout is undefined, the HiZ buffer and depth buffer will
451 * represent the same data at the end of this operation.
452 */
453 static void
454 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
455 const struct anv_image *image,
456 uint32_t base_level, uint32_t level_count,
457 uint32_t base_layer, uint32_t layer_count,
458 VkImageLayout initial_layout,
459 VkImageLayout final_layout,
460 bool will_full_fast_clear)
461 {
462 const uint32_t depth_plane =
463 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
464 if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
465 return;
466
467 /* Initialize the indirect clear color prior to first use. */
468 const struct anv_address clear_color_addr =
469 anv_image_get_clear_color_addr(cmd_buffer->device, image,
470 VK_IMAGE_ASPECT_DEPTH_BIT);
471 if (!anv_address_is_null(clear_color_addr) &&
472 (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
473 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
474 const enum isl_format depth_format =
475 image->planes[depth_plane].primary_surface.isl.format;
476 const union isl_color_value clear_value =
477 anv_image_hiz_clear_value(image);
478
479 uint32_t depth_value[4] = {};
480 isl_color_value_pack(&clear_value, depth_format, depth_value);
481
482 const uint32_t clear_pixel_offset = clear_color_addr.offset +
483 isl_get_sampler_clear_field_offset(cmd_buffer->device->info,
484 depth_format);
485 const struct anv_address clear_pixel_addr = {
486 .bo = clear_color_addr.bo,
487 .offset = clear_pixel_offset,
488 };
489
490 struct mi_builder b;
491 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
492 mi_builder_set_write_check(&b, true);
493 mi_store(&b, mi_mem32(clear_pixel_addr), mi_imm(depth_value[0]));
494 }
495
496 /* If will_full_fast_clear is set, the caller promises to fast-clear the
497 * largest portion of the specified range as it can.
498 */
499 if (will_full_fast_clear)
500 return;
501
502 const enum isl_aux_state initial_state =
503 anv_layout_to_aux_state(cmd_buffer->device->info, image,
504 VK_IMAGE_ASPECT_DEPTH_BIT,
505 initial_layout,
506 cmd_buffer->queue_family->queueFlags);
507 const enum isl_aux_state final_state =
508 anv_layout_to_aux_state(cmd_buffer->device->info, image,
509 VK_IMAGE_ASPECT_DEPTH_BIT,
510 final_layout,
511 cmd_buffer->queue_family->queueFlags);
512
513 const bool initial_depth_valid =
514 isl_aux_state_has_valid_primary(initial_state);
515 const bool initial_hiz_valid =
516 isl_aux_state_has_valid_aux(initial_state);
517 const bool final_needs_depth =
518 isl_aux_state_has_valid_primary(final_state);
519 const bool final_needs_hiz =
520 isl_aux_state_has_valid_aux(final_state);
521
522 /* Getting into the pass-through state for Depth is tricky and involves
523 * both a resolve and an ambiguate. We don't handle that state right now
524 * as anv_layout_to_aux_state never returns it.
525 */
526 assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
527
528 enum isl_aux_op hiz_op = ISL_AUX_OP_NONE;
529 if (final_needs_depth && !initial_depth_valid) {
530 assert(initial_hiz_valid);
531 hiz_op = ISL_AUX_OP_FULL_RESOLVE;
532 } else if (final_needs_hiz && !initial_hiz_valid) {
533 assert(initial_depth_valid);
534 hiz_op = ISL_AUX_OP_AMBIGUATE;
535 }
536
537 if (hiz_op != ISL_AUX_OP_NONE) {
538 for (uint32_t l = 0; l < level_count; l++) {
539 const uint32_t level = base_level + l;
540
541 uint32_t aux_layers =
542 anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level);
543 if (base_layer >= aux_layers)
544 break; /* We will only get fewer layers as level increases */
545 uint32_t level_layer_count =
546 MIN2(layer_count, aux_layers - base_layer);
547
548 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
549 level, base_layer, level_layer_count, hiz_op);
550 }
551 }
552
553 /* Additional tile cache flush for MTL:
554 *
555 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
556 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
557 */
558 if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
559 image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
560 final_needs_depth && !initial_depth_valid) {
561 anv_add_pending_pipe_bits(cmd_buffer,
562 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
563 "HIZ-CCS flush");
564 }
565 }
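
/* Hypothetical call-site sketch (the real callers live in the barrier
 * handling code outside this section): transition every level and layer of
 * the depth aspect out of an undefined layout.  The final "false" is
 * will_full_fast_clear.
 *
 *    transition_depth_buffer(cmd_buffer, image,
 *                            0, image->vk.mip_levels,
 *                            0, image->vk.array_layers,
 *                            VK_IMAGE_LAYOUT_UNDEFINED,
 *                            VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
 *                            false);
 */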
566
567 /* Transitions a stencil buffer from one layout to another. On gfx12, when
568 * the initial layout is undefined and the stencil buffer is compressed, the
569 * stencil data (and with it the CCS) is initialized via a HiZ-op stencil
570 * clear, as required by the Bspec.
571 */
571 static void
572 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
573 const struct anv_image *image,
574 uint32_t base_level, uint32_t level_count,
575 uint32_t base_layer, uint32_t layer_count,
576 VkImageLayout initial_layout,
577 VkImageLayout final_layout,
578 bool will_full_fast_clear)
579 {
580 #if GFX_VER == 12
581 const uint32_t plane =
582 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
583 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
584 return;
585
586 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
587 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
588 cmd_buffer->device->info->has_aux_map) {
589 /* If will_full_fast_clear is set, the caller promises to fast-clear the
590 * largest portion of the specified range as it can.
591 */
592 if (will_full_fast_clear)
593 return;
594
595 for (uint32_t l = 0; l < level_count; l++) {
596 const uint32_t level = base_level + l;
597 const VkRect2D clear_rect = {
598 .offset.x = 0,
599 .offset.y = 0,
600 .extent.width = u_minify(image->vk.extent.width, level),
601 .extent.height = u_minify(image->vk.extent.height, level),
602 };
603
604 uint32_t aux_layers =
605 anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
606
607 if (base_layer >= aux_layers)
608 break; /* We will only get fewer layers as level increases */
609
610 uint32_t level_layer_count =
611 MIN2(layer_count, aux_layers - base_layer);
612
613 /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
614 * Enable:
615 *
616 * "When enabled, Stencil Buffer needs to be initialized via
617 * stencil clear (HZ_OP) before any renderpass."
618 */
619 const VkClearDepthStencilValue clear_value = {};
620 anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
621 level, base_layer, level_layer_count,
622 clear_rect, &clear_value);
623 }
624 }
625
626 /* Additional tile cache flush for MTL:
627 *
628 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
629 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
630 */
631 if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
632 anv_add_pending_pipe_bits(cmd_buffer,
633 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
634 "HIZ-CCS flush");
635 }
636 #endif
637 }
638
639 #define MI_PREDICATE_SRC0 0x2400
640 #define MI_PREDICATE_SRC1 0x2408
641 #define MI_PREDICATE_RESULT 0x2418
642
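/* Sketch of how these predicate registers are used further down (see
 * anv_cmd_compute_resolve_predicate()).  Illustrative only; "b" is an
 * initialized mi_builder and "value" is a stand-in mi_value.  SRC1 is set to
 * zero and LOAD_LOADINV with COMPARE_SRCS_EQUAL makes the predicate true
 * when SRC0 != 0, i.e. a "!= 0" condition.
 *
 *    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), value);
 *    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
 *    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
 *       mip.LoadOperation    = LOAD_LOADINV;
 *       mip.CombineOperation = COMBINE_SET;
 *       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 *    }
 */
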
643 static void
644 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
645 const struct anv_image *image,
646 VkImageAspectFlagBits aspect,
647 uint32_t level,
648 uint32_t base_layer, uint32_t layer_count,
649 bool compressed)
650 {
651 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
652
653 /* We only have compression tracking for CCS_E */
654 if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
655 return;
656
657 struct anv_device *device = cmd_buffer->device;
658 struct mi_builder b;
659 mi_builder_init(&b, device->info, &cmd_buffer->batch);
660 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
661
662 for (uint32_t a = 0; a < layer_count; a++) {
663 uint32_t layer = base_layer + a;
664 struct anv_address comp_state_addr =
665 anv_image_get_compression_state_addr(device,
666 image, aspect,
667 level, layer);
668 mi_store(&b, mi_mem32(comp_state_addr),
669 mi_imm(compressed ? UINT32_MAX : 0));
670 }
671
672 /* FCV_CCS_E images are automatically fast cleared to the default value at
673 * render time. In order to account for this, anv should set the
674 * appropriate fast clear state for level0/layer0.
675 *
676 * At the moment, tracking the fast clear state for higher levels/layers is
677 * neither supported, nor do we enter a situation where it is a concern.
678 */
679 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
680 base_layer == 0 && level == 0) {
681 struct anv_address fc_type_addr =
682 anv_image_get_fast_clear_type_addr(device, image, aspect);
683 mi_store(&b, mi_mem32(fc_type_addr),
684 mi_imm(ANV_FAST_CLEAR_DEFAULT_VALUE));
685 }
686 }
687
688 static void
689 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
690 const struct anv_image *image,
691 VkImageAspectFlagBits aspect,
692 enum anv_fast_clear_type fast_clear)
693 {
694 struct anv_device *device = cmd_buffer->device;
695 struct mi_builder b;
696 mi_builder_init(&b, device->info, &cmd_buffer->batch);
697 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
698
699 struct anv_address fc_type_addr =
700 anv_image_get_fast_clear_type_addr(device, image, aspect);
701 mi_store(&b, mi_mem32(fc_type_addr), mi_imm(fast_clear));
702
703 /* Whenever we have fast-clear, we consider that slice to be compressed.
704 * This makes building predicates much easier.
705 */
706 if (fast_clear != ANV_FAST_CLEAR_NONE)
707 set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
708 }
709
710 /* This is only really practical on haswell and above because it requires
711 * MI math in order to get it correct.
712 */
713 static void
714 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
715 const struct anv_image *image,
716 VkImageAspectFlagBits aspect,
717 uint32_t level, uint32_t array_layer,
718 enum isl_aux_op resolve_op,
719 enum anv_fast_clear_type fast_clear_supported)
720 {
721 struct anv_device *device = cmd_buffer->device;
722 struct anv_address addr =
723 anv_image_get_fast_clear_type_addr(device, image, aspect);
724 struct mi_builder b;
725 mi_builder_init(&b, device->info, &cmd_buffer->batch);
726 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
727
728 const struct mi_value fast_clear_type = mi_mem32(addr);
729
730 if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
731 /* In this case, we're doing a full resolve which means we want the
732 * resolve to happen if any compression (including fast-clears) is
733 * present.
734 *
735 * In order to simplify the logic a bit, we make the assumption that,
736 * if the first slice has been fast-cleared, it is also marked as
737 * compressed. See also set_image_fast_clear_state.
738 */
739 const struct mi_value compression_state =
740 mi_mem32(anv_image_get_compression_state_addr(device,
741 image, aspect,
742 level, array_layer));
743 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
744 mi_store(&b, compression_state, mi_imm(0));
745
746 if (level == 0 && array_layer == 0) {
747 /* If the predicate is true, we want to write 0 to the fast clear type
748 * and, if it's false, leave it alone. We can do this by writing
749 *
750 * clear_type = clear_type & ~predicate;
751 */
752 struct mi_value new_fast_clear_type =
753 mi_iand(&b, fast_clear_type,
754 mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
755 mi_store(&b, fast_clear_type, new_fast_clear_type);
756 }
757 } else if (level == 0 && array_layer == 0) {
758 /* In this case, we are doing a partial resolve to get rid of fast-clear
759 * colors. We don't care about the compression state but we do care
760 * about how much fast clear is allowed by the final layout.
761 */
762 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
763 assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
764
765 /* We need to compute (fast_clear_supported < image->fast_clear) */
766 struct mi_value pred =
767 mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
768 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
769
770 /* If the predicate is true, we want to write 0 to the fast clear type
771 * and, if it's false, leave it alone. We can do this by writing
772 *
773 * clear_type = clear_type & ~predicate;
774 */
775 struct mi_value new_fast_clear_type =
776 mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
777 mi_store(&b, fast_clear_type, new_fast_clear_type);
778 } else {
779 /* In this case, we're trying to do a partial resolve on a slice that
780 * doesn't have clear color. There's nothing to do.
781 */
782 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
783 return;
784 }
785
786 /* Set src1 to 0 and use a != condition */
787 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
788
789 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
790 mip.LoadOperation = LOAD_LOADINV;
791 mip.CombineOperation = COMBINE_SET;
792 mip.CompareOperation = COMPARE_SRCS_EQUAL;
793 }
794 }
795
796 static void
797 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
798 const struct anv_image *image,
799 enum isl_format format,
800 struct isl_swizzle swizzle,
801 VkImageAspectFlagBits aspect,
802 uint32_t level, uint32_t array_layer,
803 enum isl_aux_op resolve_op,
804 enum anv_fast_clear_type fast_clear_supported)
805 {
806 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
807
808 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
809 aspect, level, array_layer,
810 resolve_op, fast_clear_supported);
811
812 /* CCS_D only supports full resolves and BLORP will assert on us if we try
813 * to do a partial resolve on a CCS_D surface.
814 */
815 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
816 image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
817 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
818
819 anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
820 level, array_layer, 1, resolve_op, NULL, true);
821 }
822
823 static void
824 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
825 const struct anv_image *image,
826 enum isl_format format,
827 struct isl_swizzle swizzle,
828 VkImageAspectFlagBits aspect,
829 uint32_t array_layer,
830 enum isl_aux_op resolve_op,
831 enum anv_fast_clear_type fast_clear_supported)
832 {
833 assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
834 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
835
836 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
837 aspect, 0, array_layer,
838 resolve_op, fast_clear_supported);
839
840 anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
841 array_layer, 1, resolve_op, NULL, true);
842 }
843
844 void
845 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
846 const struct anv_image *image,
847 VkImageAspectFlagBits aspect,
848 enum isl_aux_usage aux_usage,
849 uint32_t level,
850 uint32_t base_layer,
851 uint32_t layer_count)
852 {
853 #if GFX_VER < 20
854 /* The aspect must be exactly one of the image aspects. */
855 assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
856
857 /* Filter out aux usages that don't have any compression tracking.
858 * Note: We only have compression tracking for CCS_E images, but it's
859 * possible for a CCS_E enabled image to have a subresource with a different
860 * aux usage.
861 */
862 if (!isl_aux_usage_has_compression(aux_usage))
863 return;
864
865 set_image_compressed_bit(cmd_buffer, image, aspect,
866 level, base_layer, layer_count, true);
867 #endif
868 }
869
870 /* Copy the fast-clear value dword(s) from an image's fast clear state
871 * buffer into a surface state object.
872 */
873 void
874 genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
875 struct anv_state surface_state,
876 const struct anv_image *image)
877 {
878 #if GFX_VER < 10
879 assert(cmd_buffer && image);
880 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
881
882 struct anv_address ss_clear_addr =
883 anv_state_pool_state_address(
884 &cmd_buffer->device->internal_surface_state_pool,
885 (struct anv_state) {
886 .offset = surface_state.offset +
887 cmd_buffer->device->isl_dev.ss.clear_value_offset
888 });
889 const struct anv_address entry_addr =
890 anv_image_get_clear_color_addr(cmd_buffer->device, image,
891 VK_IMAGE_ASPECT_COLOR_BIT);
892 unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
893
894 struct mi_builder b;
895 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
896 mi_builder_set_write_check(&b, true);
897
898 mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
899
900 /* Updating a surface state object may require that the state cache be
901 * invalidated. From the SKL PRM, Shared Functions -> State -> State
902 * Caching:
903 *
904 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
905 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
906 * modified [...], the L1 state cache must be invalidated to ensure
907 * the new surface or sampler state is fetched from system memory.
908 *
909 * In testing, SKL doesn't actually seem to need this, but HSW does.
910 */
911 anv_add_pending_pipe_bits(cmd_buffer,
912 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
913 "after load_image_clear_color surface state update");
914 #endif
915 }
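
/* Two flush idioms appear throughout this file; sketch only (both calls use
 * helpers already used above, nothing new is introduced).  Immediate
 * emission is used when the flush must be ordered against the packets being
 * built right now; deferred accumulation when it only needs to land before
 * the next draw/dispatch.
 *
 *    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
 *                                 cmd_buffer->state.current_pipeline,
 *                                 ANV_PIPE_CS_STALL_BIT);
 *
 *    anv_add_pending_pipe_bits(cmd_buffer,
 *                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
 *                              "reason string shown in debug output");
 */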
916
917 static void
918 set_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
919 const struct anv_image *image,
920 const VkImageAspectFlags aspect,
921 const uint32_t *pixel)
922 {
923 UNUSED struct anv_batch *batch = &cmd_buffer->batch;
924 uint32_t plane = anv_image_aspect_to_plane(image, aspect);
925 enum isl_format format = image->planes[plane].primary_surface.isl.format;
926
927 union isl_color_value clear_color;
928 isl_color_value_unpack(&clear_color, format, pixel);
929
930 struct anv_address addr =
931 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
932 assert(!anv_address_is_null(addr));
933
934 #if GFX_VER >= 20
935 assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 0);
936 assert(cmd_buffer->device->isl_dev.ss.clear_value_size == 0);
937 unreachable("storing clear colors on invalid gfx_ver" );
938 #elif GFX_VER >= 11
939 assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
940 uint32_t *dw = anv_batch_emitn(batch, 3 + 6, GENX(MI_STORE_DATA_IMM),
941 .StoreQword = true, .Address = addr);
942 dw[3] = clear_color.u32[0];
943 dw[4] = clear_color.u32[1];
944 dw[5] = clear_color.u32[2];
945 dw[6] = clear_color.u32[3];
946 dw[7] = pixel[0];
947 dw[8] = pixel[1];
948 #else
949 assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 0);
950 assert(cmd_buffer->device->isl_dev.ss.clear_value_size == 16);
951 uint32_t *dw = anv_batch_emitn(batch, 3 + 4, GENX(MI_STORE_DATA_IMM),
952 .StoreQword = true, .Address = addr);
953 dw[3] = clear_color.u32[0];
954 dw[4] = clear_color.u32[1];
955 dw[5] = clear_color.u32[2];
956 dw[6] = clear_color.u32[3];
957 #endif
958 }
959
960 void
961 genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
962 const struct anv_image *image,
963 const enum isl_format format,
964 union isl_color_value clear_color)
965 {
966 uint32_t pixel[4] = {};
967 isl_color_value_pack(&clear_color, format, pixel);
968 set_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, pixel);
969
970 if (isl_color_value_is_zero(clear_color, format)) {
971 /* This image has the auxiliary buffer enabled. We can mark the
972 * subresource as not needing a resolve because the clear color
973 * will match what's in every RENDER_SURFACE_STATE object when
974 * it's being used for sampling.
975 */
976 set_image_fast_clear_state(cmd_buffer, image,
977 VK_IMAGE_ASPECT_COLOR_BIT,
978 ANV_FAST_CLEAR_DEFAULT_VALUE);
979 } else {
980 set_image_fast_clear_state(cmd_buffer, image,
981 VK_IMAGE_ASPECT_COLOR_BIT,
982 ANV_FAST_CLEAR_ANY);
983 }
984 }
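
/* Hypothetical caller sketch: pack an opaque-black clear color and record it
 * as the image's fast-clear value.  Real callers pass the format actually
 * used for the clear, which may differ from plane 0's primary format.
 *
 *    union isl_color_value cc = { .f32 = { 0.0f, 0.0f, 0.0f, 1.0f } };
 *    genX(set_fast_clear_state)(cmd_buffer, image,
 *                               image->planes[0].primary_surface.isl.format,
 *                               cc);
 */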
985
986 /**
987 * @brief Transitions a color buffer from one layout to another.
988 *
989 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
990 * more information.
991 *
992 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
993 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
994 * this represents the maximum layers to transition at each
995 * specified miplevel.
996 */
997 static void
998 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
999 const struct anv_image *image,
1000 VkImageAspectFlagBits aspect,
1001 const uint32_t base_level, uint32_t level_count,
1002 uint32_t base_layer, uint32_t layer_count,
1003 VkImageLayout initial_layout,
1004 VkImageLayout final_layout,
1005 uint32_t src_queue_family,
1006 uint32_t dst_queue_family,
1007 bool will_full_fast_clear)
1008 {
1009 struct anv_device *device = cmd_buffer->device;
1010 const struct intel_device_info *devinfo = device->info;
1011 /* Validate the inputs. */
1012 assert(cmd_buffer);
1013 assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1014 /* These values aren't supported for simplicity's sake. */
1015 assert(level_count != VK_REMAINING_MIP_LEVELS &&
1016 layer_count != VK_REMAINING_ARRAY_LAYERS);
1017 /* Ensure the subresource range is valid. */
1018 UNUSED uint64_t last_level_num = base_level + level_count;
1019 const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
1020 UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1021 assert((uint64_t)base_layer + layer_count <= image_layers);
1022 assert(last_level_num <= image->vk.mip_levels);
1023 /* If there is a layout transition, the final layout cannot be undefined or
1024 * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1025 */
1026 assert(initial_layout == final_layout ||
1027 (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1028 final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1029 const struct isl_drm_modifier_info *isl_mod_info =
1030 image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1031 ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1032 : NULL;
1033
1034 const bool src_queue_external =
1035 src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1036 src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1037
1038 const bool dst_queue_external =
1039 dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1040 dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1041
1042 /* If the queues are external, consider the first queue family flags
1043 * (should be the most capable)
1044 */
1045 const VkQueueFlagBits src_queue_flags =
1046 device->physical->queue.families[
1047 (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
1048 0 : src_queue_family].queueFlags;
1049 const VkQueueFlagBits dst_queue_flags =
1050 device->physical->queue.families[
1051 (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
1052 0 : dst_queue_family].queueFlags;
1053
1054 /* Simultaneous acquire and release on external queues is illegal. */
1055 assert(!src_queue_external || !dst_queue_external);
1056
1057 /* Ownership transition on an external queue requires special action if the
1058 * image has a DRM format modifier because we store image data in
1059 * a driver-private bo which is inaccessible to the external queue.
1060 */
1061 const bool private_binding_acquire =
1062 src_queue_external &&
1063 anv_image_is_externally_shared(image) &&
1064 anv_image_has_private_binding(image);
1065
1066 const bool private_binding_release =
1067 dst_queue_external &&
1068 anv_image_is_externally_shared(image) &&
1069 anv_image_has_private_binding(image);
1070
1071 if (initial_layout == final_layout &&
1072 !private_binding_acquire && !private_binding_release) {
1073 /* No work is needed. */
1074 return;
1075 }
1076
1077 /**
1078 * Section 7.7.4 of the Vulkan 1.3.260 spec says:
1079 *
1080 * If the transfer is via an image memory barrier, and an image layout
1081 * transition is desired, then the values of oldLayout and newLayout in the
1082 * release operation's memory barrier must be equal to values of oldLayout
1083 * and newLayout in the acquire operation's memory barrier. Although the
1084 * image layout transition is submitted twice, it will only be executed
1085 * once. A layout transition specified in this way happens-after the
1086 * release operation and happens-before the acquire operation.
1087 *
1088 * Because we know that we get matching transitions on each queue, we choose
1089 * to only do the work on one queue type: RENDER. In the cases where we do
1090 * transitions between COMPUTE & TRANSFER, we should have matching
1091 * aux/fast_clear values which would trigger no work in the code below.
1092 */
1093 if (!(src_queue_external || dst_queue_external) &&
1094 src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
1095 dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
1096 src_queue_family != dst_queue_family) {
1097 enum intel_engine_class src_engine =
1098 cmd_buffer->queue_family->engine_class;
1099 if (src_engine != INTEL_ENGINE_CLASS_RENDER)
1100 return;
1101 }
1102
1103 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1104
1105 if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1106 return;
1107
1108 enum isl_aux_usage initial_aux_usage =
1109 anv_layout_to_aux_usage(devinfo, image, aspect, 0,
1110 initial_layout, src_queue_flags);
1111 enum isl_aux_usage final_aux_usage =
1112 anv_layout_to_aux_usage(devinfo, image, aspect, 0,
1113 final_layout, dst_queue_flags);
1114 enum anv_fast_clear_type initial_fast_clear =
1115 anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
1116 src_queue_flags);
1117 enum anv_fast_clear_type final_fast_clear =
1118 anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
1119 dst_queue_flags);
1120
1121 /* We must override the anv_layout_to_* functions because they are unaware
1122 * of acquire/release direction.
1123 */
1124 if (private_binding_acquire) {
1125 initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
1126 image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
1127 initial_fast_clear = isl_mod_info->supports_clear_color ?
1128 initial_fast_clear : ANV_FAST_CLEAR_NONE;
1129 } else if (private_binding_release) {
1130 final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
1131 image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
1132 final_fast_clear = isl_mod_info->supports_clear_color ?
1133 final_fast_clear : ANV_FAST_CLEAR_NONE;
1134 }
1135
1136 assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1137
1138 /* The following layouts are equivalent for non-linear images. */
1139 const bool initial_layout_undefined =
1140 initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1141 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1142
1143 bool must_init_fast_clear_state = false;
1144 bool must_init_aux_surface = false;
1145
1146 if (initial_layout_undefined) {
1147 /* The subresource may have been aliased and populated with arbitrary
1148 * data, so we should initialize fast-clear state on platforms prior to
1149 * Xe2. Xe2+ platforms don't need it thanks to the new design of fast-
1150 * clear.
1151 */
1152 must_init_fast_clear_state = devinfo->ver < 20;
1153
1154 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
1155 devinfo->has_illegal_ccs_values) {
1156
1157 must_init_aux_surface = true;
1158
1159 } else {
1160 assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
1161
1162 /* We can start using the CCS immediately without ambiguating. The
1163 * two conditions that enable this are:
1164 *
1165 * 1) The device treats all possible CCS values as legal. In other
1166 * words, we can't confuse the hardware with random bits in the
1167 * CCS.
1168 *
1169 * 2) We enable compression on all writable image layouts. The CCS
1170 * will receive all writes and will therefore always be in sync
1171 * with the main surface.
1172 *
1173 * If we were to disable compression on some writable layouts, the
1174 * CCS could get out of sync with the main surface and the app
1175 * could lose the data it wrote previously. For example, this
1176 * could happen if an app: transitions from UNDEFINED w/o
1177 * ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
1178 *
1179 * The second condition is asserted below, but could be moved
1180 * elsewhere for more coverage (we're only checking transitions from
1181 * an undefined layout).
1182 */
1183 assert(vk_image_layout_is_read_only(final_layout, aspect) ||
1184 (final_aux_usage != ISL_AUX_USAGE_NONE));
1185
1186 must_init_aux_surface = false;
1187 }
1188
1189 } else if (private_binding_acquire) {
1190 /* The fast clear state lives in a driver-private bo, and therefore the
1191 * external/foreign queue is unaware of it.
1192 *
1193 * If this is the first time we are accessing the image, then the fast
1194 * clear state is uninitialized.
1195 *
1196 * If this is NOT the first time we are accessing the image, then the fast
1197 * clear state may still be valid and correct due to the resolve during
1198 * our most recent ownership release. However, we do not track the aux
1199 * state with MI stores, and therefore must assume the worst-case: that
1200 * this is the first time we are accessing the image.
1201 */
1202 assert(image->planes[plane].fast_clear_memory_range.binding ==
1203 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1204 must_init_fast_clear_state = true;
1205
1206 if (anv_image_get_aux_memory_range(image, plane)->binding ==
1207 ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1208 /* The aux surface, like the fast clear state, lives in
1209 * a driver-private bo. We must initialize the aux surface for the
1210 * same reasons we must initialize the fast clear state.
1211 */
1212 must_init_aux_surface = true;
1213 } else {
1214 /* The aux surface, unlike the fast clear state, lives in
1215 * application-visible VkDeviceMemory and is shared with the
1216 * external/foreign queue. Therefore, when we acquire ownership of the
1217 * image with a defined VkImageLayout, the aux surface is valid and has
1218 * the aux state required by the modifier.
1219 */
1220 must_init_aux_surface = false;
1221 }
1222 }
1223
1224 if (must_init_fast_clear_state) {
1225 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1226 assert(!image->planes[plane].can_non_zero_fast_clear);
1227 const uint32_t zero_pixel[4] = {};
1228 set_image_clear_color(cmd_buffer, image, aspect, zero_pixel);
1229 }
1230 if (base_level == 0 && base_layer == 0) {
1231 set_image_fast_clear_state(cmd_buffer, image, aspect,
1232 ANV_FAST_CLEAR_NONE);
1233 }
1234 }
1235
1236 if (must_init_aux_surface) {
1237 assert(devinfo->ver >= 20 || must_init_fast_clear_state);
1238
1239 /* Initialize the aux buffers to enable correct rendering. In order to
1240 * ensure that things such as storage images work correctly, aux buffers
1241 * need to be initialized to valid data.
1242 *
1243 * Having an aux buffer with invalid data is a problem for two reasons:
1244 *
1245 * 1) Having an invalid value in the buffer can confuse the hardware.
1246 * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1247 * invalid and leads to the hardware doing strange things. It
1248 * doesn't hang as far as we can tell but rendering corruption can
1249 * occur.
1250 *
1251 * 2) If this transition is into the GENERAL layout and we then use the
1252 * image as a storage image, then we must have the aux buffer in the
1253 * pass-through state so that, if we then go to texture from the
1254 * image, we get the results of our storage image writes and not the
1255 * fast clear color or other random data.
1256 *
1257 * For CCS both of the problems above are real demonstrable issues. In
1258 * that case, the only thing we can do is to perform an ambiguate to
1259 * transition the aux surface into the pass-through state.
1260 *
1261 * For MCS, (2) is never an issue because we don't support multisampled
1262 * storage images. In theory, issue (1) is a problem with MCS but we've
1263 * never seen it in the wild. For 4x and 16x, all bit patterns could,
1264 * in theory, be interpreted as something but we don't know that all bit
1265 * patterns are actually valid. For 2x and 8x, you could easily end up
1266 * with the MCS referring to an invalid plane because not all bits of
1267 * the MCS value are actually used. Even though we've never seen issues
1268 * in the wild, it's best to play it safe and initialize the MCS. We
1269 * could use a fast-clear for MCS because we only ever touch from render
1270 * and texture (no image load store). However, due to WA 14013111325,
1271 * we choose to ambiguate MCS as well.
1272 */
1273 if (image->vk.samples == 1) {
1274 for (uint32_t l = 0; l < level_count; l++) {
1275 const uint32_t level = base_level + l;
1276
1277 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1278 if (base_layer >= aux_layers)
1279 break; /* We will only get fewer layers as level increases */
1280 uint32_t level_layer_count =
1281 MIN2(layer_count, aux_layers - base_layer);
1282
1283 /* If will_full_fast_clear is set, the caller promises to
1284 * fast-clear the largest portion of the specified range as it can.
1285 * For color images, that means only the first LOD and array slice.
1286 */
1287 if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1288 base_layer++;
1289 level_layer_count--;
1290 if (level_layer_count == 0)
1291 continue;
1292 }
1293
1294 anv_image_ccs_op(cmd_buffer, image,
1295 image->planes[plane].primary_surface.isl.format,
1296 ISL_SWIZZLE_IDENTITY,
1297 aspect, level, base_layer, level_layer_count,
1298 ISL_AUX_OP_AMBIGUATE, NULL, false);
1299
1300 set_image_compressed_bit(cmd_buffer, image, aspect, level,
1301 base_layer, level_layer_count, false);
1302 }
1303 } else {
1304 /* If will_full_fast_clear is set, the caller promises to fast-clear
1305 * the largest portion of the specified range as it can.
1306 */
1307 if (will_full_fast_clear)
1308 return;
1309
1310 assert(base_level == 0 && level_count == 1);
1311 anv_image_mcs_op(cmd_buffer, image,
1312 image->planes[plane].primary_surface.isl.format,
1313 ISL_SWIZZLE_IDENTITY,
1314 aspect, base_layer, layer_count,
1315 ISL_AUX_OP_AMBIGUATE, NULL, false);
1316 }
1317 return;
1318 }
1319
1320 /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1321 * We can handle transitions between CCS_D/E to and from NONE. What we
1322 * don't yet handle is switching between CCS_E and CCS_D within a given
1323 * image. Doing so in a performant way requires more detailed aux state
1324 * tracking such as what is done in i965. For now, just assume that we
1325 * only have one type of compression.
1326 */
1327 assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1328 final_aux_usage == ISL_AUX_USAGE_NONE ||
1329 initial_aux_usage == final_aux_usage);
1330
1331 /* If initial aux usage is NONE, there is nothing to resolve */
1332 if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1333 return;
1334
1335 enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1336
1337 /* If the initial layout supports more fast clear than the final layout
1338 * then we need at least a partial resolve.
1339 */
1340 if (final_fast_clear < initial_fast_clear) {
1341 /* Partial resolves will actually only occur on layer 0/level 0. This
1342 * is generally okay because anv only allows explicit fast clears to
1343 * the first subresource.
1344 *
1345 * The situation is a bit different with FCV_CCS_E. With that aux
1346 * usage, implicit fast clears can occur on any layer and level.
1347 * anv doesn't track fast clear states for more than the first
1348 * subresource, so we need to assert that a layout transition doesn't
1349 * attempt to partial resolve the other subresources.
1350 *
1351 * At the moment, we don't enter such a situation, and partial resolves
1352 * for higher level/layer resources shouldn't be a concern.
1353 */
1354 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1355 assert(base_level == 0 && level_count == 1 &&
1356 base_layer == 0 && layer_count == 1);
1357 }
1358 resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1359 }
1360
1361 if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
1362 !isl_aux_usage_has_ccs_e(final_aux_usage))
1363 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1364
1365 if (resolve_op == ISL_AUX_OP_NONE)
1366 return;
1367
1368 for (uint32_t l = 0; l < level_count; l++) {
1369 uint32_t level = base_level + l;
1370
1371 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1372 if (base_layer >= aux_layers)
1373 break; /* We will only get fewer layers as level increases */
1374 uint32_t level_layer_count =
1375 MIN2(layer_count, aux_layers - base_layer);
1376
1377 for (uint32_t a = 0; a < level_layer_count; a++) {
1378 uint32_t array_layer = base_layer + a;
1379
1380 /* If will_full_fast_clear is set, the caller promises to fast-clear
1381 * the largest portion of the specified range as it can. For color
1382 * images, that means only the first LOD and array slice.
1383 */
1384 if (level == 0 && array_layer == 0 && will_full_fast_clear)
1385 continue;
1386
1387 if (image->vk.samples == 1) {
1388 anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1389 image->planes[plane].primary_surface.isl.format,
1390 ISL_SWIZZLE_IDENTITY,
1391 aspect, level, array_layer, resolve_op,
1392 final_fast_clear);
1393 } else {
1394 /* We only support fast-clear on the first layer so partial
1395 * resolves should not be used on other layers as they will use
1396 * the clear color stored in memory that is only valid for layer0.
1397 */
1398 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1399 array_layer != 0)
1400 continue;
1401
1402 anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1403 image->planes[plane].primary_surface.isl.format,
1404 ISL_SWIZZLE_IDENTITY,
1405 aspect, array_layer, resolve_op,
1406 final_fast_clear);
1407 }
1408 }
1409 }
1410 }
1411
1412 static MUST_CHECK VkResult
1413 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1414 uint32_t color_att_count)
1415 {
1416 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1417
1418 /* Reserve one for the NULL state. */
1419 unsigned num_states = 1 + color_att_count;
1420 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1421 const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
1422 gfx->att_states =
1423 anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
1424 if (gfx->att_states.map == NULL)
1425 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1426
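/* Carve the single allocation into ss_stride-sized slots: slot 0 holds the
 * null surface state, followed by one slot per color attachment.
 */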
1427 struct anv_state next_state = gfx->att_states;
1428 next_state.alloc_size = isl_dev->ss.size;
1429
1430 gfx->null_surface_state = next_state;
1431 next_state.offset += ss_stride;
1432 next_state.map += ss_stride;
1433
1434 gfx->color_att_count = color_att_count;
1435 for (uint32_t i = 0; i < color_att_count; i++) {
1436 gfx->color_att[i] = (struct anv_attachment) {
1437 .surface_state.state = next_state,
1438 };
1439 next_state.offset += ss_stride;
1440 next_state.map += ss_stride;
1441 }
1442 gfx->depth_att = (struct anv_attachment) { };
1443 gfx->stencil_att = (struct anv_attachment) { };
1444
1445 return VK_SUCCESS;
1446 }
1447
1448 static void
1449 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1450 {
1451 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1452
1453 gfx->render_area = (VkRect2D) { };
1454 gfx->layer_count = 0;
1455 gfx->samples = 0;
1456
1457 gfx->color_att_count = 0;
1458 gfx->depth_att = (struct anv_attachment) { };
1459 gfx->stencil_att = (struct anv_attachment) { };
1460 gfx->null_surface_state = ANV_STATE_NULL;
1461 }
1462
1463 /**
1464 * Program the hardware to use the specified L3 configuration.
1465 */
1466 void
1467 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1468 const struct intel_l3_config *cfg)
1469 {
1470 assert(cfg || GFX_VER >= 12);
1471 if (cfg == cmd_buffer->state.current_l3_config)
1472 return;
1473
1474 #if GFX_VER >= 11
1475 /* On Gfx11+ we use only one config, so verify it remains the same and
1476 * skip the stalling and reprogramming entirely.
1477 */
1478 assert(cfg == cmd_buffer->device->l3_config);
1479 #else
1480 if (INTEL_DEBUG(DEBUG_L3)) {
1481 mesa_logd("L3 config transition: ");
1482 intel_dump_l3_config(cfg, stderr);
1483 }
1484
1485 /* According to the hardware docs, the L3 partitioning can only be changed
1486 * while the pipeline is completely drained and the caches are flushed,
1487 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1488 */
1489 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1490 cmd_buffer->state.current_pipeline,
1491 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1492 ANV_PIPE_CS_STALL_BIT);
1493
1494 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1495 * invalidation of the relevant caches. Note that because RO invalidation
1496 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1497 * command is processed by the CS) we cannot combine it with the previous
1498 * stalling flush as the hardware documentation suggests, because that
1499 * would cause the CS to stall on previous rendering *after* RO
1500 * invalidation and wouldn't prevent the RO caches from being polluted by
1501 * concurrent rendering before the stall completes. This intentionally
1502 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1503 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1504 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1505 * already guarantee that there is no concurrent GPGPU kernel execution
1506 * (see SKL HSD 2132585).
1507 */
1508 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1509 cmd_buffer->state.current_pipeline,
1510 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
1511 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
1512 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
1513 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
1514
1515 /* Now send a third stalling flush to make sure that invalidation is
1516 * complete when the L3 configuration registers are modified.
1517 */
1518 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1519 cmd_buffer->state.current_pipeline,
1520 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1521 ANV_PIPE_CS_STALL_BIT);
1522
1523 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1524 #endif /* GFX_VER >= 11 */
1525 cmd_buffer->state.current_l3_config = cfg;
1526 }
1527
1528 ALWAYS_INLINE void
1529 genX(invalidate_aux_map)(struct anv_batch *batch,
1530 struct anv_device *device,
1531 enum intel_engine_class engine_class,
1532 enum anv_pipe_bits bits)
1533 {
1534 #if GFX_VER == 12
1535 if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
1536 uint32_t register_addr = 0;
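/* Each engine class has its own CCS aux-table invalidation register. */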
1537 switch (engine_class) {
1538 case INTEL_ENGINE_CLASS_COMPUTE:
1539 register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
1540 break;
1541 case INTEL_ENGINE_CLASS_COPY:
1542 #if GFX_VERx10 >= 125
1543 register_addr = GENX(BCS_CCS_AUX_INV_num);
1544 #endif
1545 break;
1546 case INTEL_ENGINE_CLASS_VIDEO:
1547 register_addr = GENX(VD0_CCS_AUX_INV_num);
1548 break;
1549 case INTEL_ENGINE_CLASS_RENDER:
1550 default:
1551 register_addr = GENX(GFX_CCS_AUX_INV_num);
1552 break;
1553 }
1554
1555 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
1556 lri.RegisterOffset = register_addr;
1557 lri.DataDWord = 1;
1558 }
1559
1560 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
1561 if (intel_needs_workaround(device->info, 16018063123) &&
1562 engine_class == INTEL_ENGINE_CLASS_COPY) {
1563 genX(batch_emit_fast_color_dummy_blit)(batch, device);
1564 }
1565
1566 /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1567 *
1568 * "Poll Aux Invalidation bit once the invalidation is set
1569 * (Register 4208 bit 0)"
1570 */
1571 anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
1572 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
1573 sem.WaitMode = PollingMode;
1574 sem.RegisterPollMode = true;
1575 sem.SemaphoreDataDword = 0x0;
1576 sem.SemaphoreAddress =
1577 anv_address_from_u64(register_addr);
1578 }
1579 }
1580 #else
1581 assert(!device->info->has_aux_map);
1582 #endif
1583 }
1584
1585 ALWAYS_INLINE enum anv_pipe_bits
1586 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1587 struct anv_device *device,
1588 uint32_t current_pipeline,
1589 enum anv_pipe_bits bits,
1590 enum anv_pipe_bits *emitted_flush_bits)
1591 {
1592 #if GFX_VER >= 12
1593 /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
1594 *
1595 * "SW must follow below programming restrictions when programming
1596 * PIPE_CONTROL command [for ComputeCS]:
1597 * ...
1598 * Following bits must not be set when programmed for ComputeCS:
1599 * - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
1600 * and "Tile Cache Flush Enable"
1601 * - "Depth Stall Enable", Stall at Pixel Scoreboard and
1602 * "PSD Sync Enable".
1603 * - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
1604 * "AMFS Flush Enable", "VF Cache Invalidation Enable" and
1605 * "Global Snapshot Count Reset"."
1606 *
1607 * XXX: According to spec this should not be a concern for a regular
1608 * RCS in GPGPU mode, but during testing it was found that at least
1609 * "VF Cache Invalidation Enable" bit is ignored in such case.
1610 * This can cause us to miss some important invalidations
1611 * (e.g. from CmdPipelineBarriers) and have incoherent data.
1612 *
1613 * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
1614 * when specific 3d related bits are programmed in pipecontrol in
1615 * compute mode" that suggests us not to use "RT Cache Flush" in GPGPU mode.
1616 *
1617 * The other bits are not confirmed to cause problems, but included here
1618 * just to be safe, as they're also not really relevant in the GPGPU mode,
1619 * and having them doesn't seem to cause any regressions.
1620 *
1621 * So if we're currently in GPGPU mode, we hide some bits from
1622 * this flush and emit them only once we are able to. The same applies
1623 * to GPGPU-only bits while in 3D mode.
1624 */
1625 enum anv_pipe_bits defer_bits = bits &
1626 (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS : ANV_PIPE_GPGPU_BITS);
1627
1628 bits &= ~defer_bits;
1629 #endif
1630
1631 /*
1632 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1633 *
1634 * Write synchronization is a special case of end-of-pipe
1635 * synchronization that requires that the render cache and/or depth
1636 * related caches are flushed to memory, where the data will become
1637 * globally visible. This type of synchronization is required prior to
1638 * SW (CPU) actually reading the result data from memory, or initiating
1639 * an operation that will use as a read surface (such as a texture
1640 * surface) a previous render target and/or depth/stencil buffer
1641 *
1642 *
1643 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1644 *
1645 * Exercising the write cache flush bits (Render Target Cache Flush
1646 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1647 * ensures the write caches are flushed and doesn't guarantee the data
1648 * is globally visible.
1649 *
1650 * SW can track the completion of the end-of-pipe-synchronization by
1651 * using "Notify Enable" and "PostSync Operation - Write Immediate
1652 * Data" in the PIPE_CONTROL command.
1653 *
1654 * In other words, flushes are pipelined while invalidations are handled
1655 * immediately. Therefore, if we're flushing anything then we need to
1656 * schedule an end-of-pipe sync before any invalidations can happen.
1657 */
1658 if (bits & ANV_PIPE_FLUSH_BITS)
1659 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1660
1661
1662 /* HSD 1209978178: docs say that before programming the aux table:
1663 *
1664 * "Driver must ensure that the engine is IDLE but ensure it doesn't
1665 * add extra flushes in the case it knows that the engine is already
1666 * IDLE."
1667 *
1668 * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1669 *
1670 * "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
1671 *
1672 * Notice we don't set the L3 Fabric Flush here, because we have
1673 * ANV_PIPE_END_OF_PIPE_SYNC_BIT which inserts a CS stall. The
1674 * PIPE_CONTROL::L3 Fabric Flush documentation says:
1675 *
1676 * "L3 Fabric Flush will ensure all the pending transactions in the L3
1677 * Fabric are flushed to global observation point. HW does implicit L3
1678 * Fabric Flush on all stalling flushes (both explicit and implicit)
1679 * and on PIPECONTROL having Post Sync Operation enabled."
1680 *
1681 * Therefore setting L3 Fabric Flush here would be redundant.
1682 */
1683 if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
1684 if (current_pipeline == GPGPU) {
1685 bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1686 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1687 (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0));
1688 } else if (current_pipeline == _3D) {
1689 bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1690 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1691 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
1692 (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0));
1693 }
1694 }
1695
1696 /* If we're going to do an invalidate and we have a pending end-of-pipe
1697 * sync that has yet to be resolved, we do the end-of-pipe sync now.
1698 */
1699 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1700 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1701 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1702 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1703
1704 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
1705 fputs("pc: add ", stderr);
1706 anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stdout);
1707 fprintf(stderr, "reason: Ensure flushes done before invalidate\n");
1708 }
1709 }
1710
1711 /* Project: SKL / Argument: LRI Post Sync Operation [23]
1712 *
1713 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1714 * programmed prior to programming a PIPECONTROL command with "LRI
1715 * Post Sync Operation" in GPGPU mode of operation (i.e when
1716 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
1717 *
1718 * The same text exists a few rows below for Post Sync Op.
1719 */
1720 if (bits & ANV_PIPE_POST_SYNC_BIT) {
1721 if (GFX_VER == 9 && current_pipeline == GPGPU)
1722 bits |= ANV_PIPE_CS_STALL_BIT;
1723 bits &= ~ANV_PIPE_POST_SYNC_BIT;
1724 }
1725
1726 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1727 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1728 enum anv_pipe_bits flush_bits =
1729 bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1730 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1731
1732 uint32_t sync_op = NoWrite;
1733 struct anv_address addr = ANV_NULL_ADDRESS;
1734
1735 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1736 *
1737 * "The most common action to perform upon reaching a
1738 * synchronization point is to write a value out to memory. An
1739 * immediate value (included with the synchronization command) may
1740 * be written."
1741 *
1742 *
1743 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1744 *
1745 * "In case the data flushed out by the render engine is to be
1746 * read back in to the render engine in coherent manner, then the
1747 * render engine has to wait for the fence completion before
1748 * accessing the flushed data. This can be achieved by following
1749 * means on various products: PIPE_CONTROL command with CS Stall
1750 * and the required write caches flushed with Post-Sync-Operation
1751 * as Write Immediate Data.
1752 *
1753 * Example:
1754 * - Workload-1 (3D/GPGPU/MEDIA)
1755 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1756 * Immediate Data, Required Write Cache Flush bits set)
1757 * - Workload-2 (Can use the data produce or output by
1758 * Workload-1)
1759 */
1760 if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1761 flush_bits |= ANV_PIPE_CS_STALL_BIT;
1762 sync_op = WriteImmediateData;
1763 addr = device->workaround_address;
1764 }
1765
1766 /* Flush PC. */
1767 genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1768 sync_op, addr, 0, flush_bits);
1769
1770 /* If the caller wants to know what flushes have been emitted,
1771 * provide the bits based off the PIPE_CONTROL programmed bits.
1772 */
1773 if (emitted_flush_bits != NULL)
1774 *emitted_flush_bits = flush_bits;
1775
1776 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1777 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1778 }
1779
1780 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1781 uint32_t sync_op = NoWrite;
1782 struct anv_address addr = ANV_NULL_ADDRESS;
1783
1784 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1785 *
1786 * "When VF Cache Invalidate is set “Post Sync Operation” must be
1787 * enabled to “Write Immediate Data” or “Write PS Depth Count” or
1788 * “Write Timestamp”.
1789 */
1790 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1791 sync_op = WriteImmediateData;
1792 addr = device->workaround_address;
1793 }
1794
1795 /* Invalidate PC. */
1796 genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1797 sync_op, addr, 0, bits);
1798
1799 enum intel_engine_class engine_class =
1800 current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
1801 INTEL_ENGINE_CLASS_RENDER;
1802 genX(invalidate_aux_map)(batch, device, engine_class, bits);
1803
1804 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1805 }
1806
1807 #if GFX_VER >= 12
1808 bits |= defer_bits;
1809 #endif
1810
1811 return bits;
1812 }
1813
1814 ALWAYS_INLINE void
1815 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1816 {
1817 #if INTEL_NEEDS_WA_1508744258
1818 /* If we're changing the state of the RHWO optimization, we need to have
1819 * sb_stall+cs_stall.
1820 */
1821 const bool rhwo_opt_change =
1822 cmd_buffer->state.rhwo_optimization_enabled !=
1823 cmd_buffer->state.pending_rhwo_optimization_enabled;
1824 if (rhwo_opt_change) {
1825 anv_add_pending_pipe_bits(cmd_buffer,
1826 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
1827 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1828 "change RHWO optimization");
1829 }
1830 #endif
1831
1832 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1833
1834 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1835 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1836 else if (bits == 0)
1837 return;
1838
1839 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
1840 anv_cmd_buffer_is_video_queue(cmd_buffer)) {
1841 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1842 genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
1843 cmd_buffer->queue_family->engine_class, bits);
1844 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1845 }
1846 cmd_buffer->state.pending_pipe_bits = bits;
1847 return;
1848 }
1849
1850 if (GFX_VER == 9 &&
1851 (bits & ANV_PIPE_CS_STALL_BIT) &&
1852 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1853 /* If we are doing a VF cache invalidate AND a CS stall (it must be
1854 * both) then we can reset our vertex cache tracking.
1855 */
1856 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1857 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1858 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1859 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1860 }
1861
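/* Emit the flushes and record which bits actually made it into the
 * PIPE_CONTROL so pending query updates can account for them.
 */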
1862 enum anv_pipe_bits emitted_bits = 0;
1863 cmd_buffer->state.pending_pipe_bits =
1864 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1865 cmd_buffer->device,
1866 cmd_buffer->state.current_pipeline,
1867 bits,
1868 &emitted_bits);
1869 anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
1870
1871 #if INTEL_NEEDS_WA_1508744258
1872 if (rhwo_opt_change) {
1873 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1874 c1.RCCRHWOOptimizationDisable =
1875 !cmd_buffer->state.pending_rhwo_optimization_enabled;
1876 c1.RCCRHWOOptimizationDisableMask = true;
1877 }
1878 cmd_buffer->state.rhwo_optimization_enabled =
1879 cmd_buffer->state.pending_rhwo_optimization_enabled;
1880 }
1881 #endif
1882
1883 }
1884
1885 static inline struct anv_state
1886 emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1887 struct anv_cmd_pipeline_state *pipe_state,
1888 struct anv_pipeline_binding *binding,
1889 const struct anv_descriptor *desc)
1890 {
1891 if (!desc->buffer)
1892 return anv_null_surface_state_for_binding_table(cmd_buffer->device);
1893
1894 /* Compute the offset within the buffer */
1895 uint32_t dynamic_offset =
1896 pipe_state->dynamic_offsets[
1897 binding->set].offsets[binding->dynamic_offset_index];
1898 uint64_t offset = desc->offset + dynamic_offset;
1899 /* Clamp to the buffer size */
1900 offset = MIN2(offset, desc->buffer->vk.size);
1901 /* Clamp the range to the buffer size */
1902 uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
1903
1904 /* Align the range for consistency */
1905 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
1906 range = align(range, ANV_UBO_ALIGNMENT);
1907
1908 struct anv_address address =
1909 anv_address_add(desc->buffer->address, offset);
1910
1911 struct anv_state surface_state =
1912 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
1913 if (surface_state.map == NULL)
1914 return ANV_STATE_NULL;
1915
1916 enum isl_format format =
1917 anv_isl_format_for_descriptor_type(cmd_buffer->device,
1918 desc->type);
1919
1920 isl_surf_usage_flags_t usage =
1921 anv_isl_usage_for_descriptor_type(desc->type);
1922
1923 anv_fill_buffer_surface_state(cmd_buffer->device,
1924 surface_state.map,
1925 format, ISL_SWIZZLE_IDENTITY,
1926 usage, address, range, 1);
1927
1928 return surface_state;
1929 }
1930
1931 static uint32_t
1932 emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1933 struct anv_cmd_pipeline_state *pipe_state,
1934 struct anv_pipeline_binding *binding,
1935 const struct anv_descriptor *desc)
1936 {
1937 struct anv_device *device = cmd_buffer->device;
1938 struct anv_state surface_state;
1939
1940 /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1941 * Depending on where the descriptor surface state is allocated, they can
1942 * either come from device->internal_surface_state_pool or
1943 * device->bindless_surface_state_pool.
1944 */
1945 switch (desc->type) {
1946 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1947 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1948 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
1949 if (desc->image_view) {
1950 const struct anv_surface_state *sstate =
1951 anv_image_view_texture_surface_state(desc->image_view,
1952 binding->plane,
1953 desc->layout);
1954 surface_state = desc->image_view->use_surface_state_stream ?
1955 sstate->state :
1956 anv_bindless_state_for_binding_table(device, sstate->state);
1957 assert(surface_state.alloc_size);
1958 } else {
1959 surface_state = anv_null_surface_state_for_binding_table(device);
1960 }
1961 break;
1962 }
1963
1964 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1965 if (desc->image_view) {
1966 const struct anv_surface_state *sstate =
1967 anv_image_view_storage_surface_state(desc->image_view);
1968 surface_state = desc->image_view->use_surface_state_stream ?
1969 sstate->state :
1970 anv_bindless_state_for_binding_table(device, sstate->state);
1971 assert(surface_state.alloc_size);
1972 } else {
1973 surface_state =
1974 anv_null_surface_state_for_binding_table(device);
1975 }
1976 break;
1977 }
1978
1979 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1980 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1981 if (desc->set_buffer_view) {
1982 surface_state = desc->set_buffer_view->general.state;
1983 assert(surface_state.alloc_size);
1984 } else {
1985 surface_state = anv_null_surface_state_for_binding_table(device);
1986 }
1987 break;
1988
1989 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1990 if (desc->buffer_view) {
1991 surface_state = anv_bindless_state_for_binding_table(
1992 device,
1993 desc->buffer_view->general.state);
1994 assert(surface_state.alloc_size);
1995 } else {
1996 surface_state = anv_null_surface_state_for_binding_table(device);
1997 }
1998 break;
1999
2000 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2001 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2002 surface_state =
2003 emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
2004 binding, desc);
2005 break;
2006
2007 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2008 if (desc->buffer_view) {
2009 surface_state = anv_bindless_state_for_binding_table(
2010 device, desc->buffer_view->storage.state);
2011 assert(surface_state.alloc_size);
2012 } else {
2013 surface_state = anv_null_surface_state_for_binding_table(device);
2014 }
2015 break;
2016
2017 default:
2018 unreachable("Invalid descriptor type");
2019 }
2020
2021 return surface_state.offset;
2022 }
2023
2024 static uint32_t
2025 emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
2026 struct anv_cmd_pipeline_state *pipe_state,
2027 const struct anv_descriptor_set *set,
2028 struct anv_pipeline_binding *binding,
2029 const struct anv_descriptor *desc)
2030 {
2031 uint32_t desc_offset;
2032
2033 /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
2034 * Depending on where the descriptor surface state is allocated, they can
2035 * either come from device->internal_surface_state_pool or
2036 * device->bindless_surface_state_pool.
2037 */
2038 switch (desc->type) {
2039 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2040 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2041 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2042 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
2043 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2044 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2045 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2046 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2047 desc_offset = set->desc_offset + binding->set_offset;
2048 break;
2049
2050 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2051 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2052 struct anv_state state =
2053 emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
2054 binding, desc);
2055 desc_offset = state.offset;
2056 break;
2057 }
2058
2059 default:
2060 unreachable("Invalid descriptor type");
2061 }
2062
2063 return desc_offset;
2064 }
2065
2066 static VkResult
2067 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2068 struct anv_cmd_pipeline_state *pipe_state,
2069 struct anv_shader_bin *shader,
2070 struct anv_state *bt_state)
2071 {
2072 uint32_t state_offset;
2073
2074 struct anv_pipeline_bind_map *map = &shader->bind_map;
2075 if (map->surface_count == 0) {
2076 *bt_state = (struct anv_state) { 0, };
2077 return VK_SUCCESS;
2078 }
2079
2080 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2081 map->surface_count,
2082 &state_offset);
2083 uint32_t *bt_map = bt_state->map;
2084
2085 if (bt_state->map == NULL)
2086 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2087
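/* Write one binding table entry per surface in the shader's bind map, each
 * pointing at a surface state.
 */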
2088 for (uint32_t s = 0; s < map->surface_count; s++) {
2089 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2090
2091 struct anv_state surface_state;
2092
2093 switch (binding->set) {
2094 case ANV_DESCRIPTOR_SET_NULL:
2095 bt_map[s] = 0;
2096 break;
2097
2098 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2099 /* Color attachment binding */
2100 assert(shader->stage == MESA_SHADER_FRAGMENT);
2101 if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2102 const struct anv_attachment *att =
2103 &cmd_buffer->state.gfx.color_att[binding->index];
2104 surface_state = att->surface_state.state;
2105 } else {
2106 surface_state = cmd_buffer->state.gfx.null_surface_state;
2107 }
2108 assert(surface_state.map);
2109 bt_map[s] = surface_state.offset + state_offset;
2110 break;
2111
2112 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2113 /* This is always the first binding for compute shaders */
2114 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2115
2116 struct anv_state surface_state =
2117 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2118 if (surface_state.map == NULL)
2119 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2120
2121 const enum isl_format format =
2122 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2123 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2124 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
2125 format, ISL_SWIZZLE_IDENTITY,
2126 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2127 cmd_buffer->state.compute.num_workgroups,
2128 12, 1);
2129
2130 assert(surface_state.map);
2131 bt_map[s] = surface_state.offset + state_offset;
2132 break;
2133 }
2134
2135 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2136 struct anv_descriptor_set *set =
2137 pipe_state->descriptors[binding->index];
2138
2139 /* If the shader doesn't access the set buffer, just put the null
2140 * surface.
2141 */
2142 if (set->is_push && !shader->push_desc_info.used_set_buffer) {
2143 bt_map[s] = 0;
2144 break;
2145 }
2146
2147 /* This is a descriptor set buffer so the set index is actually
2148 * given by binding->binding. (Yes, that's confusing.)
2149 */
2150 assert(set->desc_surface_mem.alloc_size);
2151 assert(set->desc_surface_state.alloc_size);
2152 bt_map[s] = set->desc_surface_state.offset + state_offset;
2153 add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
2154 break;
2155 }
2156
2157 case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
2158 assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size);
2159 bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset +
2160 state_offset;
2161 break;
2162 }
2163
2164 default: {
2165 assert(binding->set < MAX_SETS);
2166 const struct anv_descriptor_set *set =
2167 pipe_state->descriptors[binding->set];
2168
2169 if (binding->index >= set->descriptor_count) {
2170 /* From the Vulkan spec section entitled "DescriptorSet and
2171 * Binding Assignment":
2172 *
2173 * "If the array is runtime-sized, then array elements greater
2174 * than or equal to the size of that binding in the bound
2175 * descriptor set must not be used."
2176 *
2177 * Unfortunately, the compiler isn't smart enough to figure out
2178 * when a dynamic binding isn't used so it may grab the whole
2179 * array and stick it in the binding table. In this case, it's
2180 * safe to just skip those bindings that are OOB.
2181 */
2182 assert(binding->index < set->layout->descriptor_count);
2183 continue;
2184 }
2185
2186 /* For push descriptors, if the binding is fully promoted to push
2187 * constants, just reference the null surface in the binding table.
2188 * It's unused and we didn't allocate/pack a surface state for it.
2189 */
2190 if (set->is_push) {
2191 uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
2192 assert(desc_idx < MAX_PUSH_DESCRIPTORS);
2193
2194 if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
2195 surface_state =
2196 anv_null_surface_state_for_binding_table(cmd_buffer->device);
2197 break;
2198 }
2199 }
2200
2201 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2202 if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
2203 desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
2204 /* Nothing for us to do here */
2205 continue;
2206 }
2207
2208 const struct anv_pipeline *pipeline = pipe_state->pipeline;
2209 uint32_t surface_state_offset;
2210 if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
2211 surface_state_offset =
2212 emit_indirect_descriptor_binding_table_entry(cmd_buffer,
2213 pipe_state,
2214 binding, desc);
2215 } else {
2216 assert(pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
2217 pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
2218 surface_state_offset =
2219 emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
2220 set, binding, desc);
2221 }
2222
2223 bt_map[s] = surface_state_offset + state_offset;
2224 break;
2225 }
2226 }
2227 }
2228
2229 return VK_SUCCESS;
2230 }
2231
2232 static VkResult
2233 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2234 struct anv_cmd_pipeline_state *pipe_state,
2235 struct anv_shader_bin *shader,
2236 struct anv_state *state)
2237 {
2238 struct anv_pipeline_bind_map *map = &shader->bind_map;
2239 if (map->sampler_count == 0) {
2240 *state = (struct anv_state) { 0, };
2241 return VK_SUCCESS;
2242 }
2243
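/* Each SAMPLER_STATE entry is 16 bytes (4 dwords). */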
2244 uint32_t size = map->sampler_count * 16;
2245 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2246
2247 if (state->map == NULL)
2248 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2249
2250 for (uint32_t s = 0; s < map->sampler_count; s++) {
2251 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2252 const struct anv_descriptor *desc =
2253 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2254
2255 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2256 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2257 continue;
2258
2259 struct anv_sampler *sampler = desc->sampler;
2260
2261 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2262 * happens to be zero.
2263 */
2264 if (sampler == NULL)
2265 continue;
2266
2267 memcpy(state->map + (s * 16), sampler->state[binding->plane],
2268 sizeof(sampler->state[0]));
2269 }
2270
2271 return VK_SUCCESS;
2272 }
2273
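/* Returns the mask of shader stages whose samplers and binding tables were
 * successfully (re-)emitted.
 */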
2274 uint32_t
2275 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
2276 struct anv_cmd_pipeline_state *pipe_state,
2277 const VkShaderStageFlags dirty,
2278 struct anv_shader_bin **shaders,
2279 uint32_t num_shaders)
2280 {
2281 VkShaderStageFlags flushed = 0;
2282
2283 VkResult result = VK_SUCCESS;
2284 for (uint32_t i = 0; i < num_shaders; i++) {
2285 if (!shaders[i])
2286 continue;
2287
2288 gl_shader_stage stage = shaders[i]->stage;
2289 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2290 if ((vk_stage & dirty) == 0)
2291 continue;
2292
2293 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2294 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2295 &cmd_buffer->state.samplers[stage]);
2296 if (result != VK_SUCCESS)
2297 break;
2298
2299 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2300 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2301 &cmd_buffer->state.binding_tables[stage]);
2302 if (result != VK_SUCCESS)
2303 break;
2304
2305 flushed |= vk_stage;
2306 }
2307
2308 if (result != VK_SUCCESS) {
2309 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2310
2311 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2312 if (result != VK_SUCCESS)
2313 return 0;
2314
2315 /* Re-emit the BT base address so we get the new surface state base
2316 * address before we start emitting binding tables etc.
2317 */
2318 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
2319
2320 /* Re-emit all active binding tables */
2321 flushed = 0;
2322
2323 for (uint32_t i = 0; i < num_shaders; i++) {
2324 if (!shaders[i])
2325 continue;
2326
2327 gl_shader_stage stage = shaders[i]->stage;
2328
2329 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2330 &cmd_buffer->state.samplers[stage]);
2331 if (result != VK_SUCCESS) {
2332 anv_batch_set_error(&cmd_buffer->batch, result);
2333 return 0;
2334 }
2335 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2336 &cmd_buffer->state.binding_tables[stage]);
2337 if (result != VK_SUCCESS) {
2338 anv_batch_set_error(&cmd_buffer->batch, result);
2339 return 0;
2340 }
2341
2342 flushed |= mesa_to_vk_shader_stage(stage);
2343 }
2344 }
2345
2346 return flushed;
2347 }
2348
2349 /* This function generates the surface state used to read the content of the
2350 * descriptor buffer.
2351 */
2352 void
2353 genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
2354 struct anv_descriptor_set *set)
2355 {
2356 assert(set->desc_surface_state.map == NULL);
2357
2358 struct anv_descriptor_set_layout *layout = set->layout;
2359 enum isl_format format =
2360 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2361 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2362
2363 set->desc_surface_state =
2364 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2365 if (set->desc_surface_state.map == NULL)
2366 return;
2367 anv_fill_buffer_surface_state(cmd_buffer->device,
2368 set->desc_surface_state.map,
2369 format, ISL_SWIZZLE_IDENTITY,
2370 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2371 set->desc_surface_addr,
2372 layout->descriptor_buffer_surface_size, 1);
2373 }
2374
2375 /* This function generates surface states used by a pipeline for push
2376 * descriptors. This is delayed to the draw/dispatch time to avoid allocation
2377 * and surface state generation when a pipeline is not going to use the
2378 * binding table to access any push descriptor data.
2379 */
2380 void
2381 genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
2382 struct anv_descriptor_set *set)
2383 {
2384 while (set->generate_surface_states) {
2385 int desc_idx = u_bit_scan(&set->generate_surface_states);
2386 struct anv_descriptor *desc = &set->descriptors[desc_idx];
2387 struct anv_buffer_view *bview = desc->set_buffer_view;
2388
2389 if (bview != NULL && bview->general.state.map == NULL) {
2390 bview->general.state =
2391 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2392 if (bview->general.state.map == NULL)
2393 return;
2394 anv_descriptor_write_surface_state(cmd_buffer->device, desc,
2395 bview->general.state);
2396 }
2397 }
2398 }
2399
2400 ALWAYS_INLINE void
2401 genX(batch_emit_pipe_control)(struct anv_batch *batch,
2402 const struct intel_device_info *devinfo,
2403 uint32_t current_pipeline,
2404 enum anv_pipe_bits bits,
2405 const char *reason)
2406 {
2407 genX(batch_emit_pipe_control_write)(batch,
2408 devinfo,
2409 current_pipeline,
2410 NoWrite,
2411 ANV_NULL_ADDRESS,
2412 0,
2413 bits,
2414 reason);
2415 }
2416
2417 ALWAYS_INLINE void
2418 genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
2419 const struct intel_device_info *devinfo,
2420 uint32_t current_pipeline,
2421 uint32_t post_sync_op,
2422 struct anv_address address,
2423 uint32_t imm_data,
2424 enum anv_pipe_bits bits,
2425 const char *reason)
2426 {
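/* PIPE_CONTROL is not available on the copy/video engines. */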
2427 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
2428 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
2429 unreachable("Trying to emit unsupported PIPE_CONTROL command.");
2430
2431 const bool trace_flush =
2432 (bits & (ANV_PIPE_FLUSH_BITS |
2433 ANV_PIPE_STALL_BITS |
2434 ANV_PIPE_INVALIDATE_BITS |
2435 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
2436 if (trace_flush && batch->trace != NULL) {
2437 /* Store pipe control reasons if there is enough space. */
2438 if (batch->pc_reasons_count < ARRAY_SIZE(batch->pc_reasons)) {
2439 batch->pc_reasons[batch->pc_reasons_count++] = reason;
2440 }
2441 trace_intel_begin_stall(batch->trace);
2442 }
2443
2444
2445 /* XXX - insert all workarounds and GFX specific things below. */
2446
2447 /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
2448 * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
2449 * with CS_STALL Bit set (with No POST_SYNC ENABLED)
2450 */
2451 if (intel_device_info_is_adln(devinfo) &&
2452 current_pipeline == GPGPU &&
2453 post_sync_op != NoWrite) {
2454 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2455 pipe.CommandStreamerStallEnable = true;
2456 anv_debug_dump_pc(pipe, "Wa_14014966230");
2457 };
2458 }
2459
2460 /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
2461 * PIPE_CONTROL, Flush Types:
2462 * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
2463 * For newer platforms this is documented in the PIPE_CONTROL instruction
2464 * page.
2465 */
2466 if (current_pipeline == GPGPU &&
2467 (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
2468 bits |= ANV_PIPE_CS_STALL_BIT;
2469
2470 #if INTEL_NEEDS_WA_1409600907
2471 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2472 * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2473 */
2474 if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
2475 bits |= ANV_PIPE_DEPTH_STALL_BIT;
2476 #endif
2477
2478 #if GFX_VERx10 >= 125
2479 if (current_pipeline != GPGPU) {
2480 if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
2481 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2482 } else {
2483 if (bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
2484 ANV_PIPE_DATA_CACHE_FLUSH_BIT))
2485 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2486 }
2487
2488 /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
2489 *
2490 * "'HDC Pipeline Flush' bit must be set for this bit to take
2491 * effect."
2492 */
2493 if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
2494 bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2495 #endif
2496
2497 #if GFX_VER < 12
2498 if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
2499 bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2500 #endif
2501
2502 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2503 *
2504 * "If the VF Cache Invalidation Enable is set to a 1 in a
2505 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2506 * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2507 * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2508 * a 1."
2509 *
2510 * This appears to hang Broadwell, so we restrict it to just gfx9.
2511 */
2512 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2513 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2514
2515 #if GFX_VER >= 9 && GFX_VER <= 11
2516 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2517 *
2518 * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
2519 * always set for GPGPU workloads when “Texture Cache
2520 * Invalidation Enable” bit is set".
2521 *
2522 * Workaround stopped appearing in TGL PRMs.
2523 */
2524 if (current_pipeline == GPGPU &&
2525 (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
2526 bits |= ANV_PIPE_CS_STALL_BIT;
2527 #endif
2528
2529 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2530 #if GFX_VERx10 >= 125
2531 pipe.UntypedDataPortCacheFlushEnable =
2532 bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2533 pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
2534 #endif
2535 #if GFX_VER == 12
2536 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2537 #endif
2538 #if GFX_VER > 11
2539 pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2540 #endif
2541 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2542 pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2543 pipe.RenderTargetCacheFlushEnable =
2544 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2545
2546 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2547
2548 pipe.TLBInvalidate = bits & ANV_PIPE_TLB_INVALIDATE_BIT;
2549
2550 #if GFX_VERx10 >= 125
2551 pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2552 #endif
2553 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2554 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2555
2556 pipe.StateCacheInvalidationEnable =
2557 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2558 pipe.ConstantCacheInvalidationEnable =
2559 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2560 #if GFX_VER >= 12
2561 /* Invalidates the L3 cache part in which index & vertex data is loaded
2562 * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2563 */
2564 pipe.L3ReadOnlyCacheInvalidationEnable =
2565 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2566 #endif
2567 pipe.VFCacheInvalidationEnable =
2568 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2569 pipe.TextureCacheInvalidationEnable =
2570 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2571 pipe.InstructionCacheInvalidateEnable =
2572 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2573
2574 pipe.PostSyncOperation = post_sync_op;
2575 pipe.Address = address;
2576 pipe.DestinationAddressType = DAT_PPGTT;
2577 pipe.ImmediateData = imm_data;
2578
2579 anv_debug_dump_pc(pipe, reason);
2580 }
2581
2582 if (trace_flush && batch->trace != NULL) {
2583 trace_intel_end_stall(batch->trace, bits,
2584 anv_pipe_flush_bit_to_ds_stall_flag,
2585 batch->pc_reasons[0],
2586 batch->pc_reasons[1],
2587 batch->pc_reasons[2],
2588 batch->pc_reasons[3]);
2589 batch->pc_reasons[0] = NULL;
2590 batch->pc_reasons[1] = NULL;
2591 batch->pc_reasons[2] = NULL;
2592 batch->pc_reasons[3] = NULL;
2593 batch->pc_reasons_count = 0;
2594 }
2595 }
2596
2597 /* Set preemption on/off. */
2598 void
2599 genX(batch_set_preemption)(struct anv_batch *batch,
2600 const struct intel_device_info *devinfo,
2601 uint32_t current_pipeline,
2602 bool value)
2603 {
2604 #if INTEL_WA_16013994831_GFX_VER
2605 if (!intel_needs_workaround(devinfo, 16013994831))
2606 return;
2607
2608 anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
2609 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
2610 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
2611 }
2612
2613 /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
2614 genx_batch_emit_pipe_control(batch, devinfo, current_pipeline,
2615 ANV_PIPE_CS_STALL_BIT);
2616
2617 for (unsigned i = 0; i < 250; i++)
2618 anv_batch_emit(batch, GENX(MI_NOOP), noop);
2619 #endif
2620 }
2621
2622 void
2623 genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
2624 {
2625 #if GFX_VERx10 >= 120
2626 if (cmd_buffer->state.gfx.object_preemption == value)
2627 return;
2628
2629 genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device->info,
2630 cmd_buffer->state.current_pipeline,
2631 value);
2632 cmd_buffer->state.gfx.object_preemption = value;
2633 #endif
2634 }
2635
2636 ALWAYS_INLINE static void
2637 update_descriptor_set_surface_state(struct anv_cmd_buffer *cmd_buffer,
2638 struct anv_cmd_pipeline_state *pipe_state,
2639 uint32_t set_idx)
2640 {
2641 if (!pipe_state->descriptor_buffers[set_idx].bound)
2642 return;
2643
2644 const struct anv_physical_device *device = cmd_buffer->device->physical;
2645 const int32_t buffer_index =
2646 pipe_state->descriptor_buffers[set_idx].buffer_index;
2647 const struct anv_va_range *push_va_range =
2648 GFX_VERx10 >= 125 ?
2649 &device->va.push_descriptor_buffer_pool :
2650 &device->va.internal_surface_state_pool;
2651 const struct anv_va_range *va_range =
2652 buffer_index == -1 ? push_va_range : &device->va.dynamic_visible_pool;
2653 const uint64_t descriptor_set_addr =
2654 (buffer_index == -1 ? va_range->addr :
2655 cmd_buffer->state.descriptor_buffers.address[buffer_index]) +
2656 pipe_state->descriptor_buffers[set_idx].buffer_offset;
2657 const uint64_t set_size =
2658 MIN2(va_range->size - (descriptor_set_addr - va_range->addr),
2659 anv_physical_device_bindless_heap_size(device, true));
2660
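/* Only allocate and fill a new surface state when the descriptor set
 * address has changed.
 */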
2661 if (descriptor_set_addr != pipe_state->descriptor_buffers[set_idx].address) {
2662 pipe_state->descriptor_buffers[set_idx].address = descriptor_set_addr;
2663
2664 struct anv_state surface_state =
2665 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2666 const enum isl_format format =
2667 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2668 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2669 anv_fill_buffer_surface_state(
2670 cmd_buffer->device, surface_state.map,
2671 format, ISL_SWIZZLE_IDENTITY,
2672 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2673 anv_address_from_u64(pipe_state->descriptor_buffers[set_idx].address),
2674 set_size, 1);
2675
2676 pipe_state->descriptor_buffers[set_idx].state = surface_state;
2677 }
2678 }
2679
2680 ALWAYS_INLINE static uint32_t
2681 compute_descriptor_set_surface_offset(const struct anv_cmd_buffer *cmd_buffer,
2682 const struct anv_cmd_pipeline_state *pipe_state,
2683 const uint32_t set_idx)
2684 {
2685 const struct anv_physical_device *device = cmd_buffer->device->physical;
2686
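/* With extended bindless surface offsets (ex_bso), return a byte offset
 * relative to the dynamic visible pool base; otherwise use the legacy
 * encoding below.
 */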
2687 if (device->uses_ex_bso) {
2688 int32_t buffer_index =
2689 pipe_state->descriptor_buffers[set_idx].buffer_index;
2690 uint64_t buffer_address =
2691 buffer_index == -1 ?
2692 device->va.push_descriptor_buffer_pool.addr :
2693 cmd_buffer->state.descriptor_buffers.address[buffer_index];
2694
2695 return (buffer_address - device->va.dynamic_visible_pool.addr) +
2696 pipe_state->descriptor_buffers[set_idx].buffer_offset;
2697 }
2698
2699 return pipe_state->descriptor_buffers[set_idx].buffer_offset << 6;
2700 }
2701
2702 ALWAYS_INLINE static uint32_t
2703 compute_descriptor_set_sampler_offset(const struct anv_cmd_buffer *cmd_buffer,
2704 const struct anv_cmd_pipeline_state *pipe_state,
2705 const uint32_t set_idx)
2706 {
2707 const struct anv_physical_device *device = cmd_buffer->device->physical;
2708 int32_t buffer_index =
2709 pipe_state->descriptor_buffers[set_idx].buffer_index;
2710 uint64_t buffer_address =
2711 buffer_index == -1 ?
2712 device->va.push_descriptor_buffer_pool.addr :
2713 cmd_buffer->state.descriptor_buffers.address[buffer_index];
2714
2715 return (buffer_address - device->va.dynamic_state_pool.addr) +
2716 pipe_state->descriptor_buffers[set_idx].buffer_offset;
2717 }
2718
2719 void
2720 genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
2721 struct anv_cmd_pipeline_state *pipe_state)
2722 {
2723 /* On Gfx12.5+ the STATE_BASE_ADDRESS BindlessSurfaceStateBaseAddress &
2724 * DynamicStateBaseAddress are fixed. So as long as we stay in one
2725 * descriptor buffer mode, there is no need to switch.
2726 */
2727 #if GFX_VERx10 >= 125
2728 if (cmd_buffer->state.current_db_mode !=
2729 cmd_buffer->state.pending_db_mode)
2730 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2731 #else
2732 if (cmd_buffer->state.descriptor_buffers.dirty)
2733 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2734 #endif
2735
2736 assert(cmd_buffer->state.current_db_mode !=
2737 ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
2738 if (cmd_buffer->state.current_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER &&
2739 (cmd_buffer->state.descriptor_buffers.dirty ||
2740 (pipe_state->pipeline->active_stages &
2741 cmd_buffer->state.descriptor_buffers.offsets_dirty) != 0)) {
2742 struct anv_push_constants *push_constants =
2743 &pipe_state->push_constants;
2744 for (uint32_t i = 0; i < ARRAY_SIZE(push_constants->desc_surface_offsets); i++) {
2745 update_descriptor_set_surface_state(cmd_buffer, pipe_state, i);
2746
2747 push_constants->desc_surface_offsets[i] =
2748 compute_descriptor_set_surface_offset(cmd_buffer, pipe_state, i);
2749 push_constants->desc_sampler_offsets[i] =
2750 compute_descriptor_set_sampler_offset(cmd_buffer, pipe_state, i);
2751 }
2752
2753 #if GFX_VERx10 < 125
2754 struct anv_device *device = cmd_buffer->device;
2755 push_constants->surfaces_base_offset =
2756 (cmd_buffer->state.descriptor_buffers.surfaces_address -
2757 device->physical->va.dynamic_visible_pool.addr);
2758 #endif
2759
2760 cmd_buffer->state.push_constants_dirty |=
2761 (cmd_buffer->state.descriptor_buffers.offsets_dirty &
2762 pipe_state->pipeline->active_stages);
2763 pipe_state->push_constants_data_dirty = true;
2764 cmd_buffer->state.descriptor_buffers.offsets_dirty &=
2765 ~pipe_state->pipeline->active_stages;
2766 }
2767
2768 cmd_buffer->state.descriptor_buffers.dirty = false;
2769 }
2770
2771 void
2772 genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer,
2773 VkCommandBufferLevel level)
2774 {
2775 cmd_buffer->vk.level = level;
2776 cmd_buffer->is_companion_rcs_cmd_buffer = true;
2777
2778 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
2779
2780 #if GFX_VER >= 12
2781 /* Reenable prefetching at the beginning of secondary command buffers. We
2782 * do this so that the edited return instruction is not prefetched before
2783 * the edit is complete.
2784 */
2785 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2786 anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
2787 arb.PreParserDisableMask = true;
2788 arb.PreParserDisable = false;
2789 }
2790 }
2791 #endif
2792
2793 /* A companion command buffer is currently only used for blorp commands,
2794 * so default to the legacy mode.
2795 */
2796 cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
2797 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
2798
2799 /* Invalidate the aux table in every primary command buffer. This ensures
2800 * the command buffer sees the last updates made by the host.
2801 */
2802 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2803 cmd_buffer->device->info->has_aux_map) {
2804 anv_add_pending_pipe_bits(cmd_buffer,
2805 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2806 "new cmd buffer with aux-tt");
2807 }
2808 }
2809
2810 static bool
2811 aux_op_resolves(enum isl_aux_op aux_op)
2812 {
2813 return aux_op == ISL_AUX_OP_FULL_RESOLVE ||
2814 aux_op == ISL_AUX_OP_PARTIAL_RESOLVE;
2815 }
2816
2817 static bool
2818 aux_op_clears(enum isl_aux_op aux_op)
2819 {
2820 return aux_op == ISL_AUX_OP_FAST_CLEAR ||
2821 aux_op == ISL_AUX_OP_AMBIGUATE;
2822 }
2823
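/* Regular rendering is tracked with ISL_AUX_OP_NONE. */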
2824 static bool
2825 aux_op_renders(enum isl_aux_op aux_op)
2826 {
2827 return aux_op == ISL_AUX_OP_NONE;
2828 }
2829
2830 static void
2831 add_pending_pipe_bits_for_color_aux_op(struct anv_cmd_buffer *cmd_buffer,
2832 enum isl_aux_op next_aux_op,
2833 enum anv_pipe_bits pipe_bits)
2834 {
2835 const enum isl_aux_op last_aux_op = cmd_buffer->state.color_aux_op;
2836 assert(next_aux_op != last_aux_op);
2837
2838 char flush_reason[64] = {};
2839 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) ||
2840 u_trace_enabled(&cmd_buffer->device->ds.trace_context)) {
2841 int ret = snprintf(flush_reason, sizeof(flush_reason),
2842 "color aux-op: %s -> %s",
2843 isl_aux_op_to_name(last_aux_op),
2844 isl_aux_op_to_name(next_aux_op));
2845 assert(ret < sizeof(flush_reason));
2846 }
2847
2848 anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, flush_reason);
2849 }
2850
2851 void
2852 genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
2853 enum isl_aux_op next_aux_op)
2854 {
2855 const enum isl_aux_op last_aux_op = cmd_buffer->state.color_aux_op;
2856
2857 if (!aux_op_clears(last_aux_op) && aux_op_clears(next_aux_op)) {
2858 #if GFX_VER >= 20
2859 /* From the Xe2 Bspec 57340 (r59562),
2860 * "MCS/CCS Buffers, Fast Clear for Render Target(s)":
2861 *
2862 * Synchronization:
2863 * Due to interaction of scaled clearing rectangle with pixel
2864 * scoreboard, we require one of the following commands to be
2865 * issued. [...]
2866 *
2867 * PIPE_CONTROL
2868 * PSS Stall Sync Enable [...] 1b (Enable)
2869 * Machine-wide Stall at Pixel Stage, wait for all Prior Pixel
2870 * Work to Reach End of Pipe
2871 * Render Target Cache Flush Enable [...] 1b (Enable)
2872 * Post-Sync Op Flushes Render Cache before Unblocking Stall
2873 *
2874 * This synchronization step is required before and after the fast
2875 * clear pass, to ensure correct ordering between pixels.
2876 */
2877 add_pending_pipe_bits_for_color_aux_op(
2878 cmd_buffer, next_aux_op,
2879 ANV_PIPE_PSS_STALL_SYNC_BIT |
2880 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2881
2882 #elif GFX_VERx10 == 125
2883 /* From the ACM Bspec 47704 (r52663), "Render Target Fast Clear":
2884 *
2885 * Preamble pre fast clear synchronization
2886 *
2887 * PIPE_CONTROL:
2888 * PS sync stall = 1
2889 * Tile Cache Flush = 1
2890 * RT Write Flush = 1
2891 * HDC Flush = 1
2892 * DC Flush = 1
2893 * Texture Invalidate = 1
2894 *
2895 * [...]
2896 *
2897 * Objective of the preamble flushes is to ensure all data is
2898 * evicted from L1 caches prior to fast clear.
2899 *
2900 * From the ACM PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
2901 *
2902 * Any transition from any value in {Clear, Render, Resolve} to a
2903 * different value in {Clear, Render, Resolve} requires end of pipe
2904 * synchronization.
2905 */
2906 add_pending_pipe_bits_for_color_aux_op(
2907 cmd_buffer, next_aux_op,
2908 ANV_PIPE_PSS_STALL_SYNC_BIT |
2909 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2910 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2911 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
2912 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
2913 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
2914 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2915
2916 #elif GFX_VERx10 == 120
2917 /* From the TGL Bspec 47704 (r52663), "Render Target Fast Clear":
2918 *
2919 * Preamble pre fast clear synchronization
2920 *
2921 * PIPE_CONTROL:
2922 * Depth Stall = 1
2923 * Tile Cache Flush = 1
2924 * RT Write Flush = 1
2925 * Texture Invalidate = 1
2926 *
2927 * [...]
2928 *
2929 * Objective of the preamble flushes is to ensure all data is
2930 * evicted from L1 caches prior to fast clear.
2931 *
2932 * From the TGL PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
2933 *
2934 * Any transition from any value in {Clear, Render, Resolve} to a
2935 * different value in {Clear, Render, Resolve} requires end of pipe
2936 * synchronization.
2937 */
2938 add_pending_pipe_bits_for_color_aux_op(
2939 cmd_buffer, next_aux_op,
2940 ANV_PIPE_DEPTH_STALL_BIT |
2941 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2942 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2943 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
2944 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2945
2946 #else
2947 /* From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
2948 *
2949 * Any transition from any value in {Clear, Render, Resolve} to a
2950 * different value in {Clear, Render, Resolve} requires end of pipe
2951 * synchronization.
2952 *
2953 * From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
2954 *
2955 * After Render target fast clear, pipe-control with color cache
2956 * write-flush must be issued before sending any DRAW commands on
2957 * that render target.
2958 *
2959 * The last comment is a bit cryptic and doesn't really tell you what's
2960 * going on or what's really needed. It appears that fast clear ops are
2961 * not properly synchronized with other drawing. This means that we
2962 * cannot have a fast clear operation in the pipe at the same time as
2963 * other regular drawing operations. We need to use a PIPE_CONTROL
2964 * to ensure that the contents of the previous draw hit the render
2965 * target before we resolve and then use a second PIPE_CONTROL after
2966 * the resolve to ensure that it is completed before any additional
2967 * drawing occurs.
2968 */
2969 add_pending_pipe_bits_for_color_aux_op(
2970 cmd_buffer, next_aux_op,
2971 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2972 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2973 #endif
2974
2975 } else if (aux_op_clears(last_aux_op) && !aux_op_clears(next_aux_op)) {
2976 #if GFX_VER >= 20
2977 /* From the Xe2 Bspec 57340 (r59562),
2978 * "MCS/CCS Buffers, Fast Clear for Render Target(s)":
2979 *
2980 * Synchronization:
2981 * Due to interaction of scaled clearing rectangle with pixel
2982 * scoreboard, we require one of the following commands to be
2983 * issued. [...]
2984 *
2985 * PIPE_CONTROL
2986 * PSS Stall Sync Enable [...] 1b (Enable)
2987 * Machine-wide Stall at Pixel Stage, wait for all Prior Pixel
2988 * Work to Reach End of Pipe
2989 * Render Target Cache Flush Enable [...] 1b (Enable)
2990 * Post-Sync Op Flushes Render Cache before Unblocking Stall
2991 *
2992 * This synchronization step is required before and after the fast
2993 * clear pass, to ensure correct ordering between pixels.
2994 */
2995 add_pending_pipe_bits_for_color_aux_op(
2996 cmd_buffer, next_aux_op,
2997 ANV_PIPE_PSS_STALL_SYNC_BIT |
2998 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2999
3000 #elif GFX_VERx10 == 125
3001 /* From the ACM PRM Vol. 9, "Color Fast Clear Synchronization":
3002 *
3003 * Postamble post fast clear synchronization
3004 *
3005 * PIPE_CONTROL:
3006 * PS sync stall = 1
3007 * RT flush = 1
3008 *
3009 * From the ACM PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
3010 *
3011 * Any transition from any value in {Clear, Render, Resolve} to a
3012 * different value in {Clear, Render, Resolve} requires end of pipe
3013 * synchronization.
3014 */
3015 add_pending_pipe_bits_for_color_aux_op(
3016 cmd_buffer, next_aux_op,
3017 ANV_PIPE_PSS_STALL_SYNC_BIT |
3018 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3019 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3020
3021 #elif GFX_VERx10 == 120
3022 /* From the TGL PRM Vol. 9, "Color Fast Clear Synchronization":
3023 *
3024 * Postamble post fast clear synchronization
3025 *
3026 * PIPE_CONTROL:
3027 * Depth Stall = 1
3028 * Tile Cache Flush = 1
3029 * RT Write Flush = 1
3030 *
3031 * From the TGL PRM Vol. 9, "MCS/CCS Buffers for Render Target(s)":
3032 *
3033 * Any transition from any value in {Clear, Render, Resolve} to a
3034 * different value in {Clear, Render, Resolve} requires end of pipe
3035 * synchronization.
3036 *
3037 */
3038 add_pending_pipe_bits_for_color_aux_op(
3039 cmd_buffer, next_aux_op,
3040 ANV_PIPE_DEPTH_STALL_BIT |
3041 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
3042 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3043 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3044
3045 #else
3046 /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
3047 *
3048 * After Render target fast clear, pipe-control with color cache
3049 * write-flush must be issued before sending any DRAW commands on
3050 * that render target.
3051 *
3052 * From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
3053 *
3054 * Any transition from any value in {Clear, Render, Resolve} to a
3055 * different value in {Clear, Render, Resolve} requires end of pipe
3056 * synchronization.
3057 */
3058 add_pending_pipe_bits_for_color_aux_op(
3059 cmd_buffer, next_aux_op,
3060 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3061 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3062 #endif
3063
3064 } else if (aux_op_renders(last_aux_op) != aux_op_renders(next_aux_op)) {
3065 assert(aux_op_resolves(last_aux_op) != aux_op_resolves(next_aux_op));
3066 /* From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
3067 *
3068 * Any transition from any value in {Clear, Render, Resolve} to a
3069 * different value in {Clear, Render, Resolve} requires end of pipe
3070 * synchronization.
3071 *
3072 * We perform a flush of the write cache before and after the clear and
3073 * resolve operations to meet this requirement.
3074 *
3075 * Unlike other drawing, fast clear operations are not properly
3076 * synchronized. The first PIPE_CONTROL here likely ensures that the
3077 * contents of the previous render or clear hit the render target before
3078 * we resolve and the second likely ensures that the resolve is complete
3079 * before we do any more rendering or clearing.
3080 */
3081 add_pending_pipe_bits_for_color_aux_op(
3082 cmd_buffer, next_aux_op,
3083 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3084 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3085 }
3086
3087 if (last_aux_op != ISL_AUX_OP_FAST_CLEAR &&
3088 next_aux_op == ISL_AUX_OP_FAST_CLEAR &&
3089 cmd_buffer->device->isl_dev.ss.clear_color_state_size > 0) {
3090 /* From the ICL PRM Vol. 9, "State Caching":
3091 *
3092 * Any values referenced by pointers within the RENDER_SURFACE_STATE
3093 * [...] (e.g. Clear Color Pointer, [...]) are considered to be part
3094 * of that state and any changes to these referenced values requires
3095 * an invalidation of the L1 state cache to ensure the new values are
3096 * being used as part of the state. [...]
3097 *
3098 * We could alternatively perform this invalidation when we stop
3099 * fast-clearing. A benefit to doing it now, when transitioning to a
3100 * fast clear, is that we save a pipe control by combining the state
3101 * cache invalidation with the texture cache invalidation done on gfx12.
3102 */
3103 anv_add_pending_pipe_bits(cmd_buffer,
3104 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
3105 "Invalidate for new clear color");
3106 }
3107
3108 /* Update the auxiliary surface operation, but with one exception. */
3109 if (last_aux_op == ISL_AUX_OP_FAST_CLEAR &&
3110 next_aux_op == ISL_AUX_OP_AMBIGUATE) {
3111 assert(aux_op_clears(last_aux_op) && aux_op_clears(next_aux_op));
3112 /* Fast clears and ambiguates are in the same class of operation, but
3113 * fast clears have more stringent synchronization requirements. For
3114 * better performance, don't replace the current fast clear operation
3115 * state with ambiguate. This allows us to perform one state cache
3116 * invalidation when leaving a sequence which alternates between
3117 * ambiguates and clears, instead of multiple such invalidations.
3118 */
3119 } else {
3120 cmd_buffer->state.color_aux_op = next_aux_op;
3121 }
3122 }
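
/* Illustrative sketch of how this state machine is typically driven
* (assuming a fast-clear path; the actual call sites live elsewhere in the
* driver, but end_command_buffer() and genX(CmdExecuteCommands) below show
* the transition back to regular rendering):
*
*    // Before emitting a fast clear:
*    genX(cmd_buffer_update_color_aux_op)(cmd_buffer, ISL_AUX_OP_FAST_CLEAR);
*    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
*    ... emit the clear ...
*
*    // Before going back to regular drawing:
*    genX(cmd_buffer_update_color_aux_op)(cmd_buffer, ISL_AUX_OP_NONE);
*    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
*
* The update itself only accumulates pending pipe bits; the PIPE_CONTROLs
* are emitted by the next genX(cmd_buffer_apply_pipe_flushes)() call.
*/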
3123
3124 static void
3125 genX(cmd_buffer_set_protected_memory)(struct anv_cmd_buffer *cmd_buffer,
3126 bool enabled)
3127 {
3128 #if GFX_VER >= 12
3129 if (enabled) {
3130 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
3131 /* Default value for single session. */
3132 appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
3133 appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
3134 }
3135 }
3136 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3137 pc.PipeControlFlushEnable = true;
3138 pc.DCFlushEnable = true;
3139 pc.RenderTargetCacheFlushEnable = true;
3140 pc.CommandStreamerStallEnable = true;
3141 if (enabled)
3142 pc.ProtectedMemoryEnable = true;
3143 else
3144 pc.ProtectedMemoryDisable = true;
3145 }
3146 #else
3147 unreachable("Protected content not supported");
3148 #endif
3149 }
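
/* A minimal usage sketch: for command pools created with
* VK_COMMAND_POOL_CREATE_PROTECTED_BIT, protected mode is turned on once at
* the start of recording and off again at the end (see
* genX(BeginCommandBuffer) and end_command_buffer() below):
*
*    if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
*       genX(cmd_buffer_set_protected_memory)(cmd_buffer, true);
*    ... record protected work ...
*    if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
*       genX(cmd_buffer_set_protected_memory)(cmd_buffer, false);
*
* genX(CmdExecuteCommands) also toggles it off and back on around its
* surface-state memcpy so that the copy itself runs unprotected.
*/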
3150
3151 VkResult
3152 genX(BeginCommandBuffer)(
3153 VkCommandBuffer commandBuffer,
3154 const VkCommandBufferBeginInfo* pBeginInfo)
3155 {
3156 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3157 VkResult result;
3158
3159 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
3160 * command buffer's state. Otherwise, we must *reset* its state. In both
3161 * cases we reset it.
3162 *
3163 * From the Vulkan 1.0 spec:
3164 *
3165 * If a command buffer is in the executable state and the command buffer
3166 * was allocated from a command pool with the
3167 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
3168 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
3169 * as if vkResetCommandBuffer had been called with
3170 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
3171 * the command buffer in the recording state.
3172 */
3173 anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
3174 anv_cmd_buffer_reset_rendering(cmd_buffer);
3175
3176 cmd_buffer->usage_flags = pBeginInfo->flags;
3177
3178 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3179 * primary level command buffers.
3180 *
3181 * From the Vulkan 1.0 spec:
3182 *
3183 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3184 * secondary command buffer is considered to be entirely inside a render
3185 * pass. If this is a primary command buffer, then this bit is ignored.
3186 */
3187 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
3188 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3189
3190 #if GFX_VER >= 12
3191 /* Reenable prefetching at the beginning of secondary command buffers. We
3192 * do this so that the edited return instruction is not prefetched before
3193 * the edit has completed.
3194 */
3195 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3196 anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
3197 arb.PreParserDisableMask = true;
3198 arb.PreParserDisable = false;
3199 }
3200 }
3201 #endif
3202
3203 /* Assume the viewport has already been set in primary command buffers. */
3204 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
3205 cmd_buffer->state.gfx.viewport_set = true;
3206
3207 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
3208
3209 if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
3210 anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3211 /* Invalidate the aux table in every primary command buffer. This
3212 * ensures the command buffer sees the last updates made by the host.
3213 */
3214 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3215 cmd_buffer->device->info->has_aux_map) {
3216 anv_add_pending_pipe_bits(cmd_buffer,
3217 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
3218 "new cmd buffer with aux-tt");
3219 }
3220 return VK_SUCCESS;
3221 }
3222
3223 #if GFX_VER >= 12
3224 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3225 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3226 genX(cmd_buffer_set_protected_memory)(cmd_buffer, true);
3227 #endif
3228
3229 if (cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer) {
3230 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
3231 } else {
3232 cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
3233 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
3234 }
3235
3236 /* We sometimes store vertex data in the dynamic state buffer for blorp
3237 * operations and our dynamic state stream may re-use data from previous
3238 * command buffers. In order to prevent stale cache data, we flush the VF
3239 * cache. We could do this on every blorp call but that's not really
3240 * needed as all of the data will get written by the CPU prior to the GPU
3241 * executing anything. The chances are fairly high that they will use
3242 * blorp at least once per primary command buffer so it shouldn't be
3243 * wasted.
3244 *
3245 * There is also a workaround on gfx8 which requires us to invalidate the
3246 * VF cache occasionally. It's easier if we can assume we start with a
3247 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
3248 */
3249 anv_add_pending_pipe_bits(cmd_buffer,
3250 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
3251 "new cmd buffer");
3252
3253 /* Invalidate the aux table in every primary command buffer. This ensures
3254 * the command buffer sees the last updates made by the host.
3255 */
3256 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3257 cmd_buffer->device->info->has_aux_map) {
3258 anv_add_pending_pipe_bits(cmd_buffer,
3259 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
3260 "new cmd buffer with aux-tt");
3261 }
3262
3263 /* We send an "Indirect State Pointers Disable" packet at
3264 * EndCommandBuffer, so all push constant packets are ignored during a
3265 * context restore. Documentation says after that command, we need to
3266 * emit push constants again before any rendering operation. So we
3267 * flag them dirty here to make sure they get emitted.
3268 */
3269 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
3270 cmd_buffer->state.gfx.base.push_constants_data_dirty = true;
3271
3272 if (cmd_buffer->usage_flags &
3273 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3274 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3275
3276 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
3277 const VkRenderingInfo *resume_info =
3278 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
3279 pBeginInfo,
3280 gcbiar_data);
3281 if (resume_info != NULL) {
3282 genX(CmdBeginRendering)(commandBuffer, resume_info);
3283 } else {
3284 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
3285 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
3286 pBeginInfo);
3287 assert(inheritance_info);
3288
3289 gfx->rendering_flags = inheritance_info->flags;
3290 gfx->render_area = (VkRect2D) { };
3291 gfx->layer_count = 0;
3292 gfx->samples = inheritance_info->rasterizationSamples;
3293 gfx->view_mask = inheritance_info->viewMask;
3294
3295 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
3296 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
3297 if (result != VK_SUCCESS)
3298 return result;
3299
3300 for (uint32_t i = 0; i < color_att_count; i++) {
3301 gfx->color_att[i].vk_format =
3302 inheritance_info->pColorAttachmentFormats[i];
3303 }
3304 gfx->depth_att.vk_format =
3305 inheritance_info->depthAttachmentFormat;
3306 gfx->stencil_att.vk_format =
3307 inheritance_info->stencilAttachmentFormat;
3308
3309 anv_cmd_graphic_state_update_has_uint_rt(gfx);
3310
3311 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
3312 ANV_CMD_DIRTY_RENDER_TARGETS;
3313 }
3314 }
3315
3316 /* Emit the sample pattern at the beginning of the batch because the
3317 * default locations emitted at the device initialization might have been
3318 * changed by a previous command buffer.
3319 *
3320 * Do not change that when we're continuing a previous renderpass.
3321 */
3322 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
3323 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
3324 genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
3325
3326 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3327 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
3328 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
3329
3330 /* If the secondary buffer supports conditional rendering,
3331 * we should emit commands as if conditional rendering were enabled.
3332 */
3333 cmd_buffer->state.conditional_render_enabled =
3334 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
3335
3336 if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
3337 cmd_buffer->state.gfx.n_occlusion_queries = 1;
3338 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
3339 }
3340 }
3341
3342 return VK_SUCCESS;
3343 }
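
/* For reference, the RENDER_PASS_CONTINUE_BIT path above is what a
* secondary command buffer recorded with dynamic-rendering inheritance
* hits. A hedged application-side sketch (formats and flags are arbitrary
* examples, not driver requirements):
*
*    VkCommandBufferInheritanceRenderingInfo inheritance_rendering = {
*       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDERING_INFO,
*       .colorAttachmentCount = 1,
*       .pColorAttachmentFormats = &(VkFormat){ VK_FORMAT_B8G8R8A8_UNORM },
*       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
*    };
*    VkCommandBufferInheritanceInfo inheritance = {
*       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
*       .pNext = &inheritance_rendering,
*    };
*    VkCommandBufferBeginInfo begin = {
*       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
*       .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
*       .pInheritanceInfo = &inheritance,
*    };
*    vkBeginCommandBuffer(secondary, &begin);
*
* With no VkRenderingInfo to resume, the inheritance info is used to seed
* gfx->samples, gfx->view_mask and the attachment formats above.
*/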
3344
3345 /* From the PRM, Volume 2a:
3346 *
3347 * "Indirect State Pointers Disable
3348 *
3349 * At the completion of the post-sync operation associated with this pipe
3350 * control packet, the indirect state pointers in the hardware are
3351 * considered invalid; the indirect pointers are not saved in the context.
3352 * If any new indirect state commands are executed in the command stream
3353 * while the pipe control is pending, the new indirect state commands are
3354 * preserved.
3355 *
3356 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
3357 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
3358 * commands are only considered as Indirect State Pointers. Once ISP is
3359 * issued in a context, SW must initialize by programming push constant
3360 * commands for all the shaders (at least to zero length) before attempting
3361 * any rendering operation for the same context."
3362 *
3363 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
3364 * even though they point to a BO that has been already unreferenced at
3365 * the end of the previous batch buffer. This has been fine so far since
3366 * we are protected by the scratch page (every address not covered by
3367 * a BO should be pointing to the scratch page). But on CNL, it is
3368 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
3369 * instruction.
3370 *
3371 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
3372 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
3373 * context restore, so the mentioned hang doesn't happen. However,
3374 * software must program push constant commands for all stages prior to
3375 * rendering anything. So we flag them dirty in BeginCommandBuffer.
3376 *
3377 * Finally, we also make sure to stall at the pixel scoreboard so that the
3378 * constants have been loaded into the EUs before the push constants are
3379 * disabled, which would otherwise hang a previous 3DPRIMITIVE.
3380 */
3381 static void
3382 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
3383 {
3384 genx_batch_emit_pipe_control(&cmd_buffer->batch,
3385 cmd_buffer->device->info,
3386 cmd_buffer->state.current_pipeline,
3387 ANV_PIPE_CS_STALL_BIT |
3388 ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
3389 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3390 pc.IndirectStatePointersDisable = true;
3391 pc.CommandStreamerStallEnable = true;
3392 anv_debug_dump_pc(pc, __func__);
3393 }
3394 }
3395
3396 static VkResult
3397 end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
3398 {
3399 if (anv_batch_has_error(&cmd_buffer->batch))
3400 return cmd_buffer->batch.status;
3401
3402 anv_measure_endcommandbuffer(cmd_buffer);
3403
3404 if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
3405 anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3406 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
3407 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3408 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
3409 return VK_SUCCESS;
3410 }
3411
3412 /* Flush query clears using blorp so that secondary query writes do not
3413 * race with the clear.
3414 */
3415 if (cmd_buffer->state.queries.clear_bits) {
3416 anv_add_pending_pipe_bits(cmd_buffer,
3417 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
3418 "query clear flush prior command buffer end");
3419 }
3420
3421 /* Flush any in-progress CCS/MCS operations in preparation for chaining. */
3422 genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
3423
3424 genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
3425
3426 /* Turn on object level preemption if it is disabled to have it in known
3427 * state at the beginning of new command buffer.
3428 */
3429 if (!cmd_buffer->state.gfx.object_preemption)
3430 genX(cmd_buffer_set_preemption)(cmd_buffer, true);
3431
3432 /* We want every command buffer to start with the PMA fix in a known state,
3433 * so we disable it at the end of the command buffer.
3434 */
3435 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
3436
3437 /* Wa_14015814527
3438 *
3439 * Apply task URB workaround in the end of primary or secondary cmd_buffer.
3440 */
3441 genX(apply_task_urb_workaround)(cmd_buffer);
3442
3443 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3444
3445 emit_isp_disable(cmd_buffer);
3446
3447 #if GFX_VER >= 12
3448 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3449 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3450 genX(cmd_buffer_set_protected_memory)(cmd_buffer, false);
3451 #endif
3452
3453 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
3454
3455 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
3456
3457 return VK_SUCCESS;
3458 }
3459
3460 VkResult
3461 genX(EndCommandBuffer)(
3462 VkCommandBuffer commandBuffer)
3463 {
3464 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3465
3466 VkResult status = end_command_buffer(cmd_buffer);
3467 if (status != VK_SUCCESS)
3468 return status;
3469
3470 /* If there is MSAA access over the compute/transfer queue, we can use the
3471 * companion RCS command buffer and end it properly.
3472 */
3473 if (cmd_buffer->companion_rcs_cmd_buffer) {
3474 assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
3475 anv_cmd_buffer_is_blitter_queue(cmd_buffer));
3476 status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
3477 }
3478
3479 ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
3480
3481 return status;
3482 }
3483
3484 void
3485 genX(CmdExecuteCommands)(
3486 VkCommandBuffer commandBuffer,
3487 uint32_t commandBufferCount,
3488 const VkCommandBuffer* pCmdBuffers)
3489 {
3490 ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
3491
3492 struct anv_device *device = container->device;
3493
3494 if (anv_batch_has_error(&container->batch))
3495 return;
3496
3497 /* The secondary command buffers will assume that the PMA fix is disabled
3498 * when they begin executing. Make sure this is true.
3499 */
3500 genX(cmd_buffer_enable_pma_fix)(container, false);
3501
3502 /* Turn on preemption in case it was toggled off. */
3503 if (!container->state.gfx.object_preemption)
3504 genX(cmd_buffer_set_preemption)(container, true);
3505
3506 /* Wa_14015814527
3507 *
3508 * Apply task URB workaround before secondary cmd buffers.
3509 */
3510 genX(apply_task_urb_workaround)(container);
3511
3512 /* Flush query clears using blorp so that secondary query writes do not
3513 * race with the clear.
3514 */
3515 if (container->state.queries.clear_bits) {
3516 anv_add_pending_pipe_bits(container,
3517 ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
3518 "query clear flush prior to secondary buffer");
3519 }
3520
3521 /* Ensure we're in a regular drawing cache mode (an assumption made for
3522 * all secondaries).
3523 */
3524 genX(cmd_buffer_update_color_aux_op(container, ISL_AUX_OP_NONE));
3525
3526 /* The secondary command buffer doesn't know which textures etc. have been
3527 * flushed prior to its execution. Apply those flushes now.
3528 */
3529 genX(cmd_buffer_apply_pipe_flushes)(container);
3530
3531 genX(cmd_buffer_flush_generated_draws)(container);
3532
3533 UNUSED enum anv_cmd_descriptor_buffer_mode db_mode =
3534 container->state.current_db_mode;
3535
3536 /* Do a first pass to copy the surface state content of the render targets
3537 * if needed.
3538 */
3539 bool need_surface_state_copy = false;
3540 for (uint32_t i = 0; i < commandBufferCount; i++) {
3541 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3542
3543 if (secondary->usage_flags &
3544 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3545 need_surface_state_copy = true;
3546 break;
3547 }
3548 }
3549
3550 if (need_surface_state_copy) {
3551 if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3552 genX(cmd_buffer_set_protected_memory)(container, false);
3553
3554 /* The memcpy will take care of the 3D preemption requirements. */
3555 struct anv_memcpy_state memcpy_state;
3556 genX(emit_so_memcpy_init)(&memcpy_state, device,
3557 container, &container->batch);
3558
3559 for (uint32_t i = 0; i < commandBufferCount; i++) {
3560 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3561
3562 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
3563 assert(!anv_batch_has_error(&secondary->batch));
3564
3565 if (secondary->usage_flags &
3566 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3567 /* If we're continuing a render pass from the container, we need
3568 * to copy the surface states for the current subpass into the
3569 * storage we allocated for them in BeginCommandBuffer.
3570 */
3571 struct anv_state src_state = container->state.gfx.att_states;
3572 struct anv_state dst_state = secondary->state.gfx.att_states;
3573 assert(src_state.alloc_size == dst_state.alloc_size);
3574
3575 genX(emit_so_memcpy)(
3576 &memcpy_state,
3577 anv_state_pool_state_address(&device->internal_surface_state_pool,
3578 dst_state),
3579 anv_state_pool_state_address(&device->internal_surface_state_pool,
3580 src_state),
3581 src_state.alloc_size);
3582 }
3583 }
3584 genX(emit_so_memcpy_fini)(&memcpy_state);
3585
3586 anv_add_pending_pipe_bits(container,
3587 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
3588 "Wait for primary->secondary RP surface state copies");
3589 genX(cmd_buffer_apply_pipe_flushes)(container);
3590
3591 if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3592 genX(cmd_buffer_set_protected_memory)(container, true);
3593 }
3594
3595 /* Ensure preemption is enabled (an assumption made for all secondaries) */
3596 genX(cmd_buffer_set_preemption)(container, true);
3597
3598 for (uint32_t i = 0; i < commandBufferCount; i++) {
3599 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3600
3601 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
3602 assert(!anv_batch_has_error(&secondary->batch));
3603
3604 if (secondary->state.conditional_render_enabled) {
3605 if (!container->state.conditional_render_enabled) {
3606 /* Secondary buffer is constructed as if it will be executed
3607 * with conditional rendering, we should satisfy this dependency
3608 * regardless of conditional rendering being enabled in container.
3609 */
3610 struct mi_builder b;
3611 mi_builder_init(&b, device->info, &container->batch);
3612 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
3613 mi_imm(UINT64_MAX));
3614 }
3615 }
3616
3617 anv_cmd_buffer_add_secondary(container, secondary);
3618
3619 /* Add secondary buffer's RCS command buffer to container buffer's RCS
3620 * command buffer for execution if secondary RCS is valid.
3621 */
3622 if (secondary->companion_rcs_cmd_buffer != NULL) {
3623 VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
3624 if (result != VK_SUCCESS) {
3625 anv_batch_set_error(&container->batch, result);
3626 return;
3627 }
3628
3629 anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
3630 secondary->companion_rcs_cmd_buffer);
3631 }
3632
3633 assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
3634 secondary->perf_query_pool == container->perf_query_pool);
3635 if (secondary->perf_query_pool)
3636 container->perf_query_pool = secondary->perf_query_pool;
3637
3638 #if INTEL_NEEDS_WA_1808121037
3639 if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
3640 container->state.depth_reg_mode = secondary->state.depth_reg_mode;
3641 #endif
3642
3643 container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
3644
3645 db_mode = secondary->state.current_db_mode;
3646 }
3647
3648 /* The secondary isn't counted in our VF cache tracking so we need to
3649 * invalidate the whole thing.
3650 */
3651 if (GFX_VER == 9) {
3652 anv_add_pending_pipe_bits(container,
3653 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
3654 "Secondary cmd buffer not tracked in VF cache");
3655 }
3656
3657 #if INTEL_WA_16014538804_GFX_VER
3658 if (anv_cmd_buffer_is_render_queue(container) &&
3659 intel_needs_workaround(device->info, 16014538804))
3660 anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
3661 #endif
3662
3663 /* The secondary may have selected a different pipeline (3D or compute) and
3664 * may have changed the current L3$ configuration. Reset our tracking
3665 * variables to invalid values to ensure that we re-emit these in the case
3666 * where we do any draws or compute dispatches from the container after the
3667 * secondary has returned.
3668 */
3669 container->state.current_pipeline = UINT32_MAX;
3670 container->state.current_l3_config = NULL;
3671 container->state.current_hash_scale = 0;
3672 container->state.gfx.push_constant_stages = 0;
3673
3674 memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
3675
3676 /* Reemit all GFX instructions in container */
3677 memcpy(container->state.gfx.dyn_state.dirty,
3678 device->gfx_dirty_state,
3679 sizeof(container->state.gfx.dyn_state.dirty));
3680 if (container->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
3681 /* Also recompute the CPS_STATE offset */
3682 struct vk_dynamic_graphics_state *dyn =
3683 &container->vk.dynamic_graphics_state;
3684 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
3685 }
3686
3687 /* Each of the secondary command buffers will use its own state base
3688 * address. We need to re-emit state base address for the container after
3689 * all of the secondaries are done.
3690 */
3691 if (container->device->vk.enabled_extensions.EXT_descriptor_buffer) {
3692 #if GFX_VERx10 >= 125
3693 /* If the last secondary had a different mode, reemit the last pending
3694 * mode. Otherwise, we can do a lighter binding table pool update.
3695 */
3696 if (db_mode != container->state.current_db_mode) {
3697 container->state.current_db_mode = db_mode;
3698 genX(cmd_buffer_emit_state_base_address)(container);
3699 } else {
3700 genX(cmd_buffer_emit_bt_pool_base_address)(container);
3701 }
3702 #else
3703 genX(cmd_buffer_emit_state_base_address)(container);
3704 #endif
3705 } else {
3706 genX(cmd_buffer_emit_bt_pool_base_address)(container);
3707 }
3708
3709 /* Copy of utrace timestamp buffers from secondary into container */
3710 if (u_trace_enabled(&device->ds.trace_context)) {
3711 trace_intel_begin_trace_copy(&container->trace);
3712
3713 struct anv_memcpy_state memcpy_state;
3714 genX(emit_so_memcpy_init)(&memcpy_state, device,
3715 container, &container->batch);
3716 uint32_t num_traces = 0;
3717 for (uint32_t i = 0; i < commandBufferCount; i++) {
3718 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3719
3720 num_traces += secondary->trace.num_traces;
3721 u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
3722 u_trace_end_iterator(&secondary->trace),
3723 &container->trace,
3724 &memcpy_state,
3725 anv_device_utrace_emit_gfx_copy_buffer);
3726 }
3727 genX(emit_so_memcpy_fini)(&memcpy_state);
3728
3729 trace_intel_end_trace_copy(&container->trace, num_traces);
3730
3731 /* Memcpy is done using the 3D pipeline. */
3732 container->state.current_pipeline = _3D;
3733 }
3734 }
3735
3736 static inline enum anv_pipe_bits
3737 anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3738 VkAccessFlags2 flags)
3739 {
3740 enum anv_pipe_bits pipe_bits = 0;
3741
3742 u_foreach_bit64(b, flags) {
3743 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3744 case VK_ACCESS_2_SHADER_WRITE_BIT:
3745 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
3746 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3747 /* We're transitioning a buffer that was previously used as write
3748 * destination through the data port. To make its content available
3749 * to future operations, flush the hdc pipeline.
3750 */
3751 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3752 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3753 break;
3754 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
3755 /* We're transitioning a buffer that was previously used as render
3756 * target. To make its content available to future operations, flush
3757 * the render target cache.
3758 */
3759 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3760 break;
3761 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3762 /* We're transitioning a buffer that was previously used as depth
3763 * buffer. To make its content available to future operations, flush
3764 * the depth cache.
3765 */
3766 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3767 break;
3768 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
3769 /* We're transitioning a buffer that was previously used as a
3770 * transfer write destination. Generic write operations include color
3771 * & depth operations as well as buffer operations like:
3772 * - vkCmdClearColorImage()
3773 * - vkCmdClearDepthStencilImage()
3774 * - vkCmdBlitImage()
3775 * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
3776 *
3777 * Most of these operations are implemented using Blorp which writes
3778 * through the render target cache or the depth cache on the graphics
3779 * queue. On the compute queue, the writes are done through the data
3780 * port.
3781 */
3782 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
3783 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3784 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3785 } else {
3786 /* We can use the data port when trying to stay in compute mode on
3787 * the RCS.
3788 */
3789 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3790 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3791 /* Most operations are done through RT/depth writes */
3792 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3793 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3794 }
3795 break;
3796 case VK_ACCESS_2_MEMORY_WRITE_BIT:
3797 /* We're transitioning a buffer for generic write operations. Flush
3798 * all the caches.
3799 */
3800 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3801 break;
3802 case VK_ACCESS_2_HOST_WRITE_BIT:
3803 /* We're transitioning a buffer for access by the CPU. Invalidate
3804 * all the caches. Since data and tile caches don't have invalidate,
3805 * we are forced to flush those as well.
3806 */
3807 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3808 pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3809 break;
3810 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3811 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3812 /* We're transitioning a buffer written either from VS stage or from
3813 * the command streamer (see CmdEndTransformFeedbackEXT), we just
3814 * need to stall the CS.
3815 *
3816 * Streamout writes apparently bypass L3, so in order to make them
3817 * visible to the destination, we need to invalidate the other
3818 * caches.
3819 */
3820 pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
3821 break;
3822 default:
3823 break; /* Nothing to do */
3824 }
3825 }
3826
3827 return pipe_bits;
3828 }
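
/* A worked example, assuming a barrier on the render queue with
* srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
*                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
*
*    ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |     // color attachment writes
*    ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |            // dataport (storage) writes
*    ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT
*
* while VK_ACCESS_2_MEMORY_WRITE_BIT alone collapses to ANV_PIPE_FLUSH_BITS
* (flush everything).
*/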
3829
3830 static inline enum anv_pipe_bits
3831 anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3832 VkAccessFlags2 flags)
3833 {
3834 struct anv_device *device = cmd_buffer->device;
3835 enum anv_pipe_bits pipe_bits = 0;
3836
3837 u_foreach_bit64(b, flags) {
3838 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3839 case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
3840 /* Indirect draw commands take a buffer as input that we're going to
3841 * read from the command streamer to load some of the HW registers
3842 * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
3843 * command streamer stall so that all the cache flushes have
3844 * completed before the command streamer loads from memory.
3845 */
3846 pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3847 if (device->info->ver == 9) {
3848 /* Indirect draw commands on Gfx9 also set gl_BaseVertex &
3849 * gl_BaseIndex through a vertex buffer, so invalidate that cache.
3850 */
3851 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3852 }
3853 /* For CmdDispatchIndirect, we also load gl_NumWorkGroups through a
3854 * UBO from the buffer, so we need to invalidate the constant cache.
3855 */
3856 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3857 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3858 /* A tile cache flush is needed for CmdDispatchIndirect since the command
3859 * streamer and vertex fetch aren't L3 coherent.
3860 */
3861 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3862 break;
3863 case VK_ACCESS_2_INDEX_READ_BIT:
3864 case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
3865 /* We're transitioning a buffer to be used as input for vkCmdDraw*
3866 * commands, so we invalidate the VF cache to make sure there is no
3867 * stale data when we start rendering.
3868 */
3869 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3870 break;
3871 case VK_ACCESS_2_UNIFORM_READ_BIT:
3872 case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
3873 /* We're transitioning a buffer to be used as uniform data. Because
3874 * uniform data is accessed through the data port & sampler, we need to
3875 * invalidate the texture cache (sampler) & constant cache (data
3876 * port) to avoid stale data.
3877 */
3878 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3879 if (device->physical->compiler->indirect_ubos_use_sampler) {
3880 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3881 } else {
3882 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3883 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3884 }
3885 break;
3886 case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
3887 case VK_ACCESS_2_TRANSFER_READ_BIT:
3888 case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
3889 /* Transitioning a buffer to be read through the sampler, so
3890 * invalidate the texture cache; we don't want any stale data.
3891 */
3892 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3893 break;
3894 case VK_ACCESS_2_SHADER_READ_BIT:
3895 /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
3896 * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
3897 */
3898 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3899 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3900 if (!device->physical->compiler->indirect_ubos_use_sampler) {
3901 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3902 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3903 }
3904 break;
3905 case VK_ACCESS_2_MEMORY_READ_BIT:
3906 /* Transitioning a buffer for generic read, invalidate all the
3907 * caches.
3908 */
3909 pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3910 break;
3911 case VK_ACCESS_2_MEMORY_WRITE_BIT:
3912 /* Generic write, make sure all previously written things land in
3913 * memory.
3914 */
3915 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3916 break;
3917 case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
3918 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
3919 /* Transitioning a buffer for conditional rendering or transform
3920 * feedback. We'll load the content of this buffer into HW registers
3921 * using the command streamer, so we need to stall the command
3922 * streamer to make sure any in-flight flush operations have
3923 * completed.
3924 */
3925 pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3926 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3927 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3928 break;
3929 case VK_ACCESS_2_HOST_READ_BIT:
3930 /* We're transitioning a buffer that was written by the CPU. Flush
3931 * all the caches.
3932 */
3933 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3934 break;
3935 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3936 /* We're transitioning a buffer to be written by the streamout fixed
3937 * function. This one is apparently not L3 coherent, so we need a
3938 * tile cache flush to make sure any previous write is not going to
3939 * create WaW hazards.
3940 */
3941 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3942 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3943 break;
3944 case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
3945 /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
3946 * storage buffer, physical storage buffer, storage texel buffer, or
3947 * storage image in any shader pipeline stage.
3948 *
3949 * Any storage buffers or images written to must be invalidated and
3950 * flushed before the shader can access them.
3951 *
3952 * Both HDC & Untyped flushes also do invalidation. This is why we
3953 * use this here on Gfx12+.
3954 *
3955 * Gfx11 and prior don't have HDC. Only Data cache flush is available
3956 * and it only operates on the written cache lines.
3957 */
3958 if (device->info->ver >= 12) {
3959 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3960 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3961 }
3962 break;
3963 case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
3964 pipe_bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
3965 break;
3966 default:
3967 break; /* Nothing to do */
3968 }
3969 }
3970
3971 return pipe_bits;
3972 }
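
/* A worked example: dstAccessMask = VK_ACCESS_2_INDEX_READ_BIT |
* VK_ACCESS_2_UNIFORM_READ_BIT yields
*
*    ANV_PIPE_VF_CACHE_INVALIDATE_BIT |           // index/vertex fetch
*    ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT       // uniform data port
*
* plus either ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT (when indirect UBOs go
* through the sampler) or the HDC + untyped dataport flushes otherwise.
*/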
3973
3974 static inline bool
3975 stage_is_shader(const VkPipelineStageFlags2 stage)
3976 {
3977 return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3978 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3979 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3980 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3981 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
3982 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
3983 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3984 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3985 VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
3986 VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
3987 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
3988 }
3989
3990 static inline bool
3991 stage_is_transfer(const VkPipelineStageFlags2 stage)
3992 {
3993 return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3994 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
3995 }
3996
3997 static inline bool
3998 stage_is_video(const VkPipelineStageFlags2 stage)
3999 {
4000 return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
4001 #ifdef VK_ENABLE_BETA_EXTENSIONS
4002 VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
4003 #endif
4004 VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
4005 }
4006
4007 static inline bool
4008 mask_is_shader_write(const VkAccessFlags2 access)
4009 {
4010 return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
4011 VK_ACCESS_2_MEMORY_WRITE_BIT |
4012 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
4013 }
4014
4015 static inline bool
4016 mask_is_write(const VkAccessFlags2 access)
4017 {
4018 return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
4019 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
4020 VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
4021 VK_ACCESS_2_TRANSFER_WRITE_BIT |
4022 VK_ACCESS_2_HOST_WRITE_BIT |
4023 VK_ACCESS_2_MEMORY_WRITE_BIT |
4024 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
4025 VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
4026 #ifdef VK_ENABLE_BETA_EXTENSIONS
4027 VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
4028 #endif
4029 VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
4030 VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
4031 VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
4032 VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
4033 VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
4034 VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
4035 }
4036
4037 static inline bool
4038 mask_is_transfer_write(const VkAccessFlags2 access)
4039 {
4040 return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
4041 VK_ACCESS_2_MEMORY_WRITE_BIT);
4042 }
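
/* These coarse predicates are how cmd_buffer_barrier() below decides, for
* example, whether pending query copies need to be flushed. A minimal
* sketch of that check for one VkBufferMemoryBarrier2:
*
*    if (stage_is_transfer(buf_barrier->srcStageMask) &&
*        mask_is_transfer_write(buf_barrier->srcAccessMask) &&
*        cmd_buffer_has_pending_copy_query(cmd_buffer))
*       flush_query_copies = true;
*
* Note that VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT and
* VK_ACCESS_2_MEMORY_WRITE_BIT satisfy the transfer predicates too, so
* fully generic barriers are treated conservatively.
*/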
4043
4044 static void
4045 cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
4046 uint32_t n_dep_infos,
4047 const VkDependencyInfo *dep_infos)
4048 {
4049 assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
4050
4051 bool flush_llc = false;
4052 bool flush_ccs = false;
4053
4054 for (uint32_t d = 0; d < n_dep_infos; d++) {
4055 const VkDependencyInfo *dep_info = &dep_infos[d];
4056
4057
4058 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4059 const VkImageMemoryBarrier2 *img_barrier =
4060 &dep_info->pImageMemoryBarriers[i];
4061
4062 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4063 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4064
4065 /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
4066 * memory barrier defines a queue family ownership transfer.
4067 */
4068 if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
4069 flush_llc = true;
4070
4071 VkImageAspectFlags img_aspects =
4072 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4073 anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
4074 const uint32_t plane =
4075 anv_image_aspect_to_plane(image, 1UL << aspect_bit);
4076 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
4077 flush_ccs = true;
4078 }
4079 }
4080 }
4081
4082 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4083 /* Flush the cache if something is written by the video operations and
4084 * used by any stage other than the video encode/decode stages, or if
4085 * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which case
4086 * this memory barrier defines a queue family ownership transfer).
4087 */
4088 if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
4089 mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
4090 !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
4091 (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
4092 dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
4093 flush_llc = true;
4094 break;
4095 }
4096 }
4097
4098 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4099 /* Flush the cache if something is written by the video operations and
4100 * used by any stage other than the video encode/decode stages.
4101 */
4102 if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
4103 mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4104 !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
4105 flush_llc = true;
4106 break;
4107 }
4108 }
4109
4110 /* We cannot gather more information than that. */
4111 if (flush_ccs && flush_llc)
4112 break;
4113 }
4114
4115 if (flush_ccs || flush_llc) {
4116 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
4117 #if GFX_VERx10 >= 125
4118 fd.FlushCCS = flush_ccs;
4119 #endif
4120 #if GFX_VER >= 12
4121 /* Using this bit on Gfx9 triggers a GPU hang.
4122 * This is undocumented behavior. Gfx12 seems fine.
4123 * TODO: check Gfx11
4124 */
4125 fd.FlushLLC = flush_llc;
4126 #endif
4127 }
4128 }
4129 }
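
/* Example: an image with CCS handed from the video decode queue to the
* graphics queue (srcQueueFamilyIndex != dstQueueFamilyIndex) sets
* flush_llc for the ownership transfer and flush_ccs for the aux usage,
* so the function ends up emitting a single
*
*    anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
*       fd.FlushCCS = true;   // GFX_VERx10 >= 125 only
*       fd.FlushLLC = true;   // GFX_VER >= 12 only
*    }
*/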
4130
4131 static void
4132 cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
4133 uint32_t n_dep_infos,
4134 const VkDependencyInfo *dep_infos)
4135 {
4136 #if GFX_VERx10 >= 125
4137 assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
4138
4139 /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
4140 * from being a destination to a source.
4141 */
4142 bool flush_llc = false;
4143 bool flush_ccs = false;
4144
4145 for (uint32_t d = 0; d < n_dep_infos; d++) {
4146 const VkDependencyInfo *dep_info = &dep_infos[d];
4147
4148 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4149 const VkImageMemoryBarrier2 *img_barrier =
4150 &dep_info->pImageMemoryBarriers[i];
4151
4152 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4153 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4154
4155 /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
4156 * memory barrier defines a queue family transfer operation.
4157 */
4158 if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
4159 flush_llc = true;
4160
4161 /* Flush the cache if a transfer command reads the output of the
4162 * previous transfer command. Ideally we should just wait for
4163 * completion, but for now just flush the cache to make the data visible.
4164 */
4165 if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
4166 img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
4167 (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
4168 img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
4169 flush_llc = true;
4170 }
4171
4172 VkImageAspectFlags img_aspects =
4173 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4174 anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
4175 const uint32_t plane =
4176 anv_image_aspect_to_plane(image, 1UL << aspect_bit);
4177 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
4178 flush_ccs = true;
4179 }
4180 }
4181 }
4182
4183 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4184 /* Flush the cache if something is written by the transfer command
4185 * and used by any stage other than the transfer stage, or if
4186 * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
4187 * case this memory barrier defines a queue family transfer operation).
4188 */
4189 if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
4190 mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
4191 (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
4192 dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
4193 flush_llc = true;
4194 break;
4195 }
4196 }
4197
4198 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4199 /* Flush the cache if something is written by the transfer command
4200 * and used by any stage other than the transfer stage.
4201 */
4202 if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
4203 mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
4204 flush_llc = true;
4205 break;
4206 }
4207 }
4208
4209 /* We cannot gather more information than that. */
4210 if (flush_ccs && flush_llc)
4211 break;
4212 }
4213
4214 if (flush_ccs || flush_llc) {
4215 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
4216 if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
4217 genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
4218 cmd_buffer->device);
4219 }
4220 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
4221 fd.FlushCCS = flush_ccs;
4222 fd.FlushLLC = flush_llc;
4223 }
4224 }
4225 #endif
4226 }
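
/* Example: on the blitter queue, re-using a copy destination as a copy
* source is typically expressed by the application as (hedged sketch,
* values are illustrative only):
*
*    VkImageMemoryBarrier2 barrier = {
*       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
*       .srcStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
*       .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
*       .dstStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
*       .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT,
*       .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
*       .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
*       .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
*       .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
*       .image = image,
*       .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
*    };
*
* The DST_OPTIMAL -> SRC_OPTIMAL layout transition sets flush_llc above,
* flush_ccs is added if any selected plane uses CCS, and a single
* MI_FLUSH_DW is emitted (preceded by the Wa_16018063123 dummy blit where
* required).
*/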
4227
4228 static inline bool
4229 cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
4230 {
4231 /* Query copies are only written with dataport, so we only need to check
4232 * that flag.
4233 */
4234 return (cmd_buffer->state.queries.buffer_write_bits &
4235 ANV_QUERY_WRITES_DATA_FLUSH) != 0;
4236 }
4237
4238 static void
4239 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
4240 uint32_t n_dep_infos,
4241 const VkDependencyInfo *dep_infos,
4242 const char *reason)
4243 {
4244 if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
4245 cmd_buffer_barrier_video(cmd_buffer, n_dep_infos, dep_infos);
4246 return;
4247 }
4248
4249 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
4250 cmd_buffer_barrier_blitter(cmd_buffer, n_dep_infos, dep_infos);
4251 return;
4252 }
4253
4254 /* XXX: Right now, we're really dumb and just flush whatever categories
4255 * the app asks for. One of these days we may make this a bit better but
4256 * right now that's all the hardware allows for in most areas.
4257 */
4258 VkAccessFlags2 src_flags = 0;
4259 VkAccessFlags2 dst_flags = 0;
4260
4261 #if GFX_VER < 20
4262 bool apply_sparse_flushes = false;
4263 struct anv_device *device = cmd_buffer->device;
4264 #endif
4265 bool flush_query_copies = false;
4266
4267 for (uint32_t d = 0; d < n_dep_infos; d++) {
4268 const VkDependencyInfo *dep_info = &dep_infos[d];
4269
4270 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4271 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
4272 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
4273
4274 /* Shader writes to buffers that could then be written by a transfer
4275 * command (including queries).
4276 */
4277 if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
4278 mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4279 stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
4280 cmd_buffer->state.queries.buffer_write_bits |=
4281 ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
4282 }
4283
4284 if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
4285 mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4286 cmd_buffer_has_pending_copy_query(cmd_buffer))
4287 flush_query_copies = true;
4288
4289 #if GFX_VER < 20
4290 /* There's no way of knowing if this memory barrier is related to
4291 * sparse buffers! This is pretty horrible.
4292 */
4293 if (mask_is_write(src_flags) &&
4294 p_atomic_read(&device->num_sparse_resources) > 0)
4295 apply_sparse_flushes = true;
4296 #endif
4297 }
4298
4299 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4300 const VkBufferMemoryBarrier2 *buf_barrier =
4301 &dep_info->pBufferMemoryBarriers[i];
4302
4303 src_flags |= buf_barrier->srcAccessMask;
4304 dst_flags |= buf_barrier->dstAccessMask;
4305
4306 /* Shader writes to buffers that could then be written by a transfer
4307 * command (including queries).
4308 */
4309 if (stage_is_shader(buf_barrier->srcStageMask) &&
4310 mask_is_shader_write(buf_barrier->srcAccessMask) &&
4311 stage_is_transfer(buf_barrier->dstStageMask)) {
4312 cmd_buffer->state.queries.buffer_write_bits |=
4313 ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
4314 }
4315
4316 if (stage_is_transfer(buf_barrier->srcStageMask) &&
4317 mask_is_transfer_write(buf_barrier->srcAccessMask) &&
4318 cmd_buffer_has_pending_copy_query(cmd_buffer))
4319 flush_query_copies = true;
4320
4321 #if GFX_VER < 20
4322 ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
4323
4324 if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
4325 apply_sparse_flushes = true;
4326 #endif
4327 }
4328
4329 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4330 const VkImageMemoryBarrier2 *img_barrier =
4331 &dep_info->pImageMemoryBarriers[i];
4332
4333 src_flags |= img_barrier->srcAccessMask;
4334 dst_flags |= img_barrier->dstAccessMask;
4335
4336 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4337 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4338
4339 uint32_t base_layer, layer_count;
4340 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
4341 base_layer = 0;
4342 layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
4343 } else {
4344 base_layer = range->baseArrayLayer;
4345 layer_count = vk_image_subresource_layer_count(&image->vk, range);
4346 }
4347 const uint32_t level_count =
4348 vk_image_subresource_level_count(&image->vk, range);
4349
4350 VkImageLayout old_layout = img_barrier->oldLayout;
4351 VkImageLayout new_layout = img_barrier->newLayout;
4352
4353 /* If we're inside a render pass, the runtime might have converted
4354 * some layouts from GENERAL to FEEDBACK_LOOP. Check if that's the
4355 * case and reconvert back to the original layout so that application
4356 * barriers within the render pass operate on consistent layouts.
4357 */
4358 if (!cmd_buffer->vk.runtime_rp_barrier &&
4359 cmd_buffer->vk.render_pass != NULL) {
4360 assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
4361 image));
4362 VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
4363
4364 vk_command_buffer_get_attachment_layout(
4365 &cmd_buffer->vk, &image->vk,
4366 &subpass_att_layout, &subpass_stencil_att_layout);
4367
4368 old_layout = subpass_att_layout;
4369 new_layout = subpass_att_layout;
4370 }
4371
4372 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4373 transition_depth_buffer(cmd_buffer, image,
4374 range->baseMipLevel, level_count,
4375 base_layer, layer_count,
4376 old_layout, new_layout,
4377 false /* will_full_fast_clear */);
4378 }
4379
4380 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4381 transition_stencil_buffer(cmd_buffer, image,
4382 range->baseMipLevel, level_count,
4383 base_layer, layer_count,
4384 old_layout, new_layout,
4385 false /* will_full_fast_clear */);
4386 }
4387
4388 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
4389 VkImageAspectFlags color_aspects =
4390 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4391 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
4392 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
4393 range->baseMipLevel, level_count,
4394 base_layer, layer_count,
4395 old_layout, new_layout,
4396 img_barrier->srcQueueFamilyIndex,
4397 img_barrier->dstQueueFamilyIndex,
4398 false /* will_full_fast_clear */);
4399 }
4400 }
4401 #if GFX_VER < 20
4402 /* Mark image as compressed if the destination layout has untracked
4403 * writes to the aux surface.
4404 */
4405 VkImageAspectFlags aspects =
4406 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4407 anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
4408 VkImageAspectFlagBits aspect = 1UL << aspect_bit;
4409 if (anv_layout_has_untracked_aux_writes(
4410 device->info,
4411 image, aspect,
4412 img_barrier->newLayout,
4413 cmd_buffer->queue_family->queueFlags)) {
4414 for (uint32_t l = 0; l < level_count; l++) {
4415 const uint32_t level = range->baseMipLevel + l;
4416 const uint32_t aux_layers =
4417 anv_image_aux_layers(image, aspect, level);
4418
4419 if (base_layer >= aux_layers)
4420 break; /* We will only get fewer layers as level increases */
4421
4422 uint32_t level_layer_count =
4423 MIN2(layer_count, aux_layers - base_layer);
4424
4425 set_image_compressed_bit(cmd_buffer, image, aspect,
4426 level,
4427 base_layer, level_layer_count,
4428 true);
4429 }
4430 }
4431 }
4432
4433 if (anv_image_is_sparse(image) && mask_is_write(src_flags))
4434 apply_sparse_flushes = true;
4435 #endif
4436 }
4437 }
4438
4439 enum anv_pipe_bits bits =
4440 anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
4441 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
4442
4443 #if GFX_VER < 20
4444 /* Our HW implementation of the sparse feature lives in the GAM unit
4445 * (interface between all the GPU caches and external memory). As a result
4446 * writes to NULL bound images & buffers that should be ignored are
4447 * actually still visible in the caches. The only way to get NULL-bound
4448 * regions to correctly return 0s is to evict the caches, forcing them to
4449 * be repopulated with 0s.
4450 */
4451 if (apply_sparse_flushes)
4452 bits |= ANV_PIPE_FLUSH_BITS;
4453 #endif
4454
4455 /* Copies from query pools are executed with a shader writing through the
4456 * dataport.
4457 */
4458 if (flush_query_copies) {
4459 bits |= (GFX_VER >= 12 ?
4460 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
4461 }
4462
4463 if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
4464 genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
4465
4466 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
4467 }
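/* For illustration only: a compute->transfer barrier such as a
 * VkMemoryBarrier2 with
 *    .srcStageMask  = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
 *    .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
 *    .dstStageMask  = VK_PIPELINE_STAGE_2_COPY_BIT,
 *    .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT,
 * only accumulates its access masks in cmd_buffer_barrier() above; no
 * PIPE_CONTROL is emitted here. The masks are converted to flush/invalidate
 * bits via anv_pipe_flush_bits_for_access_flags() /
 * anv_pipe_invalidate_bits_for_access_flags() and applied later by
 * genX(cmd_buffer_apply_pipe_flushes)(). The only immediate action is the
 * generated-draw flush when INDIRECT_COMMAND_READ is part of dst_flags.
 */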
4468
4469 void genX(CmdPipelineBarrier2)(
4470 VkCommandBuffer commandBuffer,
4471 const VkDependencyInfo* pDependencyInfo)
4472 {
4473 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4474
4475 cmd_buffer_barrier(cmd_buffer, 1, pDependencyInfo, "pipe barrier");
4476 }
4477
4478 void
4479 genX(batch_emit_breakpoint)(struct anv_batch *batch,
4480 struct anv_device *device,
4481 bool emit_before_draw)
4482 {
4483 /* Update draw call count once */
4484 uint32_t draw_count = emit_before_draw ?
4485 p_atomic_inc_return(&device->draw_call_count) :
4486 p_atomic_read(&device->draw_call_count);
4487
4488 if (((draw_count == intel_debug_bkp_before_draw_count &&
4489 emit_before_draw) ||
4490 (draw_count == intel_debug_bkp_after_draw_count &&
4491 !emit_before_draw))) {
4492 struct anv_address wait_addr =
4493 anv_state_pool_state_address(&device->dynamic_state_pool,
4494 device->breakpoint);
4495
4496 anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
4497 sem.WaitMode = PollingMode;
4498 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
4499 sem.SemaphoreDataDword = 0x1;
4500 sem.SemaphoreAddress = wait_addr;
4501 };
4502 }
4503 }
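/* The breakpoint works by making the command streamer spin on the
 * MI_SEMAPHORE_WAIT above (polling mode) until the dword at
 * device->breakpoint in the dynamic state pool is set to 0x1 from the CPU
 * side (presumably by a debugger or tooling writing into that BO), which
 * pauses execution right before or right after the draw selected by
 * intel_debug_bkp_before_draw_count / intel_debug_bkp_after_draw_count.
 */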
4504
4505 /* Only emits PIPELINE_SELECT; for the whole mode switch, including the
4506 * required flushes, use flush_pipeline_select().
4507 */
4508 void
4509 genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
4510 const struct anv_device *device)
4511 {
4512 /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
4513 #if GFX_VER < 20
4514 anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
4515 ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
4516 #if GFX_VER == 12
4517 ps.MediaSamplerDOPClockGateEnable = true;
4518 #endif
4519 ps.PipelineSelection = pipeline;
4520 #if GFX_VERx10 == 125
4521 /* It might still be better to only enable this when the compute
4522 * pipeline will have DPAS instructions.
4523 */
4524 ps.SystolicModeEnable = pipeline == GPGPU &&
4525 device->vk.enabled_extensions.KHR_cooperative_matrix &&
4526 device->vk.enabled_features.cooperativeMatrix;
4527 #endif
4528 }
4529 #endif /* if GFX_VER < 20 */
4530 }
4531
4532 static void
4533 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4534 uint32_t pipeline)
4535 {
4536 UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4537
4538 if (cmd_buffer->state.current_pipeline == pipeline)
4539 return;
4540
4541 #if GFX_VER >= 20
4542 /* While PIPELINE_SELECT is not needed on Xe2+, our current assumption
4543 * is that the pipelined flushes in the 3D pipeline are not getting
4544 * synchronized with the compute dispatches (and vice versa). So we need
4545 * a CS_STALL prior to the next set of commands to ensure the flushes have
4546 * completed.
4547 *
4548 * The new RESOURCE_BARRIER instruction has support for synchronizing
4549 * 3D/Compute and once we switch to that we should be able to get rid of
4550 * this CS_STALL.
4551 */
4552 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, "pipeline switch stall");
4553
4554 /* Since we are not stalling/flushing caches explicitly while switching
4555 * between the pipelines, we need to apply data dependency flushes recorded
4556 * previously on the resource.
4557 */
4558 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4559 #else
4560
4561 #if GFX_VER == 9
4562 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4563 *
4564 * Software must clear the COLOR_CALC_STATE Valid field in
4565 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4566 * with Pipeline Select set to GPGPU.
4567 *
4568 * The internal hardware docs recommend the same workaround for Gfx9
4569 * hardware too.
4570 */
4571 if (pipeline == GPGPU)
4572 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4573 #endif
4574
4575 #if GFX_VERx10 == 120
4576 /* Undocumented workaround to force the re-emission of
4577 * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
4578 * pipeline without rebinding a pipeline:
4579 * vkCmdBindPipeline(COMPUTE, cs_pipeline);
4580 * vkCmdDispatch(...);
4581 * vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
4582 * vkCmdDraw(...);
4583 * vkCmdDispatch(...);
4584 */
4585 if (pipeline == _3D)
4586 cmd_buffer->state.compute.pipeline_dirty = true;
4587 #endif
4588
4589 /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
4590 * pipeline. That means query clears will not be visible to query
4591 * copy/write. So we need to flush it before going to GPGPU mode.
4592 */
4593 if (cmd_buffer->state.current_pipeline == _3D &&
4594 cmd_buffer->state.queries.clear_bits) {
4595 anv_add_pending_pipe_bits(cmd_buffer,
4596 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
4597 "query clear flush prior to GPGPU");
4598 }
4599
4600 /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
4601 enum anv_pipe_bits bits = 0;
4602
4603 #if GFX_VER >= 12
4604 /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
4605 *
4606 * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
4607 * are flushed through a stalling PIPE_CONTROL command prior to
4608 * programming of PIPELINE_SELECT command transitioning Pipeline Select
4609 * from 3D to GPGPU/Media.
4610 * Software must ensure HDC Pipeline flush and Generic Media State Clear
4611 * is issued through a stalling PIPE_CONTROL command prior to programming
4612 * of PIPELINE_SELECT command transitioning Pipeline Select from
4613 * GPGPU/Media to 3D."
4614 *
4615 * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
4616 * because PIPE was not in MEDIA mode?!
4617 */
4618 bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
4619
4620 if (cmd_buffer->state.current_pipeline == _3D) {
4621 bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4622 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
4623 } else {
4624 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
4625 }
4626 #else
4627 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4628 * PIPELINE_SELECT [DevBWR+]":
4629 *
4630 * Project: DEVSNB+
4631 *
4632 * Software must ensure all the write caches are flushed through a
4633 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4634 * command to invalidate read only caches prior to programming
4635 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4636 *
4637 * Note the cmd_buffer_apply_pipe_flushes will split this into two
4638 * PIPE_CONTROLs.
4639 */
4640 bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4641 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4642 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4643 ANV_PIPE_CS_STALL_BIT |
4644 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4645 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4646 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4647 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4648 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
4649 #endif
4650
4651 /* Wa_16013063087 - State Cache Invalidate must be issued prior to
4652 * PIPELINE_SELECT when switching from 3D to Compute.
4653 *
4654 * SW must do this by programming of PIPECONTROL with “CS Stall” followed by
4655 * a PIPECONTROL with State Cache Invalidate bit set.
4656 *
4657 */
4658 if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
4659 intel_needs_workaround(cmd_buffer->device->info, 16013063087))
4660 bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
4661
4662 anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
4663 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4664
4665 #if GFX_VER == 9
4666 if (pipeline == _3D) {
4667 /* There is a mid-object preemption workaround which requires you to
4668 * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However,
4669 * even without preemption, we have issues with geometry flickering when
4670 * GPGPU and 3D are back-to-back and this seems to fix it. We don't
4671 * really know why.
4672 *
4673 * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4674 *
4675 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4676 * the only bits that are changed are scoreboard related ..."
4677 *
4678 * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
4679 */
4680 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
4681 vfe.MaximumNumberofThreads =
4682 devinfo->max_cs_threads * devinfo->subslice_total - 1;
4683 vfe.NumberofURBEntries = 2;
4684 vfe.URBEntryAllocationSize = 2;
4685 }
4686
4687 /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
4688 * invalid. Set the compute pipeline to dirty to force a re-emit of the
4689 * pipeline in case we get back-to-back dispatch calls with the same
4690 * pipeline and a PIPELINE_SELECT in between.
4691 */
4692 cmd_buffer->state.compute.pipeline_dirty = true;
4693 }
4694 #endif
4695
4696 genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
4697
4698 #if GFX_VER == 9
4699 if (devinfo->platform == INTEL_PLATFORM_GLK) {
4700 /* Project: DevGLK
4701 *
4702 * "This chicken bit works around a hardware issue with barrier logic
4703 * encountered when switching between GPGPU and 3D pipelines. To
4704 * workaround the issue, this mode bit should be set after a pipeline
4705 * is selected."
4706 */
4707 anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
4708 scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
4709 : GLK_BARRIER_MODE_3D_HULL;
4710 scec1.GLKBarrierModeMask = 1;
4711 }
4712 }
4713 #endif
4714 #endif /* else of if GFX_VER >= 20 */
4715 cmd_buffer->state.current_pipeline = pipeline;
4716 }
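/* Roughly, the pre-Xe2 path above is: (1) flush pending query clears and the
 * write caches required by the PRM, (2) stall, (3) emit PIPELINE_SELECT via
 * genX(emit_pipeline_select)(), and (4) apply post-select workarounds (dummy
 * MEDIA_VFE_STATE on Gfx9, GLK barrier-mode chicken bit). On Xe2+ only a CS
 * stall plus the already-pending pipe flushes are applied, since
 * PIPELINE_SELECT no longer exists there.
 */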
4717
4718 void
4719 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4720 {
4721 genX(flush_pipeline_select)(cmd_buffer, _3D);
4722 }
4723
4724 void
4725 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4726 {
4727 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4728 }
4729
4730 void
4731 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
4732 const struct isl_surf *surf)
4733 {
4734 #if INTEL_NEEDS_WA_1808121037
4735 const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
4736 surf->samples == 1;
4737
4738 switch (cmd_buffer->state.depth_reg_mode) {
4739 case ANV_DEPTH_REG_MODE_HW_DEFAULT:
4740 if (!is_d16_1x_msaa)
4741 return;
4742 break;
4743 case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
4744 if (is_d16_1x_msaa)
4745 return;
4746 break;
4747 case ANV_DEPTH_REG_MODE_UNKNOWN:
4748 break;
4749 }
4750
4751 /* We'll change some CHICKEN registers depending on the depth surface
4752 * format. Do a depth flush and stall so the pipeline is not using these
4753 * settings while we change the registers.
4754 */
4755 anv_add_pending_pipe_bits(cmd_buffer,
4756 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4757 ANV_PIPE_DEPTH_STALL_BIT |
4758 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
4759 "Workaround: Stop pipeline for 1808121037");
4760 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4761
4762 /* Wa_1808121037
4763 *
4764 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
4765 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
4766 */
4767 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
4768 reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
4769 reg.HIZPlaneOptimizationdisablebitMask = true;
4770 }
4771
4772 cmd_buffer->state.depth_reg_mode =
4773 is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
4774 ANV_DEPTH_REG_MODE_HW_DEFAULT;
4775 #endif
4776 }
4777
4778 #if GFX_VER == 9
4779 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4780 *
4781 * "The VF cache needs to be invalidated before binding and then using
4782 * Vertex Buffers that overlap with any previously bound Vertex Buffer
4783 * (at a 64B granularity) since the last invalidation. A VF cache
4784 * invalidate is performed by setting the "VF Cache Invalidation Enable"
4785 * bit in PIPE_CONTROL."
4786 *
4787 * This is implemented by carefully tracking all vertex and index buffer
4788 * bindings and flushing if the cache ever ends up with a range in the cache
4789 * that would exceed 4 GiB. This is implemented in three parts:
4790 *
4791 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4792 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4793 * tracking code of the new binding. If this new binding would cause
4794 * the cache to have a too-large range on the next draw call, a pipeline
4795 * stall and VF cache invalidate are added to pending_pipeline_bits.
4796 *
4797 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4798 * empty whenever we emit a VF invalidate.
4799 *
4800 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4801 * after every 3DPRIMITIVE and copies the bound range into the dirty
4802 * range for each used buffer. This has to be a separate step because
4803 * we don't always re-bind all buffers and so 1. can't know which
4804 * buffers are actually bound.
4805 */
4806 void
4807 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4808 int vb_index,
4809 struct anv_address vb_address,
4810 uint32_t vb_size)
4811 {
4812 if (GFX_VER > 9)
4813 return;
4814
4815 struct anv_vb_cache_range *bound, *dirty;
4816 if (vb_index == -1) {
4817 bound = &cmd_buffer->state.gfx.ib_bound_range;
4818 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4819 } else {
4820 assert(vb_index >= 0);
4821 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4822 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4823 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4824 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4825 }
4826
4827 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4828 vb_address,
4829 vb_size)) {
4830 anv_add_pending_pipe_bits(cmd_buffer,
4831 ANV_PIPE_CS_STALL_BIT |
4832 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4833 "vb > 32b range");
4834 }
4835 }
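/* Sketch of the intended call pattern (see the comment block above
 * genX(cmd_buffer_set_binding_for_gfx8_vb_flush)), for illustration only:
 *
 *    // when emitting 3DSTATE_VERTEX_BUFFERS / 3DSTATE_INDEX_BUFFER
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb_index,
 *                                                   addr, size);
 *    ... emit 3DPRIMITIVE ...
 *    // afterwards, fold the bound ranges into the dirty ranges
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
 *                                                        access_type, vb_used);
 *
 * The dirty ranges are reset whenever genX(cmd_buffer_apply_pipe_flushes)()
 * actually emits a VF cache invalidate.
 */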
4836
4837 void
4838 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4839 uint32_t access_type,
4840 uint64_t vb_used)
4841 {
4842 if (access_type == RANDOM) {
4843 /* We have an index buffer */
4844 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4845 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4846
4847 anv_merge_vb_cache_range(dirty, bound);
4848 }
4849
4850 uint64_t mask = vb_used;
4851 while (mask) {
4852 int i = u_bit_scan64(&mask);
4853 assert(i >= 0);
4854 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4855 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4856
4857 struct anv_vb_cache_range *bound, *dirty;
4858 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4859 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4860
4861 anv_merge_vb_cache_range(dirty, bound);
4862 }
4863 }
4864 #endif /* GFX_VER == 9 */
4865
4866 /**
4867 * Update the pixel hashing modes that determine the balancing of PS threads
4868 * across subslices and slices.
4869 *
4870 * \param width Width bound of the rendering area (already scaled down if \p
4871 * scale is greater than 1).
4872 * \param height Height bound of the rendering area (already scaled down if \p
4873 * scale is greater than 1).
4874 * \param scale The number of framebuffer samples that could potentially be
4875 * affected by an individual channel of the PS thread. This is
4876 * typically one for single-sampled rendering, but for operations
4877 * like CCS resolves and fast clears a single PS invocation may
4878 * update a huge number of pixels, in which case a finer
4879 * balancing is desirable in order to maximally utilize the
4880 * bandwidth available. UINT_MAX can be used as shorthand for
4881 * "finest hashing mode available".
4882 */
4883 void
4884 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
4885 unsigned width, unsigned height,
4886 unsigned scale)
4887 {
4888 #if GFX_VER == 9
4889 const struct intel_device_info *devinfo = cmd_buffer->device->info;
4890 const unsigned slice_hashing[] = {
4891 /* Because all Gfx9 platforms with more than one slice require
4892 * three-way subslice hashing, a single "normal" 16x16 slice hashing
4893 * block is guaranteed to suffer from substantial imbalance, with one
4894 * subslice receiving twice as much work as the other two in the
4895 * slice.
4896 *
4897 * The performance impact of that would be particularly severe when
4898 * three-way hashing is also in use for slice balancing (which is the
4899 * case for all Gfx9 GT4 platforms), because one of the slices
4900 * receives one every three 16x16 blocks in either direction, which
4901 * is roughly the periodicity of the underlying subslice imbalance
4902 * pattern ("roughly" because in reality the hardware's
4903 * implementation of three-way hashing doesn't do exact modulo 3
4904 * arithmetic, which somewhat decreases the magnitude of this effect
4905 * in practice). This leads to a systematic subslice imbalance
4906 * within that slice regardless of the size of the primitive. The
4907 * 32x32 hashing mode guarantees that the subslice imbalance within a
4908 * single slice hashing block is minimal, largely eliminating this
4909 * effect.
4910 */
4911 _32x32,
4912 /* Finest slice hashing mode available. */
4913 NORMAL
4914 };
4915 const unsigned subslice_hashing[] = {
4916 /* 16x16 would provide a slight cache locality benefit especially
4917 * visible in the sampler L1 cache efficiency of low-bandwidth
4918 * non-LLC platforms, but it comes at the cost of greater subslice
4919 * imbalance for primitives of dimensions approximately intermediate
4920 * between 16x4 and 16x16.
4921 */
4922 _16x4,
4923 /* Finest subslice hashing mode available. */
4924 _8x4
4925 };
4926 /* Dimensions of the smallest hashing block of a given hashing mode. If
4927 * the rendering area is smaller than this there can't possibly be any
4928 * benefit from switching to this mode, so we optimize out the
4929 * transition.
4930 */
4931 const unsigned min_size[][2] = {
4932 { 16, 4 },
4933 { 8, 4 }
4934 };
4935 const unsigned idx = scale > 1;
4936
4937 if (cmd_buffer->state.current_hash_scale != scale &&
4938 (width > min_size[idx][0] || height > min_size[idx][1])) {
4939 anv_add_pending_pipe_bits(cmd_buffer,
4940 ANV_PIPE_CS_STALL_BIT |
4941 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
4942 "change pixel hash mode");
4943 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4944
4945 anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
4946 gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
4947 gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
4948 gt.SubsliceHashing = subslice_hashing[idx];
4949 gt.SubsliceHashingMask = -1;
4950 }
4951
4952 cmd_buffer->state.current_hash_scale = scale;
4953 }
4954 #endif
4955 }
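/* Callers are expected to pass scale = 1 for regular draws and a large value
 * (UINT_MAX per the comment above) for operations such as fast clears and
 * CCS resolves where one PS invocation touches many samples; any scale > 1
 * selects the finer NORMAL/_8x4 hashing modes from the tables above.
 */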
4956
4957 static void
4958 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4959 {
4960 struct anv_device *device = cmd_buffer->device;
4961 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4962
4963 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4964 device->isl_dev.ds.size / 4);
4965 if (dw == NULL)
4966 return;
4967
4968 struct isl_view isl_view = {};
4969 struct isl_depth_stencil_hiz_emit_info info = {
4970 .view = &isl_view,
4971 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4972 };
4973
4974 if (gfx->depth_att.iview != NULL) {
4975 isl_view = gfx->depth_att.iview->planes[0].isl;
4976 } else if (gfx->stencil_att.iview != NULL) {
4977 isl_view = gfx->stencil_att.iview->planes[0].isl;
4978 }
4979
4980 if (gfx->view_mask) {
4981 assert(isl_view.array_len == 0 ||
4982 isl_view.array_len >= util_last_bit(gfx->view_mask));
4983 isl_view.array_len = util_last_bit(gfx->view_mask);
4984 } else {
4985 assert(isl_view.array_len == 0 ||
4986 isl_view.array_len >= util_last_bit(gfx->layer_count));
4987 isl_view.array_len = gfx->layer_count;
4988 }
4989
4990 if (gfx->depth_att.iview != NULL) {
4991 const struct anv_image_view *iview = gfx->depth_att.iview;
4992 const struct anv_image *image = iview->image;
4993
4994 const uint32_t depth_plane =
4995 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4996 const struct anv_surface *depth_surface =
4997 &image->planes[depth_plane].primary_surface;
4998 const struct anv_address depth_address =
4999 anv_image_address(image, &depth_surface->memory_range);
5000
5001 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
5002
5003 info.depth_surf = &depth_surface->isl;
5004 info.depth_address = anv_address_physical(depth_address);
5005 info.mocs =
5006 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
5007
5008 info.hiz_usage = gfx->depth_att.aux_usage;
5009 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
5010 assert(isl_aux_usage_has_hiz(info.hiz_usage));
5011
5012 const struct anv_surface *hiz_surface =
5013 &image->planes[depth_plane].aux_surface;
5014 const struct anv_address hiz_address =
5015 anv_image_address(image, &hiz_surface->memory_range);
5016
5017 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
5018
5019 info.hiz_surf = &hiz_surface->isl;
5020 info.hiz_address = anv_address_physical(hiz_address);
5021
5022 info.depth_clear_value = anv_image_hiz_clear_value(image).f32[0];
5023 }
5024 }
5025
5026 if (gfx->stencil_att.iview != NULL) {
5027 const struct anv_image_view *iview = gfx->stencil_att.iview;
5028 const struct anv_image *image = iview->image;
5029
5030 const uint32_t stencil_plane =
5031 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5032 const struct anv_surface *stencil_surface =
5033 &image->planes[stencil_plane].primary_surface;
5034 const struct anv_address stencil_address =
5035 anv_image_address(image, &stencil_surface->memory_range);
5036
5037 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
5038
5039 info.stencil_surf = &stencil_surface->isl;
5040
5041 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
5042 info.stencil_address = anv_address_physical(stencil_address);
5043 info.mocs =
5044 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
5045 }
5046
5047 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5048
5049 if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
5050 intel_needs_workaround(cmd_buffer->device->info, 14014097488) ||
5051 intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
5052 /* Wa_1408224581
5053 *
5054 * Workaround: Gfx12LP Astep only An additional pipe control with
5055 * post-sync = store dword operation would be required.( w/a is to have
5056 * an additional pipe control after the stencil state whenever the
5057 * surface state bits of this state is changing).
5058 *
5059 * This also seems sufficient to handle Wa_14014097488 and
5060 * Wa_14016712196.
5061 */
5062 genx_batch_emit_pipe_control_write(&cmd_buffer->batch, device->info,
5063 cmd_buffer->state.current_pipeline,
5064 WriteImmediateData,
5065 device->workaround_address, 0, 0);
5066 }
5067
5068 if (info.depth_surf)
5069 genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
5070
5071 cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5072 }
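/* Note: when neither a depth nor a stencil attachment is bound, info keeps
 * NULL depth/stencil surfaces and isl_emit_depth_stencil_hiz_s() is expected
 * to emit the packets in their disabled (null surface) form; only the
 * isl_view array length is shaped by view_mask/layer_count above.
 */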
5073
5074 static void
5075 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
5076 const struct anv_image_view *fsr_iview)
5077 {
5078 #if GFX_VERx10 >= 125
5079 struct anv_device *device = cmd_buffer->device;
5080
5081 if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
5082 return;
5083
5084 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
5085 device->isl_dev.cpb.size / 4);
5086 if (dw == NULL)
5087 return;
5088
5089 struct isl_cpb_emit_info info = { };
5090
5091 if (fsr_iview) {
5092 const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
5093
5094 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
5095
5096 struct anv_address addr =
5097 anv_address_add(binding->address, binding->memory_range.offset);
5098
5099 info.view = &fsr_iview->planes[0].isl;
5100 info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
5101 info.address = anv_address_physical(addr);
5102 info.mocs =
5103 anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
5104 ISL_SURF_USAGE_CPB_BIT);
5105 }
5106
5107 isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
5108
5109 /* Wa_14016712196:
5110 * Emit dummy pipe control after state that sends implicit depth flush.
5111 */
5112 if (intel_needs_workaround(device->info, 14016712196)) {
5113 genx_batch_emit_pipe_control_write(&cmd_buffer->batch, device->info,
5114 cmd_buffer->state.current_pipeline,
5115 WriteImmediateData,
5116 device->workaround_address, 0, 0);
5117 }
5118
5119 #endif /* GFX_VERx10 >= 125 */
5120 }
5121
5122 static VkImageLayout
5123 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5124 {
5125 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5126 vk_find_struct_const(att->pNext,
5127 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5128 if (layout_info != NULL)
5129 return layout_info->initialLayout;
5130
5131 return att->imageLayout;
5132 }
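/* VkRenderingAttachmentInitialLayoutInfoMESA is chained by the common
 * runtime's render-pass emulation so CmdBeginRendering() can perform the
 * initial layout transition itself. A sketch of what gets chained, for
 * illustration only:
 *
 *    VkRenderingAttachmentInitialLayoutInfoMESA initial = {
 *       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA,
 *       .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
 *    };
 *    VkRenderingAttachmentInfo att = {
 *       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
 *       .pNext = &initial,
 *       .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       ...
 *    };
 */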
5133
5134 void genX(CmdBeginRendering)(
5135 VkCommandBuffer commandBuffer,
5136 const VkRenderingInfo* pRenderingInfo)
5137 {
5138 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5139 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5140 VkResult result;
5141
5142 if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
5143 assert(!"Trying to start a render pass on non-render queue!");
5144 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5145 return;
5146 }
5147
5148 anv_measure_beginrenderpass(cmd_buffer);
5149 trace_intel_begin_render_pass(&cmd_buffer->trace);
5150
5151 gfx->rendering_flags = pRenderingInfo->flags;
5152 gfx->view_mask = pRenderingInfo->viewMask;
5153 gfx->layer_count = pRenderingInfo->layerCount;
5154 gfx->samples = 0;
5155
5156 if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
5157 gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
5158 gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
5159 gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
5160 gfx->render_area = pRenderingInfo->renderArea;
5161 gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
5162 }
5163
5164 const bool is_multiview = gfx->view_mask != 0;
5165 const VkRect2D render_area = gfx->render_area;
5166 const uint32_t layers =
5167 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5168
5169 /* The framebuffer size is at least large enough to contain the render
5170 * area. Because a zero renderArea is possible, we MAX with 1.
5171 */
5172 struct isl_extent3d fb_size = {
5173 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5174 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5175 .d = layers,
5176 };
5177
5178 const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5179 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5180 if (result != VK_SUCCESS)
5181 return;
5182
5183 genX(flush_pipeline_select_3d)(cmd_buffer);
5184
5185 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5186 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
5187 continue;
5188
5189 const VkRenderingAttachmentInfo *att =
5190 &pRenderingInfo->pColorAttachments[i];
5191 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5192 const VkImageLayout initial_layout = attachment_initial_layout(att);
5193
5194 assert(render_area.offset.x + render_area.extent.width <=
5195 iview->vk.extent.width);
5196 assert(render_area.offset.y + render_area.extent.height <=
5197 iview->vk.extent.height);
5198 assert(layers <= iview->vk.layer_count);
5199
5200 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5201 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5202
5203 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5204 gfx->samples |= iview->vk.image->samples;
5205
5206 enum isl_aux_usage aux_usage =
5207 anv_layout_to_aux_usage(cmd_buffer->device->info,
5208 iview->image,
5209 VK_IMAGE_ASPECT_COLOR_BIT,
5210 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5211 att->imageLayout,
5212 cmd_buffer->queue_family->queueFlags);
5213
5214 union isl_color_value fast_clear_color = { .u32 = { 0, } };
5215
5216 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5217 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5218 const union isl_color_value clear_color =
5219 vk_to_isl_color_with_format(att->clearValue.color,
5220 iview->planes[0].isl.format);
5221
5222 /* We only support fast-clears on the first layer */
5223 const bool fast_clear =
5224 (!is_multiview || (gfx->view_mask & 1)) &&
5225 anv_can_fast_clear_color_view(cmd_buffer->device, iview,
5226 att->imageLayout, clear_color,
5227 layers, render_area,
5228 cmd_buffer->queue_family->queueFlags);
5229
5230 if (att->imageLayout != initial_layout) {
5231 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5232 render_area.extent.width == iview->vk.extent.width &&
5233 render_area.extent.height == iview->vk.extent.height);
5234 if (is_multiview) {
5235 u_foreach_bit(view, gfx->view_mask) {
5236 transition_color_buffer(cmd_buffer, iview->image,
5237 VK_IMAGE_ASPECT_COLOR_BIT,
5238 iview->vk.base_mip_level, 1,
5239 iview->vk.base_array_layer + view,
5240 1, /* layer_count */
5241 initial_layout, att->imageLayout,
5242 VK_QUEUE_FAMILY_IGNORED,
5243 VK_QUEUE_FAMILY_IGNORED,
5244 fast_clear);
5245 }
5246 } else {
5247 transition_color_buffer(cmd_buffer, iview->image,
5248 VK_IMAGE_ASPECT_COLOR_BIT,
5249 iview->vk.base_mip_level, 1,
5250 iview->vk.base_array_layer,
5251 gfx->layer_count,
5252 initial_layout, att->imageLayout,
5253 VK_QUEUE_FAMILY_IGNORED,
5254 VK_QUEUE_FAMILY_IGNORED,
5255 fast_clear);
5256 }
5257 }
5258
5259 uint32_t clear_view_mask = pRenderingInfo->viewMask;
5260 uint32_t base_clear_layer = iview->vk.base_array_layer;
5261 uint32_t clear_layer_count = gfx->layer_count;
5262 if (fast_clear) {
5263 /* We only support fast-clears on the first layer */
5264 assert(iview->vk.base_mip_level == 0 &&
5265 iview->vk.base_array_layer == 0);
5266
5267 fast_clear_color = clear_color;
5268
5269 if (iview->image->vk.samples == 1) {
5270 anv_image_ccs_op(cmd_buffer, iview->image,
5271 iview->planes[0].isl.format,
5272 iview->planes[0].isl.swizzle,
5273 VK_IMAGE_ASPECT_COLOR_BIT,
5274 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5275 &fast_clear_color,
5276 false);
5277 } else {
5278 anv_image_mcs_op(cmd_buffer, iview->image,
5279 iview->planes[0].isl.format,
5280 iview->planes[0].isl.swizzle,
5281 VK_IMAGE_ASPECT_COLOR_BIT,
5282 0, 1, ISL_AUX_OP_FAST_CLEAR,
5283 &fast_clear_color,
5284 false);
5285 }
5286 clear_view_mask &= ~1u;
5287 base_clear_layer++;
5288 clear_layer_count--;
5289 #if GFX_VER < 20
5290 genX(set_fast_clear_state)(cmd_buffer, iview->image,
5291 iview->planes[0].isl.format,
5292 clear_color);
5293 #endif
5294 }
5295
5296 if (is_multiview) {
5297 u_foreach_bit(view, clear_view_mask) {
5298 anv_image_clear_color(cmd_buffer, iview->image,
5299 VK_IMAGE_ASPECT_COLOR_BIT,
5300 aux_usage,
5301 iview->planes[0].isl.format,
5302 iview->planes[0].isl.swizzle,
5303 iview->vk.base_mip_level,
5304 iview->vk.base_array_layer + view, 1,
5305 render_area, clear_color);
5306 }
5307 } else {
5308 anv_image_clear_color(cmd_buffer, iview->image,
5309 VK_IMAGE_ASPECT_COLOR_BIT,
5310 aux_usage,
5311 iview->planes[0].isl.format,
5312 iview->planes[0].isl.swizzle,
5313 iview->vk.base_mip_level,
5314 base_clear_layer, clear_layer_count,
5315 render_area, clear_color);
5316 }
5317 } else {
5318 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5319 assert(att->imageLayout == initial_layout);
5320 }
5321
5322 gfx->color_att[i].vk_format = iview->vk.format;
5323 gfx->color_att[i].iview = iview;
5324 gfx->color_att[i].layout = att->imageLayout;
5325 gfx->color_att[i].aux_usage = aux_usage;
5326
5327 struct isl_view isl_view = iview->planes[0].isl;
5328 if (pRenderingInfo->viewMask) {
5329 assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5330 isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5331 } else {
5332 assert(isl_view.array_len >= pRenderingInfo->layerCount);
5333 isl_view.array_len = pRenderingInfo->layerCount;
5334 }
5335
5336 anv_image_fill_surface_state(cmd_buffer->device,
5337 iview->image,
5338 VK_IMAGE_ASPECT_COLOR_BIT,
5339 &isl_view,
5340 ISL_SURF_USAGE_RENDER_TARGET_BIT,
5341 aux_usage, &fast_clear_color,
5342 0, /* anv_image_view_state_flags */
5343 &gfx->color_att[i].surface_state);
5344
5345 add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
5346
5347 if (GFX_VER < 10 &&
5348 (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5349 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5350 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5351 iview->planes[0].isl.base_level == 0 &&
5352 iview->planes[0].isl.base_array_layer == 0) {
5353 genX(load_image_clear_color)(cmd_buffer,
5354 gfx->color_att[i].surface_state.state,
5355 iview->image);
5356 }
5357
5358 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5359 gfx->color_att[i].resolve_mode = att->resolveMode;
5360 gfx->color_att[i].resolve_iview =
5361 anv_image_view_from_handle(att->resolveImageView);
5362 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5363 }
5364 }
5365
5366 anv_cmd_graphic_state_update_has_uint_rt(gfx);
5367
5368 const struct anv_image_view *fsr_iview = NULL;
5369 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
5370 vk_find_struct_const(pRenderingInfo->pNext,
5371 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
5372 if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
5373 fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
5374 /* imageLayout and shadingRateAttachmentTexelSize are ignored */
5375 }
5376
5377 const struct anv_image_view *ds_iview = NULL;
5378 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5379 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5380 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5381 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5382 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5383 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5384 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5385 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5386 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5387 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5388 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5389 VkClearDepthStencilValue clear_value = {};
5390
5391 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5392 d_iview = anv_image_view_from_handle(d_att->imageView);
5393 initial_depth_layout = attachment_initial_layout(d_att);
5394 depth_layout = d_att->imageLayout;
5395 depth_aux_usage =
5396 anv_layout_to_aux_usage(cmd_buffer->device->info,
5397 d_iview->image,
5398 VK_IMAGE_ASPECT_DEPTH_BIT,
5399 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5400 depth_layout,
5401 cmd_buffer->queue_family->queueFlags);
5402 clear_value.depth = d_att->clearValue.depthStencil.depth;
5403 }
5404
5405 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5406 s_iview = anv_image_view_from_handle(s_att->imageView);
5407 initial_stencil_layout = attachment_initial_layout(s_att);
5408 stencil_layout = s_att->imageLayout;
5409 stencil_aux_usage =
5410 anv_layout_to_aux_usage(cmd_buffer->device->info,
5411 s_iview->image,
5412 VK_IMAGE_ASPECT_STENCIL_BIT,
5413 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5414 stencil_layout,
5415 cmd_buffer->queue_family->queueFlags);
5416 clear_value.stencil = s_att->clearValue.depthStencil.stencil;
5417 }
5418
5419 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5420 ds_iview = d_iview != NULL ? d_iview : s_iview;
5421 assert(ds_iview != NULL);
5422
5423 assert(render_area.offset.x + render_area.extent.width <=
5424 ds_iview->vk.extent.width);
5425 assert(render_area.offset.y + render_area.extent.height <=
5426 ds_iview->vk.extent.height);
5427 assert(layers <= ds_iview->vk.layer_count);
5428
5429 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5430 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5431
5432 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5433 gfx->samples |= ds_iview->vk.image->samples;
5434
5435 VkImageAspectFlags clear_aspects = 0;
5436 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5437 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5438 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5439 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5440 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5441 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5442
5443 if (clear_aspects != 0) {
5444 const bool hiz_clear =
5445 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5446 depth_layout, clear_aspects,
5447 clear_value.depth,
5448 render_area,
5449 cmd_buffer->queue_family->queueFlags);
5450
5451 if (depth_layout != initial_depth_layout) {
5452 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5453 render_area.extent.width == d_iview->vk.extent.width &&
5454 render_area.extent.height == d_iview->vk.extent.height);
5455
5456 if (is_multiview) {
5457 u_foreach_bit(view, gfx->view_mask) {
5458 transition_depth_buffer(cmd_buffer, d_iview->image,
5459 d_iview->vk.base_mip_level, 1,
5460 d_iview->vk.base_array_layer + view,
5461 1 /* layer_count */,
5462 initial_depth_layout, depth_layout,
5463 hiz_clear);
5464 }
5465 } else {
5466 transition_depth_buffer(cmd_buffer, d_iview->image,
5467 d_iview->vk.base_mip_level, 1,
5468 d_iview->vk.base_array_layer,
5469 gfx->layer_count,
5470 initial_depth_layout, depth_layout,
5471 hiz_clear);
5472 }
5473 }
5474
5475 if (stencil_layout != initial_stencil_layout) {
5476 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5477 render_area.extent.width == s_iview->vk.extent.width &&
5478 render_area.extent.height == s_iview->vk.extent.height);
5479
5480 if (is_multiview) {
5481 u_foreach_bit(view, gfx->view_mask) {
5482 transition_stencil_buffer(cmd_buffer, s_iview->image,
5483 s_iview->vk.base_mip_level, 1,
5484 s_iview->vk.base_array_layer + view,
5485 1 /* layer_count */,
5486 initial_stencil_layout,
5487 stencil_layout,
5488 hiz_clear);
5489 }
5490 } else {
5491 transition_stencil_buffer(cmd_buffer, s_iview->image,
5492 s_iview->vk.base_mip_level, 1,
5493 s_iview->vk.base_array_layer,
5494 gfx->layer_count,
5495 initial_stencil_layout,
5496 stencil_layout,
5497 hiz_clear);
5498 }
5499 }
5500
5501 if (is_multiview) {
5502 u_foreach_bit(view, gfx->view_mask) {
5503 uint32_t level = ds_iview->vk.base_mip_level;
5504 uint32_t layer = ds_iview->vk.base_array_layer + view;
5505
5506 if (hiz_clear) {
5507 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5508 clear_aspects,
5509 level, layer, 1,
5510 render_area, &clear_value);
5511 } else {
5512 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5513 clear_aspects,
5514 depth_aux_usage,
5515 level, layer, 1,
5516 render_area, &clear_value);
5517 }
5518 }
5519 } else {
5520 uint32_t level = ds_iview->vk.base_mip_level;
5521 uint32_t base_layer = ds_iview->vk.base_array_layer;
5522 uint32_t layer_count = gfx->layer_count;
5523
5524 if (hiz_clear) {
5525 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5526 clear_aspects,
5527 level, base_layer, layer_count,
5528 render_area, &clear_value);
5529 } else {
5530 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5531 clear_aspects,
5532 depth_aux_usage,
5533 level, base_layer, layer_count,
5534 render_area, &clear_value);
5535 }
5536 }
5537 } else {
5538 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5539 assert(depth_layout == initial_depth_layout);
5540 assert(stencil_layout == initial_stencil_layout);
5541 }
5542
5543 if (d_iview != NULL) {
5544 gfx->depth_att.vk_format = d_iview->vk.format;
5545 gfx->depth_att.iview = d_iview;
5546 gfx->depth_att.layout = depth_layout;
5547 gfx->depth_att.aux_usage = depth_aux_usage;
5548 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5549 assert(d_att->resolveImageView != VK_NULL_HANDLE);
5550 gfx->depth_att.resolve_mode = d_att->resolveMode;
5551 gfx->depth_att.resolve_iview =
5552 anv_image_view_from_handle(d_att->resolveImageView);
5553 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5554 }
5555 }
5556
5557 if (s_iview != NULL) {
5558 gfx->stencil_att.vk_format = s_iview->vk.format;
5559 gfx->stencil_att.iview = s_iview;
5560 gfx->stencil_att.layout = stencil_layout;
5561 gfx->stencil_att.aux_usage = stencil_aux_usage;
5562 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5563 assert(s_att->resolveImageView != VK_NULL_HANDLE);
5564 gfx->stencil_att.resolve_mode = s_att->resolveMode;
5565 gfx->stencil_att.resolve_iview =
5566 anv_image_view_from_handle(s_att->resolveImageView);
5567 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5568 }
5569 }
5570 }
5571
5572 /* Finally, now that we know the right size, set up the null surface */
5573 assert(util_bitcount(gfx->samples) <= 1);
5574 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5575 gfx->null_surface_state.map,
5576 .size = fb_size);
5577
5578 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5579 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5580 continue;
5581
5582 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5583 gfx->color_att[i].surface_state.state.map,
5584 .size = fb_size);
5585 }
5586
5587 /****** We can now start emitting code to begin the render pass ******/
5588
5589 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5590
5591 /* It is possible to start a render pass with an old pipeline. Because the
5592 * render pass and subpass index are both baked into the pipeline, this is
5593 * highly unlikely. In order to do so, it requires that you have a render
5594 * pass with a single subpass and that you use that render pass twice
5595 * back-to-back and use the same pipeline at the start of the second render
5596 * pass as at the end of the first. In order to avoid unpredictable issues
5597 * with this edge case, we just dirty the pipeline at the start of every
5598 * subpass.
5599 */
5600 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5601
5602 #if GFX_VER >= 11
5603 bool has_color_att = false;
5604 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5605 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) {
5606 has_color_att = true;
5607 break;
5608 }
5609 }
5610 if (has_color_att) {
5611 /* The PIPE_CONTROL command description says:
5612 *
5613 * "Whenever a Binding Table Index (BTI) used by a Render Target Message
5614 * points to a different RENDER_SURFACE_STATE, SW must issue a Render
5615 * Target Cache Flush by enabling this bit. When render target flush
5616 * is set due to new association of BTI, PS Scoreboard Stall bit must
5617 * be set in this packet."
5618 *
5619 * We assume that a new BeginRendering is always changing the RTs, which
5620 * may not be true and can cause excessive flushing. We can trivially skip it
5621 * in the case that there are no RTs (depth-only rendering), though.
5622 */
5623 anv_add_pending_pipe_bits(cmd_buffer,
5624 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5625 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
5626 "change RT");
5627 }
5628 #endif
5629
5630 cmd_buffer_emit_depth_stencil(cmd_buffer);
5631
5632 cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
5633 }
5634
5635 static void
5636 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5637 struct anv_attachment *att,
5638 VkImageAspectFlagBits aspect)
5639 {
5640 #if GFX_VER < 20
5641 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5642 const struct anv_image_view *iview = att->iview;
5643
5644 if (iview == NULL)
5645 return;
5646
5647 if (gfx->view_mask == 0) {
5648 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5649 aspect, att->aux_usage,
5650 iview->planes[0].isl.base_level,
5651 iview->planes[0].isl.base_array_layer,
5652 gfx->layer_count);
5653 } else {
5654 uint32_t res_view_mask = gfx->view_mask;
5655 while (res_view_mask) {
5656 int i = u_bit_scan(&res_view_mask);
5657
5658 const uint32_t level = iview->planes[0].isl.base_level;
5659 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5660
5661 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5662 aspect, att->aux_usage,
5663 level, layer, 1);
5664 }
5665 }
5666 #endif
5667 }
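/* On Gfx20+ the body above compiles out: the per-subresource aux/fast-clear
 * tracking that genX(cmd_buffer_mark_image_written)() maintains is only
 * needed on older generations. For multiview, each bit in view_mask marks
 * exactly one array layer as written, leaving the tracking state of the
 * other layers untouched.
 */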
5668
5669 void genX(CmdEndRendering)(
5670 VkCommandBuffer commandBuffer)
5671 {
5672 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5673 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5674
5675 if (anv_batch_has_error(&cmd_buffer->batch))
5676 return;
5677
5678 const bool is_multiview = gfx->view_mask != 0;
5679 const uint32_t layers =
5680 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5681
5682 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5683 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5684 VK_IMAGE_ASPECT_COLOR_BIT);
5685 }
5686
5687 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5688 VK_IMAGE_ASPECT_DEPTH_BIT);
5689
5690 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5691 VK_IMAGE_ASPECT_STENCIL_BIT);
5692
5693
5694 if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5695 bool has_color_resolve = false;
5696 bool has_sparse_color_resolve = false;
5697
5698 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5699 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE) {
5700 has_color_resolve = true;
5701 if (anv_image_is_sparse(gfx->color_att[i].iview->image))
5702 has_sparse_color_resolve = true;
5703 }
5704 }
5705
5706 if (has_color_resolve) {
5707 /* We are about to do some MSAA resolves. We need to flush so that
5708 * the result of writes to the MSAA color attachments show up in the
5709 * sampler when we blit to the single-sampled resolve target.
5710 */
5711 anv_add_pending_pipe_bits(cmd_buffer,
5712 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5713 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5714 "MSAA resolve");
5715 }
5716
5717 const bool has_depth_resolve =
5718 gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE;
5719 const bool has_stencil_resolve =
5720 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE;
5721 const bool has_sparse_depth_resolve =
5722 has_depth_resolve &&
5723 anv_image_is_sparse(gfx->depth_att.iview->image);
5724 const bool has_sparse_stencil_resolve =
5725 has_stencil_resolve &&
5726 anv_image_is_sparse(gfx->stencil_att.iview->image);
5727
5728 if (has_depth_resolve || has_stencil_resolve) {
5729 /* We are about to do some MSAA resolves. We need to flush so that
5730 * the result of writes to the MSAA depth attachments show up in the
5731 * sampler when we blit to the single-sampled resolve target.
5732 */
5733 anv_add_pending_pipe_bits(cmd_buffer,
5734 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5735 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5736 "MSAA resolve");
5737 }
5738
5739 if (has_sparse_color_resolve || has_sparse_depth_resolve ||
5740 has_sparse_stencil_resolve) {
5741 /* If the resolve image is sparse we need some extra bits to make
5742 * sure unbound regions read 0, as residencyNonResidentStrict
5743 * mandates.
5744 */
5745 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT,
5746 "sparse MSAA resolve");
5747 }
5748
5749 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5750 const struct anv_attachment *att = &gfx->color_att[i];
5751 if (att->resolve_mode == VK_RESOLVE_MODE_NONE)
5752 continue;
5753
5754 anv_attachment_msaa_resolve(cmd_buffer, att, att->layout,
5755 VK_IMAGE_ASPECT_COLOR_BIT);
5756 }
5757
5758 if (has_depth_resolve) {
5759 const struct anv_image_view *src_iview = gfx->depth_att.iview;
5760
5761 /* MSAA resolves sample from the source attachment. Transition the
5762 * depth attachment first to get rid of any HiZ that we may not be
5763 * able to handle.
5764 */
5765 transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
5766 src_iview->planes[0].isl.base_array_layer,
5767 layers,
5768 gfx->depth_att.layout,
5769 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5770 false /* will_full_fast_clear */);
5771
5772 anv_attachment_msaa_resolve(cmd_buffer, &gfx->depth_att,
5773 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5774 VK_IMAGE_ASPECT_DEPTH_BIT);
5775
5776 /* Transition the source back to the original layout. This seems a
5777 * bit inefficient but, since HiZ resolves aren't destructive, going
5778 * from less HiZ to more is generally a no-op.
5779 */
5780 transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
5781 src_iview->planes[0].isl.base_array_layer,
5782 layers,
5783 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5784 gfx->depth_att.layout,
5785 false /* will_full_fast_clear */);
5786 }
5787
5788 if (has_stencil_resolve) {
5789 anv_attachment_msaa_resolve(cmd_buffer, &gfx->stencil_att,
5790 gfx->stencil_att.layout,
5791 VK_IMAGE_ASPECT_STENCIL_BIT);
5792 }
5793 }
5794
5795 trace_intel_end_render_pass(&cmd_buffer->trace,
5796 gfx->render_area.extent.width,
5797 gfx->render_area.extent.height,
5798 gfx->color_att_count,
5799 gfx->samples);
5800
5801 anv_cmd_buffer_reset_rendering(cmd_buffer);
5802 }
5803
5804 void
5805 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5806 {
5807 struct mi_builder b;
5808 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5809
5810 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5811 mi_reg32(ANV_PREDICATE_RESULT_REG));
5812 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5813
5814 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5815 mip.LoadOperation = LOAD_LOADINV;
5816 mip.CombineOperation = COMBINE_SET;
5817 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5818 }
5819 }
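/* The sequence above loads the precomputed 0/1 result into MI_PREDICATE_SRC0
 * and compares it against MI_PREDICATE_SRC1 = 0; because of LOAD_LOADINV the
 * inverted equality is stored, so the hardware predicate ends up reflecting
 * "result != 0" for the subsequent predicated draw/dispatch commands.
 */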
5820
5821 void genX(CmdBeginConditionalRenderingEXT)(
5822 VkCommandBuffer commandBuffer,
5823 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5824 {
5825 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5826 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5827 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5828 struct anv_address value_address =
5829 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5830
5831 const bool isInverted = pConditionalRenderingBegin->flags &
5832 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5833
5834 cmd_state->conditional_render_enabled = true;
5835
5836 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5837
5838 struct mi_builder b;
5839 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5840 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
5841 mi_builder_set_mocs(&b, mocs);
5842
5843 /* Section 19.4 of the Vulkan 1.1.85 spec says:
5844 *
5845 * If the value of the predicate in buffer memory changes
5846 * while conditional rendering is active, the rendering commands
5847 * may be discarded in an implementation-dependent way.
5848 * Some implementations may latch the value of the predicate
5849 * upon beginning conditional rendering while others
5850 * may read it before every rendering command.
5851 *
5852 * So it's perfectly fine to read a value from the buffer once.
5853 */
5854 struct mi_value value = mi_mem32(value_address);
5855
5856 /* Precompute the predicate result; this is necessary to support secondary
5857 * command buffers since it is unknown whether conditional rendering is
5858 * inverted when they are populated.
5859 */
5860 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5861 isInverted ? mi_uge(&b, mi_imm(0), value) :
5862 mi_ult(&b, mi_imm(0), value));
5863 }
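/* In other words, the value stored in ANV_PREDICATE_RESULT_REG is:
 *    non-inverted:  (0 <  *value), i.e. draw when the predicate is non-zero
 *    inverted:      (0 >= *value), i.e. draw when the predicate is zero
 * so secondary command buffers can always use the same MI_PREDICATE sequence
 * from genX(cmd_emit_conditional_render_predicate)() regardless of the flag.
 */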
5864
5865 void genX(CmdEndConditionalRenderingEXT)(
5866 VkCommandBuffer commandBuffer)
5867 {
5868 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5869 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5870
5871 cmd_state->conditional_render_enabled = false;
5872 }
5873
5874 /* Set of stage bits that are pipelined, i.e. the corresponding work gets
5875 * queued by the command streamer for later execution.
5876 */
5877 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5878 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5879 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5880 VK_PIPELINE_STAGE_2_HOST_BIT | \
5881 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
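/* Roughly speaking, the stages excluded above are completed by the command
 * streamer itself (or on the host) by the time the event write is parsed,
 * so signaling an event for them does not require the CS/pixel-scoreboard
 * stalls added for pipelined stages below.
 */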
5882
5883 void genX(CmdSetEvent2)(
5884 VkCommandBuffer commandBuffer,
5885 VkEvent _event,
5886 const VkDependencyInfo* pDependencyInfo)
5887 {
5888 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5889 ANV_FROM_HANDLE(anv_event, event, _event);
5890
5891 if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5892 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5893 flush.PostSyncOperation = WriteImmediateData;
5894 flush.Address = anv_state_pool_state_address(
5895 &cmd_buffer->device->dynamic_state_pool,
5896 event->state);
5897 flush.ImmediateData = VK_EVENT_SET;
5898 }
5899 return;
5900 }
5901
5902 VkPipelineStageFlags2 src_stages = 0;
5903
5904 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5905 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5906 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5907 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5908 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5909 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5910
5911 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5912 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5913
5914 enum anv_pipe_bits pc_bits = 0;
5915 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5916 pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5917 pc_bits |= ANV_PIPE_CS_STALL_BIT;
5918 }
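/* If any pipelined work precedes the event, the immediate data write below
 * must not land until that work has drained, hence the CS stall and stall
 * at pixel scoreboard; otherwise a plain post-sync write is sufficient.
 */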
5919
5920 genx_batch_emit_pipe_control_write
5921 (&cmd_buffer->batch, cmd_buffer->device->info,
5922 cmd_buffer->state.current_pipeline, WriteImmediateData,
5923 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5924 event->state),
5925 VK_EVENT_SET, pc_bits);
5926 }
5927
5928 void genX(CmdResetEvent2)(
5929 VkCommandBuffer commandBuffer,
5930 VkEvent _event,
5931 VkPipelineStageFlags2 stageMask)
5932 {
5933 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5934 ANV_FROM_HANDLE(anv_event, event, _event);
5935
5936 if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5937 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5938 flush.PostSyncOperation = WriteImmediateData;
5939 flush.Address = anv_state_pool_state_address(
5940 &cmd_buffer->device->dynamic_state_pool,
5941 event->state);
5942 flush.ImmediateData = VK_EVENT_RESET;
5943 }
5944 return;
5945 }
5946
5947 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5948 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5949
5950 enum anv_pipe_bits pc_bits = 0;
5951 if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5952 pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5953 pc_bits |= ANV_PIPE_CS_STALL_BIT;
5954 }
5955
5956 genx_batch_emit_pipe_control_write
5957 (&cmd_buffer->batch, cmd_buffer->device->info,
5958 cmd_buffer->state.current_pipeline, WriteImmediateData,
5959 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5960 event->state),
5961 VK_EVENT_RESET,
5962 pc_bits);
5963 }
5964
5965 void genX(CmdWaitEvents2)(
5966 VkCommandBuffer commandBuffer,
5967 uint32_t eventCount,
5968 const VkEvent* pEvents,
5969 const VkDependencyInfo* pDependencyInfos)
5970 {
5971 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5972
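/* For each event, poll its dword in the dynamic state pool with an
 * MI_SEMAPHORE_WAIT until it reads VK_EVENT_SET, then apply the
 * dependency info as a regular barrier below.
 */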
5973 for (uint32_t i = 0; i < eventCount; i++) {
5974 ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
5975
5976 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5977 sem.WaitMode = PollingMode;
5978 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
5979 sem.SemaphoreDataDword = VK_EVENT_SET;
5980 sem.SemaphoreAddress = anv_state_pool_state_address(
5981 &cmd_buffer->device->dynamic_state_pool,
5982 event->state);
5983 }
5984 }
5985
5986 cmd_buffer_barrier(cmd_buffer, eventCount, pDependencyInfos, "wait event");
5987 }
5988
5989 static uint32_t vk_to_intel_index_type(VkIndexType type)
5990 {
5991 switch (type) {
5992 case VK_INDEX_TYPE_UINT8_KHR:
5993 return INDEX_BYTE;
5994 case VK_INDEX_TYPE_UINT16:
5995 return INDEX_WORD;
5996 case VK_INDEX_TYPE_UINT32:
5997 return INDEX_DWORD;
5998 default:
5999 unreachable("invalid index type");
6000 }
6001 }
6002
6003 void genX(CmdBindIndexBuffer2KHR)(
6004 VkCommandBuffer commandBuffer,
6005 VkBuffer _buffer,
6006 VkDeviceSize offset,
6007 VkDeviceSize size,
6008 VkIndexType indexType)
6009 {
6010 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6011 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
6012
6013 uint32_t restart_index = vk_index_to_restart(indexType);
6014 if (cmd_buffer->state.gfx.restart_index != restart_index) {
6015 cmd_buffer->state.gfx.restart_index = restart_index;
6016 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
6017 }
6018
6019 uint32_t index_type = vk_to_intel_index_type(indexType);
6020 if (cmd_buffer->state.gfx.index_buffer != buffer ||
6021 cmd_buffer->state.gfx.index_type != index_type ||
6022 cmd_buffer->state.gfx.index_offset != offset) {
6023 cmd_buffer->state.gfx.index_buffer = buffer;
6024 cmd_buffer->state.gfx.index_type = index_type;
6025 cmd_buffer->state.gfx.index_offset = offset;
6026 cmd_buffer->state.gfx.index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
6027 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
6028 }
6029 }
6030
6031 VkResult genX(CmdSetPerformanceOverrideINTEL)(
6032 VkCommandBuffer commandBuffer,
6033 const VkPerformanceOverrideInfoINTEL* pOverrideInfo)
6034 {
6035 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6036
6037 switch (pOverrideInfo->type) {
6038 case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
6039 anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
6040 csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
6041 csdm2.MediaInstructionDisable = pOverrideInfo->enable;
6042 csdm2._3DRenderingInstructionDisableMask = true;
6043 csdm2.MediaInstructionDisableMask = true;
6044 }
6045 break;
6046 }
6047
6048 case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
6049 if (pOverrideInfo->enable) {
6050 /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
6051 anv_add_pending_pipe_bits(cmd_buffer,
6052 ANV_PIPE_FLUSH_BITS |
6053 ANV_PIPE_INVALIDATE_BITS,
6054 "perf counter isolation");
6055 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6056 }
6057 break;
6058
6059 default:
6060 unreachable("Invalid override");
6061 }
6062
6063 return VK_SUCCESS;
6064 }
6065
6066 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
6067 VkCommandBuffer commandBuffer,
6068 const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo)
6069 {
6070 /* TODO: Waiting on the register to write, might depend on generation. */
6071
6072 return VK_SUCCESS;
6073 }
6074
6075 #define TIMESTAMP 0x2358
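/* MMIO offset of the command streamer TIMESTAMP register, read below with
 * an MI register-to-memory store for the top-of-pipe capture path.
 */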
6076
6077 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
6078 struct anv_device *device,
6079 struct anv_address addr,
6080 enum anv_timestamp_capture_type type,
6081 void *data) {
6082 /* Make sure the ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
6083 * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on the
6084 * copy or video queues.
6085 */
6086 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
6087 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
6088 assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
6089 type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
6090 }
6091
6092 switch (type) {
6093 case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
6094 struct mi_builder b;
6095 mi_builder_init(&b, device->info, batch);
6096 mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
6097 break;
6098 }
6099
6100 case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
6101 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
6102 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
6103 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6104 if (intel_needs_workaround(device->info, 16018063123))
6105 genX(batch_emit_fast_color_dummy_blit)(batch, device);
6106 anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
6107 fd.PostSyncOperation = WriteTimestamp;
6108 fd.Address = addr;
6109 }
6110 } else {
6111 genx_batch_emit_pipe_control_write(batch, device->info, 0,
6112 WriteTimestamp, addr, 0, 0);
6113 }
6114 break;
6115 }
6116
6117 case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
6118 genx_batch_emit_pipe_control_write
6119 (batch, device->info, 0, WriteTimestamp, addr, 0,
6120 ANV_PIPE_CS_STALL_BIT);
6121 break;
6122
6123 #if GFX_VERx10 >= 125
6124 case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
6125 uint32_t dwords[GENX(COMPUTE_WALKER_length)];
6126
6127 GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
6128 .PostSync = (struct GENX(POSTSYNC_DATA)) {
6129 .Operation = WriteTimestamp,
6130 .DestinationAddress = addr,
6131 .MOCS = anv_mocs(device, NULL, 0),
6132 },
6133 });
6134
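/* OR the packed post-sync dwords into the caller-provided COMPUTE_WALKER;
 * only the non-zero dwords (the POSTSYNC_DATA fields set above) are
 * touched, leaving the rest of the original walker intact.
 */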
6135 for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) {
6136 if (dwords[i])
6137 ((uint32_t *)data)[i] |= dwords[i];
6138 }
6139 break;
6140 }
6141
6142 case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
6143 uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
6144
6145 GENX(EXECUTE_INDIRECT_DISPATCH_pack)
6146 (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
6147 .MOCS = anv_mocs(device, NULL, 0),
6148 .COMPUTE_WALKER_BODY = {
6149 .PostSync = (struct GENX(POSTSYNC_DATA)) {
6150 .Operation = WriteTimestamp,
6151 .DestinationAddress = addr,
6152 .MOCS = anv_mocs(device, NULL, 0),
6153 },
6154 }
6155 });
6156
6157 for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) {
6158 if (dwords[i])
6159 ((uint32_t *)data)[i] |= dwords[i];
6160 }
6161 break;
6162 }
6163 #endif
6164
6165 default:
6166 unreachable("invalid");
6167 }
6168 }
6169
6170 void genX(cmd_capture_data)(struct anv_batch *batch,
6171 struct anv_device *device,
6172 struct anv_address dst_addr,
6173 struct anv_address src_addr,
6174 uint32_t size_B) {
6175 struct mi_builder b;
6176 mi_builder_init(&b, device->info, batch);
6177 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
6178 mi_memcpy(&b, dst_addr, src_addr, size_B);
6179 }
6180
6181 void genX(batch_emit_secondary_call)(struct anv_batch *batch,
6182 struct anv_device *device,
6183 struct anv_address secondary_addr,
6184 struct anv_address secondary_return_addr)
6185 {
6186 struct mi_builder b;
6187 mi_builder_init(&b, device->info, batch);
6188 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
6189 /* Make sure the write in the batch buffer lands before we just execute the
6190 * jump.
6191 */
6192 mi_builder_set_write_check(&b, true);
6193
6194 /* Emit a write to change the return address of the secondary */
6195 struct mi_reloc_imm_token reloc =
6196 mi_store_relocated_imm(&b, mi_mem64(secondary_return_addr));
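/* secondary_return_addr is expected to point at the address operand of the
 * MI_BATCH_BUFFER_START that ends the secondary (see genX(batch_emit_return)
 * below); the placeholder written here gets patched with the real return
 * target once it is known.
 */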
6197
6198 /* Ensure the write has landed before the CS reads the address written
6199 * above.
6200 */
6201 mi_ensure_write_fence(&b);
6202
6203 #if GFX_VER >= 12
6204 /* Disable prefetcher before jumping into a secondary */
6205 anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
6206 arb.PreParserDisableMask = true;
6207 arb.PreParserDisable = true;
6208 }
6209 #endif
6210
6211 /* Jump into the secondary */
6212 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
6213 bbs.AddressSpaceIndicator = ASI_PPGTT;
6214 bbs.SecondLevelBatchBuffer = Firstlevelbatch;
6215 bbs.BatchBufferStartAddress = secondary_addr;
6216 }
6217
6218 /* Replace the return address written by the MI_STORE_DATA_IMM above with
6219 * the primary's current batch address (immediately after the jump).
6220 */
6221 mi_relocate_store_imm(reloc,
6222 anv_address_physical(
6223 anv_batch_current_address(batch)));
6224 }
6225
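/* Emit an MI_BATCH_BUFFER_START whose start address is left unfilled and
 * return a pointer to the packed dwords, presumably so the caller can
 * locate the address field that genX(batch_emit_secondary_call) rewrites at
 * execution time.
 */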
6226 void *
6227 genX(batch_emit_return)(struct anv_batch *batch)
6228 {
6229 return anv_batch_emitn(batch,
6230 GENX(MI_BATCH_BUFFER_START_length),
6231 GENX(MI_BATCH_BUFFER_START),
6232 .AddressSpaceIndicator = ASI_PPGTT,
6233 .SecondLevelBatchBuffer = Firstlevelbatch);
6234 }
6235
6236 void
6237 genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
6238 const struct anv_device *device,
6239 uint32_t primitive_topology,
6240 uint32_t vertex_count)
6241 {
6242 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
6243 if (intel_needs_workaround(device->info, 22014412737) &&
6244 (primitive_topology == _3DPRIM_POINTLIST ||
6245 primitive_topology == _3DPRIM_LINELIST ||
6246 primitive_topology == _3DPRIM_LINESTRIP ||
6247 primitive_topology == _3DPRIM_LINELIST_ADJ ||
6248 primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
6249 primitive_topology == _3DPRIM_LINELOOP ||
6250 primitive_topology == _3DPRIM_POINTLIST_BF ||
6251 primitive_topology == _3DPRIM_LINESTRIP_CONT ||
6252 primitive_topology == _3DPRIM_LINESTRIP_BF ||
6253 primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
6254 (vertex_count == 1 || vertex_count == 2)) {
6255 genx_batch_emit_pipe_control_write
6256 (batch, device->info, 0, WriteImmediateData,
6257 device->workaround_address, 0, 0);
6258
6259 /* Reset counter because we just emitted a PC */
6260 batch->num_3d_primitives_emitted = 0;
6261 } else if (intel_needs_workaround(device->info, 16014538804)) {
6262 batch->num_3d_primitives_emitted++;
6263 /* Wa_16014538804:
6264 * After every 3 3D_Primitive commands,
6265 * at least 1 PIPE_CONTROL must be inserted.
6266 */
6267 if (batch->num_3d_primitives_emitted == 3) {
6268 anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
6269 batch->num_3d_primitives_emitted = 0;
6270 }
6271 }
6272 #endif
6273 }
6274
6275 /* Wa_16018063123 */
6276 ALWAYS_INLINE void
6277 genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
6278 struct anv_device *device)
6279 {
6280 #if GFX_VERx10 >= 125
6281 anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6282 blt.DestinationBaseAddress = device->workaround_address;
6283 blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
6284 blt.DestinationPitch = 63;
6285 blt.DestinationX2 = 1;
6286 blt.DestinationY2 = 4;
6287 blt.DestinationSurfaceWidth = 1;
6288 blt.DestinationSurfaceHeight = 4;
6289 blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6290 blt.DestinationSurfaceQPitch = 4;
6291 blt.DestinationTiling = XY_TILE_LINEAR;
6292 }
6293 #endif
6294 }
6295
6296 void
6297 genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
6298 const struct intel_urb_config *urb_cfg)
6299 {
6300 #if INTEL_NEEDS_WA_16014912113
6301 const struct intel_urb_config *current =
6302 &cmd_buffer->state.gfx.urb_cfg;
6303 if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
6304 current->size[0] != 0) {
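/* Wa_16014912113: before switching to a URB layout with a different TES
 * allocation, re-emit the previous configuration with a minimal entry
 * count (256 for VS, 0 for the other stages) followed by an HDC pipeline
 * flush, as done below.
 */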
6305 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
6306 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
6307 urb._3DCommandSubOpcode += i;
6308 urb.VSURBStartingAddress = current->start[i];
6309 urb.VSURBEntryAllocationSize = current->size[i] - 1;
6310 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
6311 }
6312 }
6313 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6314 pc.HDCPipelineFlushEnable = true;
6315 }
6316 }
6317 #endif
6318 }
6319
6320 struct anv_state
6321 genX(cmd_buffer_begin_companion_rcs_syncpoint)(
6322 struct anv_cmd_buffer *cmd_buffer)
6323 {
6324 #if GFX_VERx10 >= 125
6325 const struct intel_device_info *info = cmd_buffer->device->info;
6326 struct anv_state syncpoint =
6327 anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
6328 struct anv_address xcs_wait_addr =
6329 anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
6330 struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
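/* The 2-dword syncpoint is laid out as { xcs_wait, rcs_wait }: the main
 * (compute/blitter) batch polls the first dword, which the companion RCS
 * sets at the end syncpoint, while the companion RCS polls the second
 * dword, which the main batch sets below.
 */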
6331
6332 /* Reset the sync point */
6333 memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
6334
6335 struct mi_builder b;
6336
6337 /* On CCS:
6338 * - flush all caches & invalidate
6339 * - unblock RCS
6340 * - wait on RCS to complete
6341 * - clear the value we waited on
6342 */
6343
6344 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
6345 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_FLUSH_BITS |
6346 ANV_PIPE_INVALIDATE_BITS |
6347 ANV_PIPE_STALL_BITS,
6348 "post main cmd buffer invalidate");
6349 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6350 } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
6351 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6352 if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
6353 genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
6354 cmd_buffer->device);
6355 }
6356 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
6357 fd.FlushCCS = true; /* Maybe handle Flush LLC */
6358 }
6359 }
6360
6361 {
6362 mi_builder_init(&b, info, &cmd_buffer->batch);
6363 mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
6364 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6365 sem.WaitMode = PollingMode;
6366 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6367 sem.SemaphoreDataDword = 0x1;
6368 sem.SemaphoreAddress = xcs_wait_addr;
6369 }
6370 /* Make sure to reset the semaphore in case the command buffer is run
6371 * multiple times.
6372 */
6373 mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
6374 }
6375
6376 /* On RCS:
6377 * - wait on CCS signal
6378 * - clear the value we waited on
6379 */
6380 {
6381 mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
6382 anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
6383 GENX(MI_SEMAPHORE_WAIT),
6384 sem) {
6385 sem.WaitMode = PollingMode;
6386 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6387 sem.SemaphoreDataDword = 0x1;
6388 sem.SemaphoreAddress = rcs_wait_addr;
6389 }
6390 /* Make sure to reset the semaphore in case the command buffer is run
6391 * multiple times.
6392 */
6393 mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
6394 }
6395
6396 return syncpoint;
6397 #else
6398 unreachable("Not implemented");
6399 #endif
6400 }
6401
6402 void
6403 genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
6404 struct anv_state syncpoint)
6405 {
6406 #if GFX_VERx10 >= 125
6407 struct anv_address xcs_wait_addr =
6408 anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
6409
6410 struct mi_builder b;
6411
6412 /* On RCS:
6413 * - flush all caches & invalidate
6414 * - unblock the CCS
6415 */
6416 anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
6417 ANV_PIPE_FLUSH_BITS |
6418 ANV_PIPE_INVALIDATE_BITS |
6419 ANV_PIPE_STALL_BITS,
6420 "post rcs flush");
6421 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
6422
6423 mi_builder_init(&b, cmd_buffer->device->info,
6424 &cmd_buffer->companion_rcs_cmd_buffer->batch);
6425 mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
6426 #else
6427 unreachable("Not implemented");
6428 #endif
6429 }
6430
6431 void
6432 genX(write_trtt_entries)(struct anv_async_submit *submit,
6433 struct anv_trtt_bind *l3l2_binds,
6434 uint32_t n_l3l2_binds,
6435 struct anv_trtt_bind *l1_binds,
6436 uint32_t n_l1_binds)
6437 {
6438 #if GFX_VER >= 12
6439 const struct intel_device_info *devinfo =
6440 submit->queue->device->info;
6441 struct anv_batch *batch = &submit->batch;
6442
6443 /* BSpec says:
6444 * "DWord Length programmed must not exceed 0x3FE."
6445 * For a single dword write the programmed length is 2, and for a single
6446 * qword it's 3. This is the value actually written to the command's length
6447 * field, i.e. it does not include the length bias.
6448 */
6449 uint32_t dword_write_len = 2;
6450 uint32_t qword_write_len = 3;
6451 uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
6452 uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
6453
6454 /* What makes the code below quite complicated is the fact that we can
6455 * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
6456 * contiguous addresses.
6457 */
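/* L3/L2 entries are written as 64-bit qwords, while L1 entries are single
 * dwords holding the entry address shifted right by 16, which is why only
 * the first loop sets StoreQword.
 */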
6458
6459 for (uint32_t i = 0; i < n_l3l2_binds; i++) {
6460 int extra_writes = 0;
6461 for (uint32_t j = i + 1;
6462 j < n_l3l2_binds && extra_writes <= max_qword_extra_writes;
6463 j++) {
6464 if (l3l2_binds[i].pte_addr + (j - i) * 8 == l3l2_binds[j].pte_addr) {
6465 extra_writes++;
6466 } else {
6467 break;
6468 }
6469 }
6470 bool is_last_write = n_l1_binds == 0 &&
6471 i + extra_writes + 1 == n_l3l2_binds;
6472
6473 uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
6474 qword_write_len + (extra_writes * 2);
6475 uint32_t *dw;
6476 dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
6477 .ForceWriteCompletionCheck = is_last_write,
6478 .StoreQword = true,
6479 .Address = anv_address_from_u64(l3l2_binds[i].pte_addr),
6480 );
6481 dw += 3;
6482 for (uint32_t j = 0; j < extra_writes + 1; j++) {
6483 uint64_t entry_addr_64b = l3l2_binds[i + j].entry_addr;
6484 *dw = entry_addr_64b & 0xFFFFFFFF;
6485 dw++;
6486 *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
6487 dw++;
6488 }
6489 assert(dw == batch->next);
6490
6491 i += extra_writes;
6492 }
6493
6494 for (uint32_t i = 0; i < n_l1_binds; i++) {
6495 int extra_writes = 0;
6496 for (uint32_t j = i + 1;
6497 j < n_l1_binds && extra_writes <= max_dword_extra_writes;
6498 j++) {
6499 if (l1_binds[i].pte_addr + (j - i) * 4 ==
6500 l1_binds[j].pte_addr) {
6501 extra_writes++;
6502 } else {
6503 break;
6504 }
6505 }
6506
6507 bool is_last_write = i + extra_writes + 1 == n_l1_binds;
6508
6509 uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
6510 dword_write_len + extra_writes;
6511 uint32_t *dw;
6512 dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
6513 .ForceWriteCompletionCheck = is_last_write,
6514 .Address = anv_address_from_u64(l1_binds[i].pte_addr),
6515 );
6516 dw += 3;
6517 for (uint32_t j = 0; j < extra_writes + 1; j++) {
6518 *dw = (l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
6519 dw++;
6520 }
6521 assert(dw == batch->next);
6522
6523 i += extra_writes;
6524 }
6525
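/* After rewriting page table entries, stall the CS and invalidate the TLBs
 * so that no stale translations are used by subsequent work.
 */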
6526 genx_batch_emit_pipe_control(batch, devinfo, _3D,
6527 ANV_PIPE_CS_STALL_BIT |
6528 ANV_PIPE_TLB_INVALIDATE_BIT);
6529 #else
6530 unreachable("Not implemented");
6531 #endif
6532 }
6533
6534 void
6535 genX(async_submit_end)(struct anv_async_submit *submit)
6536 {
6537 struct anv_batch *batch = &submit->batch;
6538 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
6539 }
6540
6541 void
6542 genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
6543 VkPipelineStageFlags2 stage,
6544 VkBuffer dstBuffer,
6545 VkDeviceSize dstOffset,
6546 uint32_t marker)
6547 {
6548 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6549 ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
6550
6551 /* The barriers inserted by the application to make dstBuffer writable
6552 * should already have the L1/L2 cache flushes. On platforms where the
6553 * command streamer is not coherent with L3, we need an additional set of
6554 * cache flushes.
6555 */
6556 enum anv_pipe_bits bits =
6557 (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
6558 (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
6559 ANV_PIPE_END_OF_PIPE_SYNC_BIT;
6560
6561 trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
6562
6563 anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
6564 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6565
6566 struct mi_builder b;
6567 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
6568
6569 /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
6570 * would be the logical way to implement this extension, as it could
6571 * do a pipelined marker write. Unfortunately, it requires writing
6572 * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
6573 * 32-bit value. MI_STORE_DATA_IMM is the only good way to do that,
6574 * and unfortunately it requires stalling.
6575 */
6576 mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
6577 mi_imm(marker));
6578
6579 trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
6580 }
6581