1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "anv_private.h"
25 
26 #include "common/intel_aux_map.h"
27 #include "common/intel_sample_positions.h"
28 #include "common/intel_pixel_hash.h"
29 #include "genxml/gen_macros.h"
30 #include "genxml/genX_pack.h"
31 
32 #include "vk_standard_sample_locations.h"
33 
34 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
35 #include "grl/genX_grl.h"
36 #endif
37 
38 #include "genX_mi_builder.h"
39 
40 #include "vk_util.h"
41 #include "vk_format.h"
42 
43 static void
44 genX(emit_slice_hashing_state)(struct anv_device *device,
45                                struct anv_batch *batch)
46 {
47 #if GFX_VER == 11
48    /* Gfx11 hardware has two pixel pipes at most. */
49    for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
50       assert(device->info->ppipe_subslices[i] == 0);
51 
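   /* If both pixel pipes expose the same number of subslices, the default
    * (symmetric) hashing should already be balanced, so no custom table is
    * needed.
    */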
52    if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
53      return;
54 
55    if (!device->slice_hash.alloc_size) {
56       unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
57       device->slice_hash =
58          anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
59 
60       const bool flip = device->info->ppipe_subslices[0] <
61                      device->info->ppipe_subslices[1];
62       struct GENX(SLICE_HASH_TABLE) table;
63       intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
64 
65       GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
66    }
67 
68    anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
69       ptr.SliceHashStatePointerValid = true;
70       ptr.SliceHashTableStatePointer = device->slice_hash.offset;
71    }
72 
73    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
74       mode.SliceHashingTableEnable = true;
75    }
76 #elif GFX_VERx10 == 120
77    /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
78     * present with n active dual subslices.
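    * For example, a fused configuration with ppipe_subslices = {2, 2, 1}
    * would give ppipes_of = {0, 1, 2}.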
79     */
80    unsigned ppipes_of[3] = {};
81 
82    for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
83       for (unsigned p = 0; p < 3; p++)
84          ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
85    }
86 
87    /* Gfx12 has three pixel pipes. */
88    for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
89       assert(device->info->ppipe_subslices[p] == 0);
90 
91    if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
92       /* All three pixel pipes have the maximum number of active dual
93        * subslices, or there is only one active pixel pipe: Nothing to do.
94        */
95       return;
96    }
97 
98    anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
99       p.SliceHashControl[0] = TABLE_0;
100 
101       if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
102          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
103       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
104          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
105 
106       if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
107          intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
108       else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
109          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
110       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
111          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
112       else
113          unreachable("Illegal fusing.");
114    }
115 
116    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
117       p.SubsliceHashingTableEnable = true;
118       p.SubsliceHashingTableEnableMask = true;
119    }
120 #elif GFX_VERx10 == 125
121    /* Calculate the set of present pixel pipes, and another set of
122     * present pixel pipes with 2 dual subslices enabled; the latter
123     * will appear on the hashing table with twice the frequency of
124     * pixel pipes with a single dual subslice present.
125     */
126    uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
127    for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
128       if (device->info->ppipe_subslices[p] > 0)
129          ppipe_mask1 |= (1u << p);
130       if (device->info->ppipe_subslices[p] > 1)
131          ppipe_mask2 |= (1u << p);
132    }
133    assert(ppipe_mask1);
134 
135    if (!device->slice_hash.alloc_size) {
136       unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
137       device->slice_hash =
138          anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
139 
140       struct GENX(SLICE_HASH_TABLE) table;
141 
142       /* Note that the hardware expects an array of 7 tables, one for each
143        * possible slice count between 2 and 8, with each table specifying
144        * the pixel pipe hashing behavior for that slice count. However,
145        * that doesn't actually work, among other reasons due to hardware
146        * bugs that cause the GPU to erroneously access the table at the
147        * wrong index in some cases, so in practice all 7 tables need to be
148        * initialized to the same value.
149        */
150       for (unsigned i = 0; i < 7; i++)
151          intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
152                                              table.Entry[i][0]);
153 
154       GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
155    }
156 
157    anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
158       ptr.SliceHashStatePointerValid = true;
159       ptr.SliceHashTableStatePointer = device->slice_hash.offset;
160    }
161 
162    /* TODO: Figure out FCV support for other platforms.
163     * Testing indicates that FCV is broken on gfx125.
164     * Let's disable FCV for now until we figure out what's wrong.
165     *
166     * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
167     *
168     * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
169     * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10318
170     * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10795
171     * Ref: Internal issue 1480 about Unreal Engine 5.1
172     */
173    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
174       mode.SliceHashingTableEnable = true;
175       mode.SliceHashingTableEnableMask = true;
176       mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
177 				    hashing32x32 : NormalMode);
178       mode.CrossSliceHashingModeMask = -1;
179       mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
180       mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
181    }
182 #endif
183 }
184 
185 static void
186 init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
187 {
188    UNUSED struct anv_device *device = queue->device;
189 
190 #if GFX_VER >= 11
191    /* Starting with GFX version 11, SLM is no longer part of the L3$ config
192     * so it never changes throughout the lifetime of the VkDevice.
193     */
194    const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
195    genX(emit_l3_config)(batch, device, cfg);
196    device->l3_config = cfg;
197 #endif
198 
199 #if GFX_VERx10 == 125
200    /* Even though L3 partial write merging is supposed to be enabled
201     * by default on Gfx12.5 according to the hardware spec, i915
202     * appears to accidentally clear the enables during context
203     * initialization, so make sure to enable them here since partial
204     * write merging has a large impact on rendering performance.
205     */
206    anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
207       reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
208       reg.CompressiblePartialWriteMergeEnable = true;
209       reg.CoherentPartialWriteMergeEnable = true;
210       reg.CrossTilePartialWriteMergeEnable = true;
211    }
212 #endif
213 
214    /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
215     * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
216     */
217 #if GFX_VER >= 12
218 
219 #if GFX_VERx10 >= 125
220    /* Wa_14016407139:
221     *
222     * "On Surface state base address modification, for 3D workloads, SW must
223     *  always program PIPE_CONTROL either with CS Stall or PS sync stall. In
224     *  both the cases set Render Target Cache Flush Enable".
225     */
226    genx_batch_emit_pipe_control(batch, device->info,
227                                 0,
228                                 ANV_PIPE_CS_STALL_BIT |
229                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
230 #endif
231 
232    /* GEN:BUG:1607854226:
233     *
234     *  Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
235     *  Fortunately, we always start the context off in 3D mode.
236     */
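   /* Note that the various *BufferSize fields below are expressed in 4KB
    * pages, hence the divisions by 4096.
    */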
237    uint32_t mocs = device->isl_dev.mocs.internal;
238    anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
239       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
240       sba.GeneralStateBufferSize  = 0xfffff;
241       sba.GeneralStateMOCS = mocs;
242       sba.GeneralStateBaseAddressModifyEnable = true;
243       sba.GeneralStateBufferSizeModifyEnable = true;
244 
245       sba.StatelessDataPortAccessMOCS = mocs;
246 
247       sba.SurfaceStateBaseAddress =
248          (struct anv_address) { .offset =
249          device->physical->va.internal_surface_state_pool.addr,
250       };
251       sba.SurfaceStateMOCS = mocs;
252       sba.SurfaceStateBaseAddressModifyEnable = true;
253 
254       sba.DynamicStateBaseAddress =
255          (struct anv_address) { .offset =
256          device->physical->va.dynamic_state_pool.addr,
257       };
258       sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
259                                     device->physical->va.dynamic_visible_pool.size) / 4096;
260       sba.DynamicStateMOCS = mocs;
261       sba.DynamicStateBaseAddressModifyEnable = true;
262       sba.DynamicStateBufferSizeModifyEnable = true;
263 
264       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
265       sba.IndirectObjectBufferSize = 0xfffff;
266       sba.IndirectObjectMOCS = mocs;
267       sba.IndirectObjectBaseAddressModifyEnable = true;
268       sba.IndirectObjectBufferSizeModifyEnable = true;
269 
270       sba.InstructionBaseAddress =
271          (struct anv_address) { .offset =
272          device->physical->va.instruction_state_pool.addr,
273       };
274       sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
275       sba.InstructionMOCS = mocs;
276       sba.InstructionBaseAddressModifyEnable = true;
277       sba.InstructionBuffersizeModifyEnable = true;
278 
279 #if GFX_VER >= 11
280       sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
281       sba.BindlessSamplerStateBufferSize = 0;
282       sba.BindlessSamplerStateMOCS = mocs;
283       sba.BindlessSamplerStateBaseAddressModifyEnable = true;
284 #endif
285 
286       if (device->physical->indirect_descriptors) {
287          sba.BindlessSurfaceStateBaseAddress =
288             (struct anv_address) { .offset =
289             device->physical->va.bindless_surface_state_pool.addr,
290          };
291          sba.BindlessSurfaceStateSize =
292             anv_physical_device_bindless_heap_size(device->physical, false) /
293             ANV_SURFACE_STATE_SIZE - 1;
294          sba.BindlessSurfaceStateMOCS = mocs;
295          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
296       } else {
297          /* Bindless Surface State & Bindless Sampler State are aligned to the
298           * same heap
299           */
300          sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
301             .offset = device->physical->va.internal_surface_state_pool.addr,
302          };
303          sba.BindlessSurfaceStateSize =
304             (device->physical->va.internal_surface_state_pool.size +
305              device->physical->va.bindless_surface_state_pool.size) - 1;
306          sba.BindlessSurfaceStateMOCS = mocs;
307          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
308       }
309 
310 #if GFX_VERx10 >= 125
311       sba.L1CacheControl = L1CC_WB;
312 #endif
313    }
314 
315    /* Disable the POOL_ALLOC mechanism in HW. We found that this state can get
316     * corrupted (likely due to leaking from another context), so the default
317     * value should be disabled. It doesn't cost anything to set it once at
318     * device initialization.
319     */
320 #if GFX_VER >= 11 && GFX_VERx10 < 125
321    anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
322       btpa.MOCS = mocs;
323       btpa.BindingTablePoolEnable = false;
324    }
325 #endif
326 
327    struct mi_builder b;
328    mi_builder_init(&b, device->info, batch);
329 
330    mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
331                 mi_imm(device->physical->va.internal_surface_state_pool.addr));
332 #endif /* GFX_VER >= 12 */
333 
334 #if GFX_VERx10 >= 125
335    if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
336       anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
337          /* TODO: This is the timeout after which the bucketed thread
338           *       dispatcher will kick off a wave of threads. We go with the
339           *       lowest value for now. It could be tweaked on a
340           *       per-application basis (drirc).
341           */
342          btd.DispatchTimeoutCounter = _64clocks;
343          /* BSpec 43851: "This field must be programmed to 6h i.e. memory
344           *               backed buffer must be 128KB."
345           */
346          btd.PerDSSMemoryBackedBufferSize = 6;
347          btd.MemoryBackedBufferBasePointer = (struct anv_address) {
348             /* This batch doesn't have a reloc list so we can't use the BO
349              * here.  We just use the address directly.
350              */
351             .offset = device->btd_fifo_bo->offset,
352          };
353 #if INTEL_NEEDS_WA_14017794102
354          btd.BTDMidthreadpreemption = false;
355 #endif
356       }
357    }
358 #endif
359 }
360 
361 #if GFX_VER >= 20
362 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
363 #else
364 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
365 #endif
366 
367 static VkResult
368 init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
369 {
370    struct anv_device *device = queue->device;
371    UNUSED const struct intel_device_info *devinfo = queue->device->info;
372 
373    struct anv_async_submit *submit;
374    VkResult result = anv_async_submit_create(queue,
375                                              &device->batch_bo_pool,
376                                              is_companion_rcs_batch,
377                                              true, &submit);
378    if (result != VK_SUCCESS)
379       return result;
380 
381    struct anv_batch *batch = &submit->batch;
382 
383    genX(emit_pipeline_select)(batch, _3D, device);
384 
385 #if GFX_VER == 9
386    anv_batch_write_reg(batch, GENX(CACHE_MODE_1), cm1) {
387       cm1.FloatBlendOptimizationEnable = true;
388       cm1.FloatBlendOptimizationEnableMask = true;
389       cm1.MSCRAWHazardAvoidanceBit = true;
390       cm1.MSCRAWHazardAvoidanceBitMask = true;
391       cm1.PartialResolveDisableInVC = true;
392       cm1.PartialResolveDisableInVCMask = true;
393    }
394 #endif
395 
396    anv_batch_emit(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
397 
398    anv_batch_emit(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
399       rect.ClippedDrawingRectangleYMin = 0;
400       rect.ClippedDrawingRectangleXMin = 0;
401       rect.ClippedDrawingRectangleYMax = UINT16_MAX;
402       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
403       rect.DrawingRectangleOriginY = 0;
404       rect.DrawingRectangleOriginX = 0;
405    }
406 
407    anv_batch_emit(batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
408 
409    /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
410     *
411     *   "3DSTATE_RASTER if used must be programmed prior to using this
412     *    packet."
413     *
414     * Emit this before 3DSTATE_WM_HZ_OP below.
415     */
416    anv_batch_emit(batch, GENX(3DSTATE_RASTER), rast) {
417       rast.APIMode = DX101;
418    }
419 
420    /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
421     *
422     *    "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
423     *     change the Number of Multisamples. This packet must not be used to
424     *     change Number of Multisamples in a rendering sequence."
425     *
426     * Emit this before 3DSTATE_WM_HZ_OP below.
427     */
428    anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
429 
430    /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
431     * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
432     * Clear." It mentions that the packet overrides GPU state for the clear
433     * operation and needs to be reset to 0s to clear the overrides. Depending
434     * on the kernel, we may not get a context with the state for this packet
435     * zeroed. Do it ourselves just in case. We've observed this to prevent a
436     * number of GPU hangs on ICL.
437     */
438    anv_batch_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
439 
440    genX(emit_sample_pattern)(batch, NULL);
441 
442 #if GFX_VER == 11
443    /* The default value of bit 5 "Headerless Message for Pre-emptable
444     * Contexts" in the SAMPLER_MODE register is 0, which means
445     * headerless sampler messages are not allowed for pre-emptable
446     * contexts. Set bit 5 to 1 to allow them.
447     */
448    anv_batch_write_reg(batch, GENX(SAMPLER_MODE), sm) {
449       sm.HeaderlessMessageforPreemptableContexts = true;
450       sm.HeaderlessMessageforPreemptableContextsMask = true;
451    }
452 
453    /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
454     * the HALF_SLICE_CHICKEN7 register.
455     */
456    anv_batch_write_reg(batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
457       hsc7.EnabledTexelOffsetPrecisionFix = true;
458       hsc7.EnabledTexelOffsetPrecisionFixMask = true;
459    }
460 
461    anv_batch_write_reg(batch, GENX(TCCNTLREG), tcc) {
462       tcc.L3DataPartialWriteMergingEnable = true;
463       tcc.ColorZPartialWriteMergingEnable = true;
464       tcc.URBPartialWriteMergingEnable = true;
465       tcc.TCDisable = true;
466    }
467 #endif
468    genX(emit_slice_hashing_state)(device, batch);
469 
470 #if GFX_VER >= 11
471    /* The hardware specification recommends disabling repacking for
472     * compatibility with the decompression mechanism in the display controller.
473     */
474    if (device->info->disable_ccs_repack) {
475       anv_batch_write_reg(batch, GENX(CACHE_MODE_0), cm0) {
476          cm0.DisableRepackingforCompression = true;
477          cm0.DisableRepackingforCompressionMask = true;
478       }
479    }
480 
481    /* An unknown issue is causing VS push constants to become
482     * corrupted during object-level preemption. For now, restrict
483     * preemption to command-buffer level to avoid rendering
484     * corruption.
485     */
486    anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
487       cc1.ReplayMode = MidcmdbufferPreemption;
488       cc1.ReplayModeMask = true;
489 
490 #if GFX_VERx10 == 120
491       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
492       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
493 #endif
494    }
495 
496 #if INTEL_NEEDS_WA_1806527549
497    /* Wa_1806527549 says to disable the following HiZ optimization when the
498     * depth buffer is D16_UNORM. We've found the WA to help with more depth
499     * buffer configurations however, so we always disable it just to be safe.
500     */
501    anv_batch_write_reg(batch, GENX(HIZ_CHICKEN), reg) {
502       reg.HZDepthTestLEGEOptimizationDisable = true;
503       reg.HZDepthTestLEGEOptimizationDisableMask = true;
504    }
505 #endif
506 
507 #if GFX_VER == 12
508    anv_batch_write_reg(batch, GENX(FF_MODE2), reg) {
509       /* On Alchemist, the FF_MODE2 docs for the GS timer say:
510        *
511        *    "The timer value must be set to 224."
512        *
513        * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
514        * and that this is necessary to avoid hanging the HS/DS units.  It
515        * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
516        *
517        * The HS timer docs also have the same quote for Alchemist.  I am
518        * unaware of a reason it needs to be set to 224 on Tigerlake, but
519        * we do so for consistency if nothing else.
520        *
521        * For the TDS timer value, the docs say:
522        *
523        *    "For best performance, a value of 4 should be programmed."
524        *
525        * i915 also sets it this way on Tigerlake due to workarounds.
526        *
527        * The default VS timer appears to be 0, so we leave it at that.
528        */
529       reg.GSTimerValue  = 224;
530       reg.HSTimerValue  = 224;
531       reg.TDSTimerValue = 4;
532       reg.VSTimerValue  = 0;
533    }
534 #endif
535 
536 #if INTEL_NEEDS_WA_1508744258
537    /* Disable RHWO by setting 0x7010[14] by default except during resolve
538     * pass.
539     *
540     * We implement global disabling of the optimization here and we toggle it
541     * in anv_image_ccs_op().
542     */
543    anv_batch_write_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
544       c1.RCCRHWOOptimizationDisable = true;
545       c1.RCCRHWOOptimizationDisableMask = true;
546    }
547 #endif
548 
549 #if GFX_VERx10 < 125
550 #define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
551 #else
552 #define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
553 #endif
554 
555    /* Enable the new line drawing algorithm that produces higher quality
556     * lines.
557     */
558    anv_batch_write_reg(batch, AA_LINE_QUALITY_REG, c3) {
559       c3.AALineQualityFix = true;
560       c3.AALineQualityFixMask = true;
561    }
562 #endif
563 
564 #if GFX_VER == 12
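   /* The aux-map table base is a 64-bit register, so it is programmed with
    * two 32-bit MI_LOAD_REGISTER_IMMs: the low dword first, then the high
    * dword at the next register offset.
    */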
565    if (device->info->has_aux_map) {
566       uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
567       assert(aux_base_addr % (32 * 1024) == 0);
568       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
569          lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
570          lri.DataDWord = aux_base_addr & 0xffffffff;
571       }
572       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
573          lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
574          lri.DataDWord = aux_base_addr >> 32;
575       }
576    }
577 #endif
578 
579 #if GFX_VERx10 == 125
580    anv_batch_write_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
581       reg.TBIMRBatchSizeOverride = true;
582       reg.TBIMROpenBatchEnable = true;
583       reg.TBIMRFastClip = true;
584       reg.TBIMRBatchSizeOverrideMask = true;
585       reg.TBIMROpenBatchEnableMask = true;
586       reg.TBIMRFastClipMask = true;
587    }
588 #endif
589 
590    /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
591     * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
592     *
593     * This is only safe on kernels with context isolation support.
594     */
595    assert(device->physical->info.has_context_isolation);
596    anv_batch_write_reg(batch, GENX(CS_DEBUG_MODE2), csdm2) {
597       csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
598       csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
599    }
600 
601    init_common_queue_state(queue, batch);
602 
603    /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
604     * the dynamic state base address we need to emit this instruction after
605     * STATE_BASE_ADDRESS in init_common_queue_state().
606     */
607 #if GFX_VER == 11
608    anv_batch_emit(batch, GENX(3DSTATE_CPS), cps);
609 #elif GFX_VER >= 12
610    anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
611       assert(device->cps_states.alloc_size != 0);
612       /* Offset 0 is the disabled state */
613       cps.CoarsePixelShadingStateArrayPointer =
614          device->cps_states.offset;
615    }
616 #endif
617 
618 #if GFX_VERx10 >= 125
619    anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
620       cm.Mask1 = 0xffff;
621 #if GFX_VERx10 >= 200
622       cm.Mask2 = 0xffff;
623 #endif
624    }
625    anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
626    anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
627 
628    /* We are no longer required to explicitly flush or invalidate caches
629     * since PIPELINE_SELECT is being deprecated on Xe2+.
630     */
631 #if GFX_VER < 20
632    genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
633                                       ANV_NULL_ADDRESS,
634                                       0,
635                                       ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
636 #endif
637 
638    genX(emit_pipeline_select)(batch, GPGPU, device);
639    anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
640       cfe.MaximumNumberofThreads =
641          devinfo->max_cs_threads * devinfo->subslice_total;
642    }
643 
644    /* We are no longer required to explicitly flush or invalidate caches
645     * since PIPELINE_SELECT is being deprecated on Xe2+.
646     */
647 #if GFX_VER < 20
648    genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
649                                       ANV_NULL_ADDRESS,
650                                       0,
651                                       ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
652 #endif
653 
654    genX(emit_pipeline_select)(batch, _3D, device);
655 #endif
656 
657 #if GFX_VER >= 20
658    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
659       p.DX10OGLBorderModeforYCRCB = true;
660       p.DX10OGLBorderModeforYCRCBMask = true;
661 #if INTEL_NEEDS_WA_14019857787
662       p.EnableOOOreadsinRCPB = true;
663       p.EnableOOOreadsinRCPBMask = true;
664 #endif
665    }
666 #endif
667 
668    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
669 
670    result = batch->status;
671    if (result != VK_SUCCESS) {
672       anv_async_submit_destroy(submit);
673       return result;
674    }
675 
676    result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
677    if (result != VK_SUCCESS) {
678       anv_async_submit_destroy(submit);
679       return result;
680    }
681 
682    if (is_companion_rcs_batch)
683       queue->init_companion_submit = submit;
684    else
685       queue->init_submit = submit;
686 
687    return VK_SUCCESS;
688 }
689 
690 static VkResult
691 init_compute_queue_state(struct anv_queue *queue)
692 {
693    struct anv_device *device = queue->device;
694    UNUSED const struct intel_device_info *devinfo = device->info;
695    struct anv_async_submit *submit;
696    VkResult result = anv_async_submit_create(queue,
697                                              &device->batch_bo_pool,
698                                              false, true, &submit);
699    if (result != VK_SUCCESS)
700       return result;
701 
702    struct anv_batch *batch = &submit->batch;
703 
704    genX(emit_pipeline_select)(batch, GPGPU, queue->device);
705 
706 #if GFX_VER == 12
707    if (queue->device->info->has_aux_map) {
708       uint64_t aux_base_addr =
709          intel_aux_map_get_base(queue->device->aux_map_ctx);
710       assert(aux_base_addr % (32 * 1024) == 0);
711       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
712          lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
713          lri.DataDWord = aux_base_addr & 0xffffffff;
714       }
715       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
716          lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
717          lri.DataDWord = aux_base_addr >> 32;
718       }
719    }
720 #else
721    assert(!queue->device->info->has_aux_map);
722 #endif
723 
724    /* Wa_14015782607 - Issue pipe control with HDC_flush and
725     * untyped cache flush set to 1 when CCS has NP state update with
726     * STATE_COMPUTE_MODE.
727     */
728    if (intel_needs_workaround(devinfo, 14015782607) &&
729        queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
730       genx_batch_emit_pipe_control(batch, devinfo, GPGPU,
731                                    ANV_PIPE_CS_STALL_BIT |
732                                    ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
733                                    ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
734    }
735 
736 #if GFX_VERx10 >= 125
737    /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
738     * emitting NP state commands with ATS-M in compute mode.
739     */
740    if (intel_device_info_is_atsm(devinfo) &&
741        queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
742       genx_batch_emit_pipe_control
743          (batch, devinfo, GPGPU,
744           ANV_PIPE_CS_STALL_BIT |
745           ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
746           ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
747           ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
748           ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
749           ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
750           ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
751    }
752 
753    anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
754 #if GFX_VER < 20
755       cm.PixelAsyncComputeThreadLimit = 4;
756       cm.PixelAsyncComputeThreadLimitMask = 0x7;
757 #endif
758    }
759 #endif
760 
761    init_common_queue_state(queue, batch);
762 
763 #if GFX_VERx10 >= 125
764    anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
765       cfe.MaximumNumberofThreads =
766          devinfo->max_cs_threads * devinfo->subslice_total;
767    }
768 #endif
769 
770    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
771 
772    result = batch->status;
773    if (result != VK_SUCCESS) {
774       anv_async_submit_destroy(submit);
775       return result;
776    }
777 
778    result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
779    if (result != VK_SUCCESS) {
780       anv_async_submit_destroy(submit);
781       return result;
782    }
783 
784    queue->init_submit = submit;
785 
786    return VK_SUCCESS;
787 }
788 
789 static VkResult
790 init_copy_video_queue_state(struct anv_queue *queue)
791 {
792 #if GFX_VER >= 12
793    struct anv_device *device = queue->device;
794    const struct intel_device_info *devinfo = device->info;
795 
796    if (devinfo->has_aux_map) {
797       struct anv_async_submit *submit;
798       VkResult result = anv_async_submit_create(queue,
799                                                 &device->batch_bo_pool,
800                                                 false, true, &submit);
801       if (result != VK_SUCCESS)
802          return result;
803 
804       struct anv_batch *batch = &submit->batch;
805 
806       uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);
807 
808       if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
809 #if GFX_VERx10 >= 125
810          reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
811 #endif
812       }
813 
814       uint64_t aux_base_addr =
815          intel_aux_map_get_base(queue->device->aux_map_ctx);
816       assert(aux_base_addr % (32 * 1024) == 0);
817       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
818          lri.RegisterOffset = reg;
819          lri.DataDWord = aux_base_addr & 0xffffffff;
820       }
821       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
822          lri.RegisterOffset = reg + 4;
823          lri.DataDWord = aux_base_addr >> 32;
824       }
825 
826       anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
827 
828       result = batch->status;
829       if (result != VK_SUCCESS) {
830          anv_async_submit_destroy(submit);
831          return result;
832       }
833 
834       result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
835       if (result != VK_SUCCESS) {
836          anv_async_submit_destroy(submit);
837          return result;
838       }
839 
840       queue->init_submit = submit;
841    }
842 #else
843    assert(!queue->device->info->has_aux_map);
844 #endif
845 
846    return VK_SUCCESS;
847 }
848 
849 void
850 genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
851 {
852    assert(pdevice->info.verx10 == GFX_VERx10);
853 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
854    genX(grl_load_rt_uuid)(pdevice->rt_uuid);
855    pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
856 #endif
857 
858    pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
859    pdevice->cmd_capture_data = genX(cmd_capture_data);
860 
861    pdevice->gpgpu_pipeline_value = GPGPU;
862 
863    struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
864       .Valid = true,
865       .Component0Control = VFCOMP_STORE_0,
866       .Component1Control = VFCOMP_STORE_0,
867       .Component2Control = VFCOMP_STORE_0,
868       .Component3Control = VFCOMP_STORE_0,
869    };
870    GENX(VERTEX_ELEMENT_STATE_pack)(NULL, pdevice->empty_vs_input, &empty_ve);
871 }
872 
873 VkResult
874 genX(init_device_state)(struct anv_device *device)
875 {
876    VkResult res;
877 
878    device->slice_hash = (struct anv_state) { 0 };
879    for (uint32_t i = 0; i < device->queue_count; i++) {
880       struct anv_queue *queue = &device->queues[i];
881       switch (queue->family->engine_class) {
882       case INTEL_ENGINE_CLASS_RENDER:
883          res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
884          break;
885       case INTEL_ENGINE_CLASS_COMPUTE: {
886          res = init_compute_queue_state(queue);
887          if (res != VK_SUCCESS)
888             return res;
889 
890          /**
891           * Execute RCS init batch by default on the companion RCS command buffer in
892           * order to support MSAA copy/clear operations on the compute queue.
893           */
894          res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
895          break;
896       }
897       case INTEL_ENGINE_CLASS_VIDEO:
898          res = init_copy_video_queue_state(queue);
899          break;
900       case INTEL_ENGINE_CLASS_COPY:
901          res = init_copy_video_queue_state(queue);
902          if (res != VK_SUCCESS)
903             return res;
904 
905          /**
906           * Execute RCS init batch by default on the companion RCS command buffer in
907           * order to support MSAA copy/clear operations on the copy queue.
908           */
909          res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
910          break;
911       default:
912          res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
913          break;
914       }
915       if (res != VK_SUCCESS)
916          return res;
917 
918       if (!device->trtt.queue &&
919           queue->family->queueFlags & VK_QUEUE_SPARSE_BINDING_BIT)
920          device->trtt.queue = queue;
921    }
922 
923    return res;
924 }
925 
926 #if GFX_VERx10 >= 125
927 #define maybe_for_each_shading_rate_op(name) \
928    for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
929         name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
930         name++)
931 #elif GFX_VER >= 12
932 #define maybe_for_each_shading_rate_op(name)
933 #endif
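/* On Gfx12.5+ the macro above iterates over every fragment shading rate
 * combiner op; on Gfx12 it expands to nothing, so loops written with it
 * collapse and their body runs exactly once (the combiner fields are only
 * programmed on Gfx12.5+ anyway).
 */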
934 
935 /* Rather than re-emitting the CPS_STATE structure every time it changes and
936  * for as many viewports as needed, we can just prepare all possible cases and
937  * pick the right offset from the prepacked states when needed.
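 * The layout produced below is: MAX_VIEWPORTS disabled entries first, then
 * MAX_VIEWPORTS entries for each combination of combiner ops (Gfx12.5+ only)
 * and MinCPSizeX/MinCPSizeY in {1, 2, 4}.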
938  */
939 void
940 genX(init_cps_device_state)(struct anv_device *device)
941 {
942 #if GFX_VER >= 12
943    void *cps_state_ptr = device->cps_states.map;
944 
945    /* Disabled CPS mode */
946    for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
947       /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
948        *
949        *   "It is an INVALID configuration to set the CPS mode other than
950        *    CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
951        *    Such configuration should be disallowed at the API level, and
952        *    rendering results are undefined."
953        *
954        * Since we select this state when coarse pixel shading is disabled, which
955        * includes when per-sample dispatch is enabled, we need to ensure this
956        * is set to NONE.
957        */
958       struct GENX(CPS_STATE) cps_state = {
959          .CoarsePixelShadingMode = CPS_MODE_NONE,
960       };
961 
962       GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
963       cps_state_ptr += GENX(CPS_STATE_length) * 4;
964    }
965 
966    maybe_for_each_shading_rate_op(op0) {
967       maybe_for_each_shading_rate_op(op1) {
968          for (uint32_t x = 1; x <= 4; x *= 2) {
969             for (uint32_t y = 1; y <= 4; y *= 2) {
970                struct GENX(CPS_STATE) cps_state = {
971                   .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
972                   .MinCPSizeX = x,
973                   .MinCPSizeY = y,
974                };
975 
976 #if GFX_VERx10 >= 125
977                static const uint32_t combiner_ops[] = {
978                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR]    = PASSTHROUGH,
979                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
980                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR]     = HIGH_QUALITY,
981                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR]     = LOW_QUALITY,
982                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR]     = RELATIVE,
983                };
984 
985                cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
986                cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
987 #endif /* GFX_VERx10 >= 125 */
988 
989                for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
990                   GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
991                   cps_state_ptr += GENX(CPS_STATE_length) * 4;
992                }
993             }
994          }
995       }
996    }
997 #endif /* GFX_VER >= 12 */
998 }
999 
1000 void
1001 genX(emit_l3_config)(struct anv_batch *batch,
1002                      const struct anv_device *device,
1003                      const struct intel_l3_config *cfg)
1004 {
1005 #if GFX_VER < 20
1006    UNUSED const struct intel_device_info *devinfo = device->info;
1007 
1008 #if GFX_VER >= 12
1009 #define L3_ALLOCATION_REG GENX(L3ALLOC)
1010 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
1011 #else
1012 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
1013 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
1014 #endif
1015 
1016    anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
1017       if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
1018          assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
1019                           cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
1020                           cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
1021                           cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
1022 #if GFX_VER >= 12
1023          l3cr.L3FullWayAllocationEnable = true;
1024 #else
1025          unreachable("Invalid L3$ config");
1026 #endif
1027       } else {
1028 #if GFX_VER < 11
1029          l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
1030 #endif
1031 #if INTEL_NEEDS_WA_1406697149
1032          /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
1033           * set in L3CNTLREG register. The default setting of the bit is not
1034           * the desirable behavior.
1035           */
1036          l3cr.ErrorDetectionBehaviorControl = true;
1037          l3cr.UseFullWays = true;
1038 #endif /* INTEL_NEEDS_WA_1406697149 */
1039          assert(cfg->n[INTEL_L3P_IS] == 0);
1040          assert(cfg->n[INTEL_L3P_C] == 0);
1041          assert(cfg->n[INTEL_L3P_T] == 0);
1042          l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
1043          l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
1044          l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
1045          l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
1046       }
1047    }
1048 #endif /* GFX_VER < 20 */
1049 }
1050 
1051 void
1052 genX(emit_sample_pattern)(struct anv_batch *batch,
1053                           const struct vk_sample_locations_state *sl)
1054 {
1055    assert(sl == NULL || sl->grid_size.width == 1);
1056    assert(sl == NULL || sl->grid_size.height == 1);
1057 
1058    /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
1059     * VkPhysicalDeviceFeatures::standardSampleLocations.
1060     */
1061    anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
1062       /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
1063        *
1064        *    "When programming the sample offsets (for NUMSAMPLES_4 or _8
1065        *    and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
1066        *    (or 7 for 8X, or 15 for 16X) must have monotonically increasing
1067        *    distance from the pixel center. This is required to get the
1068        *    correct centroid computation in the device."
1069        *
1070        * However, the Vulkan spec seems to require that the samples occur
1071        * in the order provided through the API. The standard sample patterns
1072        * have the above property that they have monotonically increasing
1073        * distances from the center but client-provided ones do not. As long as
1074        * this only affects centroid calculations as the docs say, we should be
1075        * ok because OpenGL and Vulkan only require that the centroid be some
1076        * lit sample and that it's the same for all samples in a pixel; they
1077        * have no requirement that it be the one closest to center.
1078        */
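      /* VK_SAMPLE_COUNT_N_BIT is numerically equal to N for these power-of-two
       * counts, so we can walk i = 1, 2, 4, 8, 16 and compare it directly
       * against sl->per_pixel.
       */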
1079       for (uint32_t i = 1; i <= 16; i *= 2) {
1080          switch (i) {
1081          case VK_SAMPLE_COUNT_1_BIT:
1082             if (sl && sl->per_pixel == i) {
1083                INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
1084             } else {
1085                INTEL_SAMPLE_POS_1X(sp._1xSample);
1086             }
1087             break;
1088          case VK_SAMPLE_COUNT_2_BIT:
1089             if (sl && sl->per_pixel == i) {
1090                INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
1091             } else {
1092                INTEL_SAMPLE_POS_2X(sp._2xSample);
1093             }
1094             break;
1095          case VK_SAMPLE_COUNT_4_BIT:
1096             if (sl && sl->per_pixel == i) {
1097                INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
1098             } else {
1099                INTEL_SAMPLE_POS_4X(sp._4xSample);
1100             }
1101             break;
1102          case VK_SAMPLE_COUNT_8_BIT:
1103             if (sl && sl->per_pixel == i) {
1104                INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
1105             } else {
1106                INTEL_SAMPLE_POS_8X(sp._8xSample);
1107             }
1108             break;
1109          case VK_SAMPLE_COUNT_16_BIT:
1110             if (sl && sl->per_pixel == i) {
1111                INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
1112             } else {
1113                INTEL_SAMPLE_POS_16X(sp._16xSample);
1114             }
1115             break;
1116          default:
1117             unreachable("Invalid sample count");
1118          }
1119       }
1120    }
1121 }
1122 
1123 static uint32_t
1124 vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
1125 {
1126    switch (filter) {
1127    default:
1128       unreachable("Invalid filter");
1129    case VK_FILTER_NEAREST:
1130       return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
1131    case VK_FILTER_LINEAR:
1132       return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
1133    }
1134 }
1135 
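/* SAMPLER_STATE expects the maximum anisotropy ratio encoded as
 * (ratio - 2) / 2, so values <= 2.0 map to 0 (a 2:1 ratio) and 16.0 maps to
 * 7 (a 16:1 ratio).
 */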
1136 static uint32_t
1137 vk_to_intel_max_anisotropy(float ratio)
1138 {
1139    return (CLAMP(ratio, 2, 16) - 2) / 2;
1140 }
1141 
1142 static const uint32_t vk_to_intel_mipmap_mode[] = {
1143    [VK_SAMPLER_MIPMAP_MODE_NEAREST]          = MIPFILTER_NEAREST,
1144    [VK_SAMPLER_MIPMAP_MODE_LINEAR]           = MIPFILTER_LINEAR
1145 };
1146 
1147 static const uint32_t vk_to_intel_tex_address[] = {
1148    [VK_SAMPLER_ADDRESS_MODE_REPEAT]          = TCM_WRAP,
1149    [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
1150    [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE]   = TCM_CLAMP,
1151    [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
1152    [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
1153 };
1154 
1155 /* Vulkan specifies the result of shadow comparisons as:
1156  *     1     if   ref <op> texel,
1157  *     0     otherwise.
1158  *
1159  * The hardware does:
1160  *     0     if texel <op> ref,
1161  *     1     otherwise.
1162  *
1163  * So, these look a bit strange because there's both a negation
1164  * and swapping of the arguments involved.
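 *
 * For example, VK_COMPARE_OP_LESS (pass when ref < texel) maps to
 * PREFILTEROP_LEQUAL: the hardware returns 0 when texel <= ref, i.e. 1
 * exactly when ref < texel.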
1165  */
1166 static const uint32_t vk_to_intel_shadow_compare_op[] = {
1167    [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_ALWAYS,
1168    [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LEQUAL,
1169    [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_NOTEQUAL,
1170    [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LESS,
1171    [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GEQUAL,
1172    [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_EQUAL,
1173    [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GREATER,
1174    [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_NEVER,
1175 };
1176 
1177 static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
1178    [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
1179    [VK_SAMPLER_REDUCTION_MODE_MIN]              = MINIMUM,
1180    [VK_SAMPLER_REDUCTION_MODE_MAX]              = MAXIMUM,
1181 };
1182 
1183 VkResult genX(CreateSampler)(
1184     VkDevice                                    _device,
1185     const VkSamplerCreateInfo*                  pCreateInfo,
1186     const VkAllocationCallbacks*                pAllocator,
1187     VkSampler*                                  pSampler)
1188 {
1189    ANV_FROM_HANDLE(anv_device, device, _device);
1190    struct anv_sampler *sampler;
1191 
1192    sampler = vk_sampler_create(&device->vk, pCreateInfo,
1193                                pAllocator, sizeof(*sampler));
1194    if (!sampler)
1195       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1196 
1197    const struct vk_format_ycbcr_info *ycbcr_info =
1198       sampler->vk.format != VK_FORMAT_UNDEFINED ?
1199       vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
1200    assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));
1201 
1202    sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;
1203 
1204    uint32_t border_color_stride = 64;
1205    uint32_t border_color_offset;
1206    void *border_color_ptr;
1207    if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
1208       border_color_offset = device->border_colors.offset +
1209                             pCreateInfo->borderColor *
1210                             border_color_stride;
1211       border_color_ptr = device->border_colors.map +
1212                          pCreateInfo->borderColor * border_color_stride;
1213    } else {
1214       assert(vk_border_color_is_custom(sampler->vk.border_color));
1215       if (pCreateInfo->flags & VK_SAMPLER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
1216          const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
1217             vk_find_struct_const(pCreateInfo->pNext,
1218                                  OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
1219          if (opaque_info) {
1220             uint32_t alloc_idx = *((const uint32_t *)opaque_info->opaqueCaptureDescriptorData);
1221             sampler->custom_border_color =
1222                anv_state_reserved_array_pool_alloc_index(&device->custom_border_colors, alloc_idx);
1223          } else {
1224             sampler->custom_border_color =
1225                anv_state_reserved_array_pool_alloc(&device->custom_border_colors, true);
1226          }
1227       } else {
1228          sampler->custom_border_color =
1229             anv_state_reserved_array_pool_alloc(&device->custom_border_colors, false);
1230       }
1231       if (sampler->custom_border_color.alloc_size == 0)
1232          return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1233 
1234       border_color_offset = sampler->custom_border_color.offset;
1235       border_color_ptr = sampler->custom_border_color.map;
1236 
1237       union isl_color_value color = { .u32 = {
1238          sampler->vk.border_color_value.uint32[0],
1239          sampler->vk.border_color_value.uint32[1],
1240          sampler->vk.border_color_value.uint32[2],
1241          sampler->vk.border_color_value.uint32[3],
1242       } };
1243 
1244       const struct anv_format *format_desc =
1245          sampler->vk.format != VK_FORMAT_UNDEFINED ?
1246          anv_get_format(sampler->vk.format) : NULL;
1247 
1248       if (format_desc && format_desc->n_planes == 1 &&
1249           !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
1250          const struct anv_format_plane *fmt_plane = &format_desc->planes[0];
1251 
1252          assert(!isl_format_has_int_channel(fmt_plane->isl_format));
1253          color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
1254       }
1255 
1256       memcpy(border_color_ptr, color.u32, sizeof(color));
1257    }
1258 
1259    const bool seamless_cube =
1260       !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);
1261 
1262    struct mesa_sha1 ctx;
1263    _mesa_sha1_init(&ctx);
1264 
1265    for (unsigned p = 0; p < sampler->n_planes; p++) {
1266       const bool plane_has_chroma =
1267          ycbcr_info && ycbcr_info->planes[p].has_chroma;
1268       const VkFilter min_filter =
1269          plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
1270                             pCreateInfo->minFilter;
1271       const VkFilter mag_filter =
1272          plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
1273                             pCreateInfo->magFilter;
1274       const bool force_addr_rounding =
1275             device->physical->instance->force_filter_addr_rounding;
1276       const bool enable_min_filter_addr_rounding =
1277             force_addr_rounding || min_filter != VK_FILTER_NEAREST;
1278       const bool enable_mag_filter_addr_rounding =
1279             force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
1280       /* From Broadwell PRM, SAMPLER_STATE:
1281        *   "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
1282        */
1283       enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
1284          anv_get_format(sampler->vk.format)->planes[0].isl_format :
1285          ISL_FORMAT_UNSUPPORTED;
1286       const bool isl_format_is_planar_yuv =
1287          plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
1288          isl_format_is_yuv(plane0_isl_format) &&
1289          isl_format_is_planar(plane0_isl_format);
1290 
1291       const uint32_t mip_filter_mode =
1292          isl_format_is_planar_yuv ?
1293          MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
1294 
1295       struct GENX(SAMPLER_STATE) sampler_state = {
1296          .SamplerDisable = false,
1297          .TextureBorderColorMode = DX10OGL,
1298 
1299 #if GFX_VER >= 11
1300          .CPSLODCompensationEnable = true,
1301 #endif
1302 
1303          .LODPreClampMode = CLAMP_MODE_OGL,
1304 
1305          .MipModeFilter = mip_filter_mode,
1306          .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
1307          .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
1308          .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
1309          .AnisotropicAlgorithm =
1310             pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
1311          .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
1312          .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
1313          .ChromaKeyEnable = 0,
1314          .ChromaKeyIndex = 0,
1315          .ChromaKeyMode = 0,
1316          .ShadowFunction =
1317             vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
1318                                         pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
1319          .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,
1320 
1321          .LODClampMagnificationMode = MIPNONE,
1322 
1323          .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
1324          .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1325          .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1326          .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1327          .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1328          .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1329          .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1330          .TrilinearFilterQuality = 0,
1331          .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
1332          .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
1333          .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
1334          .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
1335 
1336          .ReductionType =
1337             vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
1338          .ReductionTypeEnable =
1339             sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
1340       };
1341 
1342       /* Pack a version of the SAMPLER_STATE without the border color. We'll
1343        * use it to store into the shader cache and also for hashing.
1344        */
1345       GENX(SAMPLER_STATE_pack)(NULL, sampler->state_no_bc[p], &sampler_state);
1346       _mesa_sha1_update(&ctx, sampler->state_no_bc[p], sizeof(sampler->state_no_bc[p]));
1347 
1348       /* Set the border color pointer after hashing; we don't want the
1349        * allocation order of border colors to influence the hash. We just
1350        * need the parameters to be hashed.
1351        */
1352       sampler_state.BorderColorPointer = border_color_offset;
1353       GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
1354    }
1355 
1356    /* If we have bindless, allocate enough samplers.  We allocate 32 bytes
1357     * for each sampler instead of 16 bytes because we want all bindless
1358     * samplers to be 32-byte aligned so we don't have to use indirect
1359     * sampler messages on them.
1360     */
1361    sampler->bindless_state =
1362       anv_state_pool_alloc(&device->dynamic_state_pool,
1363                            sampler->n_planes * 32, 32);
1364    if (sampler->bindless_state.map) {
1365       memcpy(sampler->bindless_state.map, sampler->state,
1366              sampler->n_planes * GENX(SAMPLER_STATE_length) * 4);
1367    }
1368 
1369    /* Hash the border color */
1370    _mesa_sha1_update(&ctx, border_color_ptr,
1371                      sizeof(union isl_color_value));
1372 
1373    _mesa_sha1_final(&ctx, sampler->sha1);
1374 
1375    *pSampler = anv_sampler_to_handle(sampler);
1376 
1377    return VK_SUCCESS;
1378 }
1379 
1380 void
genX(emit_embedded_sampler)1381 genX(emit_embedded_sampler)(struct anv_device *device,
1382                             struct anv_embedded_sampler *sampler,
1383                             struct anv_pipeline_embedded_sampler_binding *binding)
1384 {
1385    sampler->ref_cnt = 1;
1386    memcpy(&sampler->key, &binding->key, sizeof(binding->key));
1387 
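   /* Allocate dynamic state for the border color and fill it from the key. */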
1388    sampler->border_color_state =
1389       anv_state_pool_alloc(&device->dynamic_state_pool,
1390                            sizeof(struct gfx8_border_color), 64);
1391    memcpy(sampler->border_color_state.map,
1392           binding->key.color,
1393           sizeof(binding->key.color));
1394 
1395    sampler->sampler_state =
1396       anv_state_pool_alloc(&device->dynamic_state_pool,
1397                            ANV_SAMPLER_STATE_SIZE, 32);
1398 
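   /* Pack a SAMPLER_STATE that only carries the border color pointer; the
    * remaining fields come from the precomputed dwords in the key.
    */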
1399    struct GENX(SAMPLER_STATE) sampler_state = {
1400       .BorderColorPointer = sampler->border_color_state.offset,
1401    };
1402    uint32_t dwords[GENX(SAMPLER_STATE_length)];
1403    GENX(SAMPLER_STATE_pack)(NULL, dwords, &sampler_state);
1404 
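   /* Merge: OR the packed border-color dwords with the sampler dwords
    * captured in the key.
    */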
1405    for (uint32_t i = 0; i < GENX(SAMPLER_STATE_length); i++) {
1406       ((uint32_t *)sampler->sampler_state.map)[i] =
1407          dwords[i] | binding->key.sampler[i];
1408    }
1409 }
1410 
1411 /* Wa_14015814527
1412  *
1413  * Check if a task shader was used within the cmd_buffer; if so, commit
1414  * empty URB states and a null prim.
1415  */
1416 void
genX(apply_task_urb_workaround)1417 genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
1418 {
1419    if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
1420       return;
1421 
1422 #if GFX_VERx10 >= 125
1423    const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;
1424 
1425    if (!intel_needs_workaround(devinfo, 16014390852))
1426       return;
1427 
1428    if (cmd_buffer->state.current_pipeline != _3D ||
1429        !cmd_buffer->state.gfx.used_task_shader)
1430       return;
1431 
1432    cmd_buffer->state.gfx.used_task_shader = false;
1433 
1434    /* Wa_14015821291 mentions that the WA below is not required if we have
1435     * a pipeline flush going on. It will get flushed during
1436     * cmd_buffer_flush_state before the draw.
1437     */
1438    if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
1439       return;
1440 
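   /* Emit empty URB allocations for VS/HS/DS/GS. The four 3DSTATE_URB_*
    * commands use consecutive sub-opcodes, so we only need to bump the
    * sub-opcode of 3DSTATE_URB_VS.
    */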
1441    for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
1442       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
1443          urb._3DCommandSubOpcode += i;
1444       }
1445    }
1446 
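   /* Likewise zero out the mesh and task URB allocations. */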
1447    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
1448    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
1449 
1450    /* Issue 'nullprim' to commit the state. */
1451    genx_batch_emit_pipe_control_write
1452       (&cmd_buffer->batch, cmd_buffer->device->info,
1453        cmd_buffer->state.current_pipeline,
1454        WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
1455 #endif
1456 }
1457 
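/* Set up the TR-TT (tiled resources translation table) registers used by ANV
 * to implement sparse resources, then enable TR-TT on this queue's engines.
 */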
1458 VkResult
genX(init_trtt_context_state)1459 genX(init_trtt_context_state)(struct anv_async_submit *submit)
1460 {
1461 #if GFX_VER >= 12
1462    struct anv_queue *queue = submit->queue;
1463    struct anv_device *device = queue->device;
1464    struct anv_trtt *trtt = &device->trtt;
1465    struct anv_batch *batch = &submit->batch;
1466 
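   /* The L3 table must be 4KiB aligned; split its address into the low
    * (bits 31:12) and high (bits 47:32) register fields.
    */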
1467    assert((trtt->l3_addr & 0xFFF) == 0);
1468    uint32_t l3_addr_low = (trtt->l3_addr & 0xFFFFF000) >> 12;
1469    uint32_t l3_addr_high = (trtt->l3_addr >> 32) & 0xFFFF;
1470 
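   /* Program the invalid/null tile detection values and the L3 table base
    * for the render (GFX), blitter (BLT) and compute (COMP_CTX0) contexts.
    */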
1471    anv_batch_write_reg(batch, GENX(GFX_TRTT_INVAL), trtt_inval)
1472       trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1473    anv_batch_write_reg(batch, GENX(GFX_TRTT_NULL), trtt_null)
1474       trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1475    anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low)
1476       trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1477    anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH), trtt_base_high)
1478       trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1479 
1480    anv_batch_write_reg(batch, GENX(BLT_TRTT_INVAL), trtt_inval)
1481       trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1482    anv_batch_write_reg(batch, GENX(BLT_TRTT_NULL), trtt_null)
1483       trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1484    anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_LOW), trtt_base_low)
1485       trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1486    anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_HIGH), trtt_base_high)
1487       trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1488 
1489    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_INVAL), trtt_inval)
1490       trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1491    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_NULL), trtt_null)
1492       trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1493    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_LOW), trtt_base_low)
1494       trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1495    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_HIGH), trtt_base_high)
1496       trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1497 
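   /* Gfx20+ expresses the TR-TT VA window as a base (address bits 44 and
    * above); older gens describe it with a mask/data pair instead.
    */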
1498 #if GFX_VER >= 20
1499    uint32_t trva_base = device->physical->va.trtt.addr >> 44;
1500    anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range)
1501       trtt_va_range.TRVABase = trva_base;
1502    anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range)
1503       trtt_va_range.TRVABase = trva_base;
1504    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range)
1505       trtt_va_range.TRVABase = trva_base;
1506 #else
1507    anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
1508       trtt_va_range.TRVAMaskValue = 0xF;
1509       trtt_va_range.TRVADataValue = 0xF;
1510    }
1511    anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range) {
1512       trtt_va_range.TRVAMaskValue = 0xF;
1513       trtt_va_range.TRVADataValue = 0xF;
1514    }
1515    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range) {
1516       trtt_va_range.TRVAMaskValue = 0xF;
1517       trtt_va_range.TRVADataValue = 0xF;
1518    }
1519 #endif
1520 
1521    /* Enabling TR-TT needs to be done after setting up the other registers.
1522     */
1523    anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr)
1524       trtt_cr.TRTTEnable = true;
1525    anv_batch_write_reg(batch, GENX(BLT_TRTT_CR), trtt_cr)
1526       trtt_cr.TRTTEnable = true;
1527    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_CR), trtt_cr)
1528       trtt_cr.TRTTEnable = true;
1529 
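   /* Skip the flush on the copy engine (which has no PIPE_CONTROL); on the
    * other engines, stall and invalidate the TLBs so the new TR-TT
    * configuration takes effect.
    */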
1530    if (queue->family->engine_class != INTEL_ENGINE_CLASS_COPY) {
1531       genx_batch_emit_pipe_control(batch, device->info, _3D,
1532                                    ANV_PIPE_CS_STALL_BIT |
1533                                    ANV_PIPE_TLB_INVALIDATE_BIT);
1534    }
1535 #endif
1536    return VK_SUCCESS;
1537 }
1538