/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"

#include "common/intel_compute_slm.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

#include "genX_mi_builder.h"

void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t total_scratch)
{
#if GFX_VERx10 >= 125
   assert(cmd_buffer->state.current_pipeline == GPGPU);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;

   if (total_scratch <= comp_state->scratch_size)
      return;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads = devinfo->max_cs_threads * devinfo->subslice_total;

      uint32_t scratch_surf;
      struct anv_scratch_pool *scratch_pool =
         (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?
          &cmd_buffer->device->protected_scratch_pool :
          &cmd_buffer->device->scratch_pool;
      struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(cmd_buffer->device, scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   total_scratch);
      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, scratch_bo);
      scratch_surf = anv_scratch_pool_get_surf(cmd_buffer->device, scratch_pool,
                                               total_scratch);
      cfe.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
#if GFX_VER >= 20
      switch (cmd_buffer->device->physical->instance->stack_ids) {
      case 256:  cfe.StackIDControl = StackIDs256;  break;
      case 512:  cfe.StackIDControl = StackIDs512;  break;
      case 1024: cfe.StackIDControl = StackIDs1024; break;
      case 2048: cfe.StackIDControl = StackIDs2048; break;
      default:   unreachable("invalid stack_ids value");
      }
#endif

      cfe.OverDispatchControl = 2; /* 50% overdispatch */
   }

   comp_state->scratch_size = total_scratch;
#else
   unreachable("Invalid call");
#endif
}
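
/* Note: scratch programming is monotonic within a command buffer. The early
 * return above means CFE_STATE is only re-emitted when a dispatch needs more
 * scratch than has already been programmed, so callers can unconditionally
 * pass each pipeline's total_scratch (see the call in
 * genX(cmd_buffer_flush_compute_state) below).
 */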

static void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(comp_state->base.pipeline);
   const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;

   assert(pipeline->cs);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
#if GFX_VERx10 < 125
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

#if GFX_VERx10 >= 125
      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
      genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
#endif

      /* The workgroup size of the pipeline affects our push constant layout
       * so flag push constants as dirty if we change the pipeline.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      comp_state->base.push_constants_data_dirty = true;
   }

   cmd_buffer->state.descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.compute.base,
                                              &pipeline->base);

   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
                                             &cmd_buffer->state.compute.base,
                                             VK_SHADER_STAGE_COMPUTE_BIT,
                                             &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength        = size;
         mid.InterfaceDescriptorDataStartAddress   = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {

      if (comp_state->base.push_constants_state.alloc_size == 0 ||
          comp_state->base.push_constants_data_dirty) {
         comp_state->base.push_constants_state =
            anv_cmd_buffer_cs_push_constants(cmd_buffer);
         comp_state->base.push_constants_data_dirty = false;
      }

#if GFX_VERx10 < 125
      if (comp_state->base.push_constants_state.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength    = comp_state->base.push_constants_state.alloc_size;
            curbe.CURBEDataStartAddress   = comp_state->base.push_constants_state.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

static void
anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t baseGroupX,
                                  uint32_t baseGroupY,
                                  uint32_t baseGroupZ)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      cmd_buffer->state.compute.base.push_constants_data_dirty = true;
   }
}

#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508
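
/* MMIO offsets of the GPGPU dispatch dimension registers. For indirect
 * dispatches these are programmed from memory with MI commands and consumed
 * by the walker when IndirectParameterEnable is set. With the mi_builder, a
 * store from memory into one of these registers, e.g.
 *
 *    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), mi_mem32(addr));
 *
 * is expected to land in the batch as an MI_LOAD_REGISTER_MEM.
 */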

static void
compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
}
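
/* The 0/4/8 byte offsets read above match the layout of
 * VkDispatchIndirectCommand:
 *
 *    typedef struct VkDispatchIndirectCommand {
 *       uint32_t x;
 *       uint32_t y;
 *       uint32_t z;
 *    } VkDispatchIndirectCommand;
 *
 * so a vkCmdDispatchIndirect() buffer can be consumed without repacking.
 * compute_store_indirect_params() below is the mirror image, capturing the
 * current dispatch registers back into memory.
 */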

static void
compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
   mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
   mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
}


#if GFX_VERx10 >= 125

static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_shader_bin *shader,
                              const struct brw_cs_prog_data *prog_data,
                              const struct intel_cs_dispatch_info *dispatch)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;

   return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
      .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
      .KernelStartPointer = shader->kernel.offset,
      .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : 1 + MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
      .PreferredSLMAllocationSize =
         intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                      prog_data->base.total_shared,
                                                      dispatch->group_size,
                                                      dispatch->simd_size),
      .NumberOfBarriers = prog_data->uses_barrier,
   };
}
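
/* Note on the SamplerCount packing above: the field counts in units of 4
 * samplers (0 = none, 1 = 1-4, ..., 4 = 13-16), hence
 * DIV_ROUND_UP(CLAMP(n, 0, 16), 4). For example, a shader binding 5 samplers
 * encodes as 2.
 */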

static inline void
emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_shader_bin *shader,
                             const struct brw_cs_prog_data *prog_data,
                             struct anv_address indirect_addr)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   assert(devinfo->has_indirect_unroll);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
   const int dispatch_size = dispatch.simd_size / 16;

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize                 = dispatch_size,
      .MessageSIMD              = dispatch_size,
      .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
      .IndirectDataLength       = comp_state->base.push_constants_state.alloc_size,
      .GenerateLocalID          = prog_data->generate_local_id != 0,
      .EmitLocal                = prog_data->generate_local_id,
      .WalkOrder                = prog_data->walk_order,
      .TileLayout               = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                                  TileY32bpe : Linear,
      .LocalXMaximum            = prog_data->local_size[0] - 1,
      .LocalYMaximum            = prog_data->local_size[1] - 1,
      .LocalZMaximum            = prog_data->local_size[2] - 1,
      .ExecutionMask            = dispatch.right_mask,
      .PostSync.MOCS            = anv_mocs(cmd_buffer->device, NULL, 0),
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer, shader, prog_data,
                                       &dispatch),
   };

   cmd_buffer->state.last_indirect_dispatch =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(EXECUTE_INDIRECT_DISPATCH_length),
         GENX(EXECUTE_INDIRECT_DISPATCH),
         .PredicateEnable            = predicate,
         .MaxCount                   = 1,
         .COMPUTE_WALKER_BODY        = body,
         .ArgumentBufferStartAddress = indirect_addr,
         .MOCS                       = anv_mocs(cmd_buffer->device,
                                                indirect_addr.bo, 0),
      );
}

static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline, bool indirect,
                    const struct brw_cs_prog_data *prog_data,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   cmd_buffer->state.last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable        = indirect,
         .PredicateEnable                = predicate,
         .SIMDSize                       = dispatch.simd_size / 16,
         .MessageSIMD                    = dispatch.simd_size / 16,
         .IndirectDataStartAddress       = comp_state->base.push_constants_state.offset,
         .IndirectDataLength             = comp_state->base.push_constants_state.alloc_size,
#if GFX_VERx10 == 125
         .SystolicModeEnable             = prog_data->uses_systolic,
#endif
         .GenerateLocalID                = prog_data->generate_local_id != 0,
         .EmitLocal                      = prog_data->generate_local_id,
         .WalkOrder                      = prog_data->walk_order,
         .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                       TileY32bpe : Linear,
         .LocalXMaximum                  = prog_data->local_size[0] - 1,
         .LocalYMaximum                  = prog_data->local_size[1] - 1,
         .LocalZMaximum                  = prog_data->local_size[2] - 1,
         .ThreadGroupIDXDimension        = groupCountX,
         .ThreadGroupIDYDimension        = groupCountY,
         .ThreadGroupIDZDimension        = groupCountZ,
         .ExecutionMask                  = dispatch.right_mask,
         .PostSync                       = {
            .MOCS                        = anv_mocs(pipeline->base.device, NULL, 0),
         },
         .InterfaceDescriptor =
            get_interface_descriptor_data(cmd_buffer, pipeline->cs,
                                          prog_data, &dispatch),
      );
}
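
/* SIMDSize/MessageSIMD above use the hardware encoding simd_size / 16, so
 * SIMD16 packs as 1 and SIMD32 as 2. ExecutionMask (dispatch.right_mask) is,
 * roughly, the lane-enable mask for the final, possibly partial, SIMD thread
 * of a thread group; e.g. a SIMD32 kernel whose last thread is full would use
 * 0xffffffff.
 */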

#else /* #if GFX_VERx10 >= 125 */

static inline void
emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
                  const struct anv_compute_pipeline *pipeline, bool indirect,
                  const struct brw_cs_prog_data *prog_data,
                  uint32_t groupCountX, uint32_t groupCountY,
                  uint32_t groupCountZ)
{
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable      = indirect;
      ggw.PredicateEnable              = predicate;
      ggw.SIMDSize                     = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension      = groupCountX;
      ggw.ThreadGroupIDYDimension      = groupCountY;
      ggw.ThreadGroupIDZDimension      = groupCountZ;
      ggw.RightExecutionMask           = dispatch.right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}

#endif /* #if GFX_VERx10 >= 125 */

static inline void
emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
               const struct anv_compute_pipeline *pipeline,
               const struct brw_cs_prog_data *prog_data,
               struct anv_address indirect_addr,
               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
   bool is_indirect = !anv_address_is_null(indirect_addr);

#if GFX_VERx10 >= 125
   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
      emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                   indirect_addr);
      return;
   }
#endif

   if (is_indirect)
      compute_load_indirect_params(cmd_buffer, indirect_addr);

#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                       groupCountX, groupCountY, groupCountZ);
#else
   emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                     groupCountX, groupCountY, groupCountZ);
#endif
}
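
/* Dispatch path selection, summarized:
 *
 *    indirect, Gfx12.5+ with indirect unroll -> EXECUTE_INDIRECT_DISPATCH
 *    indirect otherwise                      -> MI load of the dispatch
 *                                               registers, then a walker with
 *                                               IndirectParameterEnable set
 *    direct                                  -> COMPUTE_WALKER (Gfx12.5+) or
 *                                               GPGPU_WALKER
 */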

void genX(CmdDispatchBase)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    baseGroupX,
    uint32_t                                    baseGroupY,
    uint32_t                                    baseGroupZ,
    uint32_t                                    groupCountX,
    uint32_t                                    groupCountY,
    uint32_t                                    groupCountZ)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
                                     baseGroupY, baseGroupZ);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   trace_intel_begin_compute(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      struct anv_state state =
         anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = groupCountX;
      sizes[1] = groupCountY;
      sizes[2] = groupCountZ;
      cmd_buffer->state.compute.num_workgroups =
         anv_cmd_buffer_temporary_state_address(cmd_buffer, state);

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data,
                  ANV_NULL_ADDRESS /* no indirect data */,
                  groupCountX, groupCountY, groupCountZ);

   trace_intel_end_compute(&cmd_buffer->trace,
                           groupCountX, groupCountY, groupCountZ);
}

void genX(CmdDispatchIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_address addr = anv_address_add(buffer->address, offset);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;

   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);
   trace_intel_begin_compute_indirect(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      cmd_buffer->state.compute.num_workgroups = addr;

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);

   trace_intel_end_compute_indirect(&cmd_buffer->trace,
                                    anv_address_utrace(addr));
}

struct anv_address
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
   struct anv_device *device = cmd_buffer->device;

   struct anv_state state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
   struct brw_rt_scratch_layout layout;
   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                       * some cases?
                                       */
   brw_rt_compute_scratch_layout(&layout, device->info,
                                 stack_ids_per_dss, 1 << 10);

   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         /* The ray query HW computes offsets from the top of the buffer, so
          * set the address to the end of the buffer.
          */
         .bo = device->ray_query_bo,
         .offset = device->ray_query_bo->size
      },
      .AsyncRTStackSize = layout.ray_stack_stride / 64,
      .NumDSSRTStacks = layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .ResumeShaderTable = (struct anv_address) {
         .bo = cmd_buffer->state.ray_query_shadow_bo,
      },
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);

   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
#else
   unreachable("Not supported");
#endif
}

#if GFX_VERx10 >= 125
void
genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_kernel *kernel,
                                 const uint32_t *global_size,
                                 uint32_t arg_count,
                                 const struct anv_kernel_arg *args)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(kernel->bin->prog_data);

   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
   indirect_data_size = ALIGN(indirect_data_size, 64);
   struct anv_state indirect_data =
      anv_cmd_buffer_alloc_general_state(cmd_buffer,
                                         indirect_data_size, 64);
   memset(indirect_data.map, 0, indirect_data.alloc_size);

   struct brw_kernel_sysvals sysvals = {};
   if (global_size != NULL) {
      for (unsigned i = 0; i < 3; i++)
         sysvals.num_work_groups[i] = global_size[i];
      memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
   } else {
      struct anv_address sysvals_addr = {
         .bo = NULL, /* General state buffer is always 0. */
         .offset = indirect_data.offset,
      };

      compute_store_indirect_params(cmd_buffer, sysvals_addr);
   }

   void *args_map = indirect_data.map + sizeof(sysvals);
   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
      struct brw_kernel_arg_desc *arg_desc =
         &kernel->bin->bind_map.kernel_args[i];
      assert(i < arg_count);
      const struct anv_kernel_arg *arg = &args[i];
      if (arg->is_ptr) {
         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
      } else {
         assert(arg_desc->size <= sizeof(arg->u64));
         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
      }
   }

   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.PredicateEnable                = false;
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.MessageSIMD                    = dispatch.simd_size / 16;
      cw.IndirectDataStartAddress       = indirect_data.offset;
      cw.IndirectDataLength             = indirect_data.alloc_size;
      cw.LocalXMaximum                  = cs_prog_data->local_size[0] - 1;
      cw.LocalYMaximum                  = cs_prog_data->local_size[1] - 1;
      cw.LocalZMaximum                  = cs_prog_data->local_size[2] - 1;
      cw.ExecutionMask                  = dispatch.right_mask;
      cw.PostSync.MOCS                  = cmd_buffer->device->isl_dev.mocs.internal;

      if (global_size != NULL) {
         cw.ThreadGroupIDXDimension     = global_size[0];
         cw.ThreadGroupIDYDimension     = global_size[1];
         cw.ThreadGroupIDZDimension     = global_size[2];
      } else {
         cw.IndirectParameterEnable     = true;
      }

      cw.InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer,
                                       kernel->bin,
                                       cs_prog_data,
                                       &dispatch);
   }

   /* We just blew away the compute pipeline state */
   cmd_buffer->state.compute.pipeline_dirty = true;
}
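
/* The indirect data block built by genX(cmd_buffer_dispatch_kernel) is laid
 * out as:
 *
 *    [0, sizeof(struct brw_kernel_sysvals))  sysvals (num_work_groups[3])
 *    [sizeof(sysvals), ...)                  kernel arguments at the offsets
 *                                            given by the bind map
 *
 * rounded up to a 64-byte multiple, matching the IndirectDataStartAddress /
 * IndirectDataLength pair programmed in the COMPUTE_WALKER above.
 */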

static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while (progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}
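
/* Worked example for calc_local_trace_size(): global = { 5, 2, 1 } yields
 * local_shift = { 2, 1, 0 }, i.e. a 4x2x1 local group of 2^3 = 8
 * invocations: each dimension is widened round-robin while it is still
 * smaller than its global size, and any leftover shift goes to x (so
 * global = { 1, 1, 1 } yields an 8x1x1 group).
 */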

static struct GENX(RT_SHADER_TABLE)
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
   return (struct GENX(RT_SHADER_TABLE)) {
      .BaseAddress = anv_address_from_u64(region->deviceAddress),
      .Stride = region->stride,
   };
}

struct trace_params {
   /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
    * with mi_builder.
    */
   bool is_sbt_indirect;
   const VkStridedDeviceAddressRegionKHR *raygen_sbt;
   const VkStridedDeviceAddressRegionKHR *miss_sbt;
   const VkStridedDeviceAddressRegionKHR *hit_sbt;
   const VkStridedDeviceAddressRegionKHR *callable_sbt;

   /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
   uint64_t indirect_sbts_addr;

   /* If is_launch_size_indirect, use launch_size_addr to program the
    * dispatch size.
    */
   bool is_launch_size_indirect;
   uint32_t launch_size[3];

   /* A pointer to a uint32_t[3] */
   uint64_t launch_size_addr;
};

static struct anv_state
cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
                                    struct trace_params *params)
{
   assert(!params->is_sbt_indirect);
   assert(params->miss_sbt != NULL);
   assert(params->hit_sbt != NULL);
   assert(params->callable_sbt != NULL);

   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_PUSH_CONST_OFFSET +
                                           sizeof(struct anv_push_constants),
                                           64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress     = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler   = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
      .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
      .HitGroupTable      = vk_sdar_to_shader_table(params->hit_sbt),
      .MissGroupTable     = vk_sdar_to_shader_table(params->miss_sbt),
      .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
      .LaunchWidth        = params->launch_size[0],
      .LaunchHeight       = params->launch_size[1],
      .LaunchDepth        = params->launch_size[2],
      .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   return rtdg_state;
}

static struct mi_value
mi_build_sbt_entry(struct mi_builder *b,
                   uint64_t addr_field_addr,
                   uint64_t stride_field_addr)
{
   return mi_ior(b,
                 mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
                            mi_imm(BITFIELD64_BIT(49) - 1)),
                 mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
                                48));
}
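
/* mi_build_sbt_entry() assembles, at command execution time, the same 64-bit
 * address/stride packing that vk_sdar_to_shader_table() produces on the CPU:
 * the shader table address is masked to its low 49 bits and the 32-bit
 * stride value is shifted up by 48 so that, within the 64-bit result, only
 * its low 16 bits survive in the high word.
 */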

static struct anv_state
cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
                                             struct trace_params *params)
{
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_PUSH_CONST_OFFSET +
                                           sizeof(struct anv_push_constants),
                                           64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress     = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler   = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
      .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
      .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   struct anv_address rtdg_addr =
      anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
   mi_builder_set_mocs(&b, mocs);
   mi_builder_set_write_check(&b, true);

   /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
    * RT_DISPATCH_GLOBALS using the mi_builder.
    */
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableStride)));

   return rtdg_state;
}

static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
                      struct trace_params *params)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
   struct anv_ray_tracing_pipeline *pipeline =
      anv_pipeline_to_ray_tracing(rt->base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* If we have a known degenerate launch size, just bail */
   if (!params->is_launch_size_indirect &&
       (params->launch_size[0] == 0 ||
        params->launch_size[1] == 0 ||
        params->launch_size[2] == 0))
      return;

   trace_intel_begin_rays(&cmd_buffer->trace);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   cmd_buffer->state.rt.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                           &cmd_buffer->state.rt.base,
                                           &pipeline->base);

   /* Add these to the reloc list as they're internal buffers that don't
    * actually have relocs to pick them up manually.
    *
    * TODO(RT): This is a bit of a hack
    */
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         rt->scratch.bo);
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         cmd_buffer->device->btd_fifo_bo);

   /* Allocate and set up our RT_DISPATCH_GLOBALS */
   struct anv_state rtdg_state =
      params->is_sbt_indirect ?
      cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
      cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);

   assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
                                    sizeof(struct anv_push_constants)));
   assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   /* Push constants go after the RT_DISPATCH_GLOBALS */
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr =
      anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (params->is_launch_size_indirect) {
      /* Pick a local size that's probably ok.  We assume most TraceRays calls
       * will use a two-dimensional dispatch size.  Worst case, our initial
       * dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
      mi_builder_set_mocs(&b, mocs);
      mi_builder_set_write_check(&b, true);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;
         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has
          * the semantics of shifting the entire 64-bit value and taking the
          * bottom 32 bits, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                      mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                            local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);

   } else {
      calc_local_trace_size(local_size_log2, params->launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because the value DIV_ROUND_UP
          * adds to the numerator may overflow.  Cast to uint64_t to avoid
          * this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
      }
   }
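
   /* Example of the overflow the cast above avoids: launch_size[i] ==
    * 0xffffffff with local_size == 4 would wrap to 2 in 32-bit arithmetic
    * before the division, giving 0 instead of the correct 0x40000000.
    */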

#if GFX_VERx10 == 125
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(device->info) &&
       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
      /* TODO: This is the timeout after which the bucketed thread dispatcher
       *       will kick off a wave of threads. We go with the lowest value
       *       for now. It could be tweaked on a per application basis
       *       (drirc).
       */
      btd.DispatchTimeoutCounter = _64clocks;
      /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
       *               buffer must be 128KB."
       */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
      if (pipeline->base.scratch_size > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   pipeline->base.scratch_size);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
                                      pipeline->base.scratch_size);
         btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
      }
#if INTEL_NEEDS_WA_14017794102
      btd.BTDMidthreadpreemption = false;
#endif
   }

   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);

   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable        = params->is_launch_size_indirect;
      cw.PredicateEnable                = cmd_buffer->state.conditional_render_enabled;
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.MessageSIMD                    = dispatch.simd_size / 16;
      cw.LocalXMaximum                  = (1 << local_size_log2[0]) - 1;
      cw.LocalYMaximum                  = (1 << local_size_log2[1]) - 1;
      cw.LocalZMaximum                  = (1 << local_size_log2[2]) - 1;
      cw.ThreadGroupIDXDimension        = global_size[0];
      cw.ThreadGroupIDYDimension        = global_size[1];
      cw.ThreadGroupIDZDimension        = global_size[2];
      cw.ExecutionMask                  = 0xff;
      cw.EmitInlineParameter            = true;
      cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);

      const gl_shader_stage s = MESA_SHADER_RAYGEN;
      struct anv_device *device = cmd_buffer->device;
      struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
      struct anv_state *samplers = &cmd_buffer->state.samplers[s];
      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
#if INTEL_NEEDS_WA_14017794102
         .ThreadPreemption = false,
#endif
      };

      struct brw_rt_raygen_trampoline_params trampoline_params = {
         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
         .raygen_bsr_addr =
            params->is_sbt_indirect ?
            (params->indirect_sbts_addr +
             offsetof(VkTraceRaysIndirectCommand2KHR,
                      raygenShaderRecordAddress)) :
            params->raygen_sbt->deviceAddress,
         .is_indirect = params->is_sbt_indirect,
         .local_group_size_log2 = {
            local_size_log2[0],
            local_size_log2[1],
            local_size_log2[2],
         },
      };
      STATIC_ASSERT(sizeof(trampoline_params) == 32);
      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
   }

   trace_intel_end_rays(&cmd_buffer->trace,
                        params->launch_size[0],
                        params->launch_size[1],
                        params->launch_size[2]);
}

void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    uint32_t                                    width,
    uint32_t                                    height,
    uint32_t                                    depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = false,
      .raygen_sbt              = pRaygenShaderBindingTable,
      .miss_sbt                = pMissShaderBindingTable,
      .hit_sbt                 = pHitShaderBindingTable,
      .callable_sbt            = pCallableShaderBindingTable,
      .is_launch_size_indirect = false,
      .launch_size             = {
         width,
         height,
         depth,
      },
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = false,
      .raygen_sbt              = pRaygenShaderBindingTable,
      .miss_sbt                = pMissShaderBindingTable,
      .hit_sbt                 = pHitShaderBindingTable,
      .callable_sbt            = pCallableShaderBindingTable,
      .is_launch_size_indirect = true,
      .launch_size_addr        = indirectDeviceAddress,
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirect2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = true,
      .indirect_sbts_addr      = indirectDeviceAddress,
      .is_launch_size_indirect = true,
      .launch_size_addr        = indirectDeviceAddress +
                                 offsetof(VkTraceRaysIndirectCommand2KHR, width),
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

#endif /* GFX_VERx10 >= 125 */