/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"

#include "common/intel_compute_slm.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

#include "genX_mi_builder.h"

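/* Emit CFE_STATE (Gfx12.5+ only), which programs the compute scratch buffer,
 * the maximum thread count and the over-dispatch control. We track the
 * largest scratch size programmed so far on this command buffer and only
 * re-emit the packet when the required scratch space grows.
 */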
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t total_scratch)
{
#if GFX_VERx10 >= 125
   assert(cmd_buffer->state.current_pipeline == GPGPU);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;

   if (total_scratch <= comp_state->scratch_size)
      return;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads = devinfo->max_cs_threads * devinfo->subslice_total;

      uint32_t scratch_surf;
      struct anv_scratch_pool *scratch_pool =
         (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?
         &cmd_buffer->device->protected_scratch_pool :
         &cmd_buffer->device->scratch_pool;
      struct anv_bo *scratch_bo =
         anv_scratch_pool_alloc(cmd_buffer->device, scratch_pool,
                                MESA_SHADER_COMPUTE,
                                total_scratch);
      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, scratch_bo);
      scratch_surf = anv_scratch_pool_get_surf(cmd_buffer->device, scratch_pool,
                                               total_scratch);
      cfe.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
#if GFX_VER >= 20
      switch (cmd_buffer->device->physical->instance->stack_ids) {
      case 256:  cfe.StackIDControl = StackIDs256;  break;
      case 512:  cfe.StackIDControl = StackIDs512;  break;
      case 1024: cfe.StackIDControl = StackIDs1024; break;
      case 2048: cfe.StackIDControl = StackIDs2048; break;
      default:   unreachable("invalid stack_ids value");
      }
#endif

      cfe.OverDispatchControl = 2; /* 50% overdispatch */
   }

   comp_state->scratch_size = total_scratch;
#else
   unreachable("Invalid call");
#endif
}

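/* Flush all compute state required before a dispatch: L3 configuration,
 * GPGPU pipeline selection, the pipeline batch (and CFE state on Gfx12.5+),
 * descriptor sets and push constants.
 */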
static void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(comp_state->base.pipeline);
   const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;

   assert(pipeline->cs);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have. We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
#if GFX_VERx10 < 125
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *     the only bits that are changed are scoreboard related: Scoreboard
       *     Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *     these scoreboard related states, a MEDIA_STATE_FLUSH is
       *     sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

#if GFX_VERx10 >= 125
      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
      genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
#endif

      /* The workgroup size of the pipeline affects our push constant layout
       * so flag push constants as dirty if we change the pipeline.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      comp_state->base.push_constants_data_dirty = true;
   }

   cmd_buffer->state.descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.compute.base,
                                              &pipeline->base);

   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
                                             &cmd_buffer->state.compute.base,
                                             VK_SHADER_STAGE_COMPUTE_BIT,
                                             &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength = size;
         mid.InterfaceDescriptorDataStartAddress = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {

      if (comp_state->base.push_constants_state.alloc_size == 0 ||
          comp_state->base.push_constants_data_dirty) {
         comp_state->base.push_constants_state =
            anv_cmd_buffer_cs_push_constants(cmd_buffer);
         comp_state->base.push_constants_data_dirty = false;
      }

#if GFX_VERx10 < 125
      if (comp_state->base.push_constants_state.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength = comp_state->base.push_constants_state.alloc_size;
            curbe.CURBEDataStartAddress = comp_state->base.push_constants_state.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

static void
anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t baseGroupX,
                                  uint32_t baseGroupY,
                                  uint32_t baseGroupZ)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      cmd_buffer->state.compute.base.push_constants_data_dirty = true;
   }
}

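/* MMIO offsets of the GPGPU dispatch dimension registers. Indirect dispatches
 * load these registers and then emit a walker command with
 * IndirectParameterEnable set.
 */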
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

static void
compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
}

static void
compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
   mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
   mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
}


#if GFX_VERx10 >= 125

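/* Build the INTERFACE_DESCRIPTOR_DATA that gets embedded in COMPUTE_WALKER
 * (or EXECUTE_INDIRECT_DISPATCH), including the SLM size encodings and the
 * thread count for the chosen dispatch width.
 */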
static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_shader_bin *shader,
                              const struct brw_cs_prog_data *prog_data,
                              const struct intel_cs_dispatch_info *dispatch)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;

   return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
      .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
      .KernelStartPointer = shader->kernel.offset,
      .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : 1 + MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
      .PreferredSLMAllocationSize =
         intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                      prog_data->base.total_shared,
                                                      dispatch->group_size,
                                                      dispatch->simd_size),
      .NumberOfBarriers = prog_data->uses_barrier,
   };
}

static inline void
emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_shader_bin *shader,
                             const struct brw_cs_prog_data *prog_data,
                             struct anv_address indirect_addr)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   assert(devinfo->has_indirect_unroll);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
   const int dispatch_size = dispatch.simd_size / 16;

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch_size,
      .MessageSIMD = dispatch_size,
      .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
      .IndirectDataLength = comp_state->base.push_constants_state.alloc_size,
      .GenerateLocalID = prog_data->generate_local_id != 0,
      .EmitLocal = prog_data->generate_local_id,
      .WalkOrder = prog_data->walk_order,
      .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                    TileY32bpe : Linear,
      .LocalXMaximum = prog_data->local_size[0] - 1,
      .LocalYMaximum = prog_data->local_size[1] - 1,
      .LocalZMaximum = prog_data->local_size[2] - 1,
      .ExecutionMask = dispatch.right_mask,
      .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer, shader, prog_data,
                                       &dispatch),
   };

   cmd_buffer->state.last_indirect_dispatch =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(EXECUTE_INDIRECT_DISPATCH_length),
         GENX(EXECUTE_INDIRECT_DISPATCH),
         .PredicateEnable = predicate,
         .MaxCount = 1,
         .COMPUTE_WALKER_BODY = body,
         .ArgumentBufferStartAddress = indirect_addr,
         .MOCS = anv_mocs(cmd_buffer->device,
                          indirect_addr.bo, 0),
      );
}

static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline, bool indirect,
                    const struct brw_cs_prog_data *prog_data,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   cmd_buffer->state.last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable = indirect,
         .PredicateEnable = predicate,
         .SIMDSize = dispatch.simd_size / 16,
         .MessageSIMD = dispatch.simd_size / 16,
         .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
         .IndirectDataLength = comp_state->base.push_constants_state.alloc_size,
#if GFX_VERx10 == 125
         .SystolicModeEnable = prog_data->uses_systolic,
#endif
         .GenerateLocalID = prog_data->generate_local_id != 0,
         .EmitLocal = prog_data->generate_local_id,
         .WalkOrder = prog_data->walk_order,
         .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                       TileY32bpe : Linear,
         .LocalXMaximum = prog_data->local_size[0] - 1,
         .LocalYMaximum = prog_data->local_size[1] - 1,
         .LocalZMaximum = prog_data->local_size[2] - 1,
         .ThreadGroupIDXDimension = groupCountX,
         .ThreadGroupIDYDimension = groupCountY,
         .ThreadGroupIDZDimension = groupCountZ,
         .ExecutionMask = dispatch.right_mask,
         .PostSync = {
            .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
         },
         .InterfaceDescriptor =
            get_interface_descriptor_data(cmd_buffer, pipeline->cs,
                                          prog_data, &dispatch),
      );
}

#else /* #if GFX_VERx10 >= 125 */

static inline void
emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
                  const struct anv_compute_pipeline *pipeline, bool indirect,
                  const struct brw_cs_prog_data *prog_data,
                  uint32_t groupCountX, uint32_t groupCountY,
                  uint32_t groupCountZ)
{
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = indirect;
      ggw.PredicateEnable = predicate;
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = groupCountX;
      ggw.ThreadGroupIDYDimension = groupCountY;
      ggw.ThreadGroupIDZDimension = groupCountZ;
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}

#endif /* #if GFX_VERx10 >= 125 */

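/* Emit the walker for a dispatch. On Gfx12.5+ with indirect-unroll support,
 * indirect dispatches go through EXECUTE_INDIRECT_DISPATCH; otherwise the
 * dispatch dimensions are loaded into the GPGPU_DISPATCHDIM* registers and a
 * regular COMPUTE_WALKER/GPGPU_WALKER is emitted with
 * IndirectParameterEnable.
 */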
static inline void
emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
               const struct anv_compute_pipeline *pipeline,
               const struct brw_cs_prog_data *prog_data,
               struct anv_address indirect_addr,
               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
   bool is_indirect = !anv_address_is_null(indirect_addr);

#if GFX_VERx10 >= 125
   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
      emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                   indirect_addr);
      return;
   }
#endif

   if (is_indirect)
      compute_load_indirect_params(cmd_buffer, indirect_addr);

#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                       groupCountX, groupCountY, groupCountZ);
#else
   emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                     groupCountX, groupCountY, groupCountZ);
#endif
}

void genX(CmdDispatchBase)(
    VkCommandBuffer commandBuffer,
    uint32_t baseGroupX,
    uint32_t baseGroupY,
    uint32_t baseGroupZ,
    uint32_t groupCountX,
    uint32_t groupCountY,
    uint32_t groupCountZ)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
                                     baseGroupY, baseGroupZ);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   trace_intel_begin_compute(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      struct anv_state state =
         anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = groupCountX;
      sizes[1] = groupCountY;
      sizes[2] = groupCountZ;
      cmd_buffer->state.compute.num_workgroups =
         anv_cmd_buffer_temporary_state_address(cmd_buffer, state);

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data,
                  ANV_NULL_ADDRESS /* no indirect data */,
                  groupCountX, groupCountY, groupCountZ);

   trace_intel_end_compute(&cmd_buffer->trace,
                           groupCountX, groupCountY, groupCountZ);
}

void genX(CmdDispatchIndirect)(
    VkCommandBuffer commandBuffer,
    VkBuffer _buffer,
    VkDeviceSize offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_address addr = anv_address_add(buffer->address, offset);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;

   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);
   trace_intel_begin_compute_indirect(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      cmd_buffer->state.compute.num_workgroups = addr;

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);

   trace_intel_end_compute_indirect(&cmd_buffer->trace,
                                    anv_address_utrace(addr));
}

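/* Allocate and pack an RT_DISPATCH_GLOBALS for ray queries, pointing at the
 * device ray-query BO. The scratch layout is computed for a fixed number of
 * stack IDs per DSS (see the TODO below).
 */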
struct anv_address
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
   struct anv_device *device = cmd_buffer->device;

   struct anv_state state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
   struct brw_rt_scratch_layout layout;
   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                       * some cases?
                                       */
   brw_rt_compute_scratch_layout(&layout, device->info,
                                 stack_ids_per_dss, 1 << 10);

   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         /* The ray query HW computes offsets from the top of the buffer, so
          * let the address be the end of the buffer.
          */
         .bo = device->ray_query_bo,
         .offset = device->ray_query_bo->size
      },
      .AsyncRTStackSize = layout.ray_stack_stride / 64,
      .NumDSSRTStacks = layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .ResumeShaderTable = (struct anv_address) {
         .bo = cmd_buffer->state.ray_query_shadow_bo,
      },
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);

   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
#else
   unreachable("Not supported");
#endif
}

#if GFX_VERx10 >= 125
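/* Dispatch an anv_kernel (a driver-built compute shader) outside the normal
 * Vulkan pipeline path. The sysvals and kernel arguments are packed into a
 * single indirect-data allocation read by COMPUTE_WALKER. When global_size is
 * NULL, the dispatch dimensions come from the GPGPU_DISPATCHDIM* registers
 * and are also written back into the sysvals.
 */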
void
genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_kernel *kernel,
                                 const uint32_t *global_size,
                                 uint32_t arg_count,
                                 const struct anv_kernel_arg *args)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(kernel->bin->prog_data);

   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have. We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
   indirect_data_size = ALIGN(indirect_data_size, 64);
   struct anv_state indirect_data =
      anv_cmd_buffer_alloc_general_state(cmd_buffer,
                                         indirect_data_size, 64);
   memset(indirect_data.map, 0, indirect_data.alloc_size);

   struct brw_kernel_sysvals sysvals = {};
   if (global_size != NULL) {
      for (unsigned i = 0; i < 3; i++)
         sysvals.num_work_groups[i] = global_size[i];
      memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
   } else {
      struct anv_address sysvals_addr = {
         .bo = NULL, /* General state buffer is always 0. */
         .offset = indirect_data.offset,
      };

      compute_store_indirect_params(cmd_buffer, sysvals_addr);
   }

   void *args_map = indirect_data.map + sizeof(sysvals);
   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
      struct brw_kernel_arg_desc *arg_desc =
         &kernel->bin->bind_map.kernel_args[i];
      assert(i < arg_count);
      const struct anv_kernel_arg *arg = &args[i];
      if (arg->is_ptr) {
         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
      } else {
         assert(arg_desc->size <= sizeof(arg->u64));
         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
      }
   }

   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.PredicateEnable = false;
      cw.SIMDSize = dispatch.simd_size / 16;
      cw.MessageSIMD = dispatch.simd_size / 16;
      cw.IndirectDataStartAddress = indirect_data.offset;
      cw.IndirectDataLength = indirect_data.alloc_size;
      cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
      cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
      cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
      cw.ExecutionMask = dispatch.right_mask;
      cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal;

      if (global_size != NULL) {
         cw.ThreadGroupIDXDimension = global_size[0];
         cw.ThreadGroupIDYDimension = global_size[1];
         cw.ThreadGroupIDZDimension = global_size[2];
      } else {
         cw.IndirectParameterEnable = true;
      }

      cw.InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer,
                                       kernel->bin,
                                       cs_prog_data,
                                       &dispatch);
   }

   /* We just blew away the compute pipeline state */
   cmd_buffer->state.compute.pipeline_dirty = true;
}

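/* Pick a local trace group of 2^3 = 8 invocations, distributing the three
 * doublings across dimensions; a dimension only grows while its local size is
 * still smaller than the global size, and any leftover goes to X. For example
 * (illustrative), a launch of {1920, 1080, 1} yields shifts of {2, 1, 0},
 * i.e. a 4x2x1 local group.
 */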
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while(progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}

static struct GENX(RT_SHADER_TABLE)
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
   return (struct GENX(RT_SHADER_TABLE)) {
      .BaseAddress = anv_address_from_u64(region->deviceAddress),
      .Stride = region->stride,
   };
}

struct trace_params {
   /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
    * with mi_builder.
    */
   bool is_sbt_indirect;
   const VkStridedDeviceAddressRegionKHR *raygen_sbt;
   const VkStridedDeviceAddressRegionKHR *miss_sbt;
   const VkStridedDeviceAddressRegionKHR *hit_sbt;
   const VkStridedDeviceAddressRegionKHR *callable_sbt;

   /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
   uint64_t indirect_sbts_addr;

   /* If is_launch_size_indirect, use launch_size_addr to program the
    * dispatch size.
    */
   bool is_launch_size_indirect;
   uint32_t launch_size[3];

   /* A pointer to a uint32_t[3] */
   uint64_t launch_size_addr;
};

static struct anv_state
cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
                                    struct trace_params *params)
{
   assert(!params->is_sbt_indirect);
   assert(params->miss_sbt != NULL);
   assert(params->hit_sbt != NULL);
   assert(params->callable_sbt != NULL);

   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_PUSH_CONST_OFFSET +
                                           sizeof(struct anv_push_constants),
                                           64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .HitGroupTable = vk_sdar_to_shader_table(params->hit_sbt),
      .MissGroupTable = vk_sdar_to_shader_table(params->miss_sbt),
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
      .LaunchWidth = params->launch_size[0],
      .LaunchHeight = params->launch_size[1],
      .LaunchDepth = params->launch_size[2],
      .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   return rtdg_state;
}

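/* Build a packed 64-bit shader-table entry (base address in the low bits,
 * stride shifted into the high bits, mirroring RT_SHADER_TABLE) on the
 * command streamer with MI math, reading both fields from the indirect
 * buffer.
 */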
static struct mi_value
mi_build_sbt_entry(struct mi_builder *b,
                   uint64_t addr_field_addr,
                   uint64_t stride_field_addr)
{
   return mi_ior(b,
                 mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
                         mi_imm(BITFIELD64_BIT(49) - 1)),
                 mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
                             48));
}

static struct anv_state
cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
                                             struct trace_params *params)
{
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_PUSH_CONST_OFFSET +
                                           sizeof(struct anv_push_constants),
                                           64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   struct anv_address rtdg_addr =
      anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
   mi_builder_set_mocs(&b, mocs);
   mi_builder_set_write_check(&b, true);

   /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
    * RT_DISPATCH_GLOBALS using the mi_builder.
    */
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableStride)));

   return rtdg_state;
}

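/* Common implementation for the vkCmdTraceRays* entrypoints: sets up BTD and
 * CFE state, builds RT_DISPATCH_GLOBALS (directly, or with MI commands for
 * indirect SBTs), then launches the ray-gen trampoline with a COMPUTE_WALKER
 * whose inline data carries the trampoline parameters.
 */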
static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
                      struct trace_params *params)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
   struct anv_ray_tracing_pipeline *pipeline =
      anv_pipeline_to_ray_tracing(rt->base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* If we have a known degenerate launch size, just bail */
   if (!params->is_launch_size_indirect &&
       (params->launch_size[0] == 0 ||
        params->launch_size[1] == 0 ||
        params->launch_size[2] == 0))
      return;

   trace_intel_begin_rays(&cmd_buffer->trace);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   cmd_buffer->state.rt.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                           &cmd_buffer->state.rt.base,
                                           &pipeline->base);

   /* Add these to the reloc list as they're internal buffers that don't
    * actually have relocs to pick them up manually.
    *
    * TODO(RT): This is a bit of a hack
    */
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         rt->scratch.bo);
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         cmd_buffer->device->btd_fifo_bo);

   /* Allocate and set up our RT_DISPATCH_GLOBALS */
   struct anv_state rtdg_state =
      params->is_sbt_indirect ?
      cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
      cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);

   assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
                                    sizeof(struct anv_push_constants)));
   assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   /* Push constants go after the RT_DISPATCH_GLOBALS */
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr =
      anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (params->is_launch_size_indirect) {
      /* Pick a local size that's probably ok. We assume most TraceRays calls
       * will use a two-dimensional dispatch size. Worst case, our initial
       * dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
      mi_builder_set_mocs(&b, mocs);
      mi_builder_set_write_check(&b, true);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has
          * the semantics of shifting the entire 64-bit value and taking the
          * bottom 32 bits, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                  mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                        local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);

   } else {
      calc_local_trace_size(local_size_log2, params->launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because DIV_ROUND_UP adds to the
          * numerator, which may overflow. Cast to uint64_t to avoid this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
      }
   }

#if GFX_VERx10 == 125
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(device->info) &&
       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
      /* TODO: This is the timeout after which the bucketed thread dispatcher
       *       will kick off a wave of threads. We go with the lowest value
       *       for now. It could be tweaked on a per application basis
       *       (drirc).
       */
      btd.DispatchTimeoutCounter = _64clocks;
      /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
       * buffer must be 128KB."
       */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
      if (pipeline->base.scratch_size > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   pipeline->base.scratch_size);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
                                      pipeline->base.scratch_size);
         btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
      }
#if INTEL_NEEDS_WA_14017794102
      btd.BTDMidthreadpreemption = false;
#endif
   }

   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);

   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable = params->is_launch_size_indirect;
      cw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
      cw.SIMDSize = dispatch.simd_size / 16;
      cw.MessageSIMD = dispatch.simd_size / 16;
      cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
      cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
      cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
      cw.ThreadGroupIDXDimension = global_size[0];
      cw.ThreadGroupIDYDimension = global_size[1];
      cw.ThreadGroupIDZDimension = global_size[2];
      cw.ExecutionMask = 0xff;
      cw.EmitInlineParameter = true;
      cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);

      const gl_shader_stage s = MESA_SHADER_RAYGEN;
      struct anv_device *device = cmd_buffer->device;
      struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
      struct anv_state *samplers = &cmd_buffer->state.samplers[s];
      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
#if INTEL_NEEDS_WA_14017794102
         .ThreadPreemption = false,
#endif
      };

      struct brw_rt_raygen_trampoline_params trampoline_params = {
         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
         .raygen_bsr_addr =
            params->is_sbt_indirect ?
            (params->indirect_sbts_addr +
             offsetof(VkTraceRaysIndirectCommand2KHR,
                      raygenShaderRecordAddress)) :
            params->raygen_sbt->deviceAddress,
         .is_indirect = params->is_sbt_indirect,
         .local_group_size_log2 = {
            local_size_log2[0],
            local_size_log2[1],
            local_size_log2[2],
         },
      };
      STATIC_ASSERT(sizeof(trampoline_params) == 32);
      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
   }

   trace_intel_end_rays(&cmd_buffer->trace,
                        params->launch_size[0],
                        params->launch_size[1],
                        params->launch_size[2]);
}

void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer commandBuffer,
    const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
    uint32_t width,
    uint32_t height,
    uint32_t depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = false,
      .raygen_sbt = pRaygenShaderBindingTable,
      .miss_sbt = pMissShaderBindingTable,
      .hit_sbt = pHitShaderBindingTable,
      .callable_sbt = pCallableShaderBindingTable,
      .is_launch_size_indirect = false,
      .launch_size = {
         width,
         height,
         depth,
      },
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer commandBuffer,
    const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
    VkDeviceAddress indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = false,
      .raygen_sbt = pRaygenShaderBindingTable,
      .miss_sbt = pMissShaderBindingTable,
      .hit_sbt = pHitShaderBindingTable,
      .callable_sbt = pCallableShaderBindingTable,
      .is_launch_size_indirect = true,
      .launch_size_addr = indirectDeviceAddress,
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirect2KHR)(
    VkCommandBuffer commandBuffer,
    VkDeviceAddress indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = true,
      .indirect_sbts_addr = indirectDeviceAddress,
      .is_launch_size_indirect = true,
      .launch_size_addr = indirectDeviceAddress +
                          offsetof(VkTraceRaysIndirectCommand2KHR, width),
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

#endif /* GFX_VERx10 >= 125 */