1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include <xf86drm.h>
31
32 #include "anv_private.h"
33 #include "anv_measure.h"
34
35 #include "common/intel_debug_identifier.h"
36
37 #include "genxml/gen9_pack.h"
38 #include "genxml/genX_bits.h"
39
40 #include "util/perf/u_trace.h"
41
42 /** \file anv_batch_chain.c
43 *
44 * This file contains functions related to anv_cmd_buffer as a data
45 * structure. This involves everything required to create and destroy
46 * the actual batch buffers as well as link them together.
47 *
48 * It specifically does *not* contain any handling of actual vkCmd calls
49 * beyond vkCmdExecuteCommands.
50 */
51
52 /*-----------------------------------------------------------------------*
53 * Functions related to anv_reloc_list
54 *-----------------------------------------------------------------------*/
55
56 VkResult
57 anv_reloc_list_init(struct anv_reloc_list *list,
58 const VkAllocationCallbacks *alloc,
59 bool uses_relocs)
60 {
61 assert(alloc != NULL);
62 memset(list, 0, sizeof(*list));
63 list->uses_relocs = uses_relocs;
64 list->alloc = alloc;
65 return VK_SUCCESS;
66 }
67
68 static VkResult
69 anv_reloc_list_init_clone(struct anv_reloc_list *list,
70 const struct anv_reloc_list *other_list)
71 {
72 list->dep_words = other_list->dep_words;
73
74 if (list->dep_words > 0) {
75 list->deps =
76 vk_alloc(list->alloc, list->dep_words * sizeof(BITSET_WORD), 8,
77 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
78 memcpy(list->deps, other_list->deps,
79 list->dep_words * sizeof(BITSET_WORD));
80 } else {
81 list->deps = NULL;
82 }
83
84 return VK_SUCCESS;
85 }
86
87 void
88 anv_reloc_list_finish(struct anv_reloc_list *list)
89 {
90 vk_free(list->alloc, list->deps);
91 }
92
93 static VkResult
94 anv_reloc_list_grow_deps(struct anv_reloc_list *list,
95 uint32_t min_num_words)
96 {
97 if (min_num_words <= list->dep_words)
98 return VK_SUCCESS;
99
100 uint32_t new_length = MAX2(32, list->dep_words * 2);
101 while (new_length < min_num_words)
102 new_length *= 2;
103
104 BITSET_WORD *new_deps =
105 vk_realloc(list->alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
106 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
107 if (new_deps == NULL)
108 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
109 list->deps = new_deps;
110
111 /* Zero out the new data */
112 memset(list->deps + list->dep_words, 0,
113 (new_length - list->dep_words) * sizeof(BITSET_WORD));
114 list->dep_words = new_length;
115
116 return VK_SUCCESS;
117 }
118
119 VkResult
120 anv_reloc_list_add_bo_impl(struct anv_reloc_list *list,
121 struct anv_bo *target_bo)
122 {
123 /* This can happen with sparse resources. */
124 if (!target_bo)
125 return VK_SUCCESS;
126
127 uint32_t idx = target_bo->gem_handle;
128 VkResult result = anv_reloc_list_grow_deps(list,
129 (idx / BITSET_WORDBITS) + 1);
130 if (unlikely(result != VK_SUCCESS))
131 return result;
132
133 BITSET_SET(list->deps, idx);
134
135 return VK_SUCCESS;
136 }
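
/* Illustrative note (not part of the original source): the dependency set is
 * a plain bitset indexed by GEM handle.  Assuming Mesa's usual 32-bit
 * BITSET_WORDs, adding a BO whose gem_handle is 70 makes sure the list has at
 * least 3 words (the growth helper actually rounds up to a minimum of 32) and
 * sets bit 6 of deps[2]:
 *
 *    idx  = 70;
 *    word = idx / BITSET_WORDBITS;   // 2
 *    bit  = idx % BITSET_WORDBITS;   // 6
 *    list->deps[word] |= 1u << bit;  // roughly what BITSET_SET() expands to
 */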
137
138 static void
139 anv_reloc_list_clear(struct anv_reloc_list *list)
140 {
141 if (list->dep_words > 0)
142 memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
143 }
144
145 VkResult
146 anv_reloc_list_append(struct anv_reloc_list *list,
147 struct anv_reloc_list *other)
148 {
149 anv_reloc_list_grow_deps(list, other->dep_words);
150 for (uint32_t w = 0; w < other->dep_words; w++)
151 list->deps[w] |= other->deps[w];
152
153 return VK_SUCCESS;
154 }
155
156 /*-----------------------------------------------------------------------*
157 * Functions related to anv_batch
158 *-----------------------------------------------------------------------*/
159
160 static VkResult
161 anv_extend_batch(struct anv_batch *batch, uint32_t size)
162 {
163 assert(batch->extend_cb != NULL);
164 VkResult result = batch->extend_cb(batch, size, batch->user_data);
165 if (result != VK_SUCCESS)
166 return anv_batch_set_error(batch, result);
167 return result;
168 }
169
170 void *
171 anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
172 {
173 uint32_t size = num_dwords * 4;
174 if (batch->next + size > batch->end) {
175 if (anv_extend_batch(batch, size) != VK_SUCCESS)
176 return NULL;
177 }
178
179 void *p = batch->next;
180
181 batch->next += num_dwords * 4;
182 assert(batch->next <= batch->end);
183
184 return p;
185 }
186
187 /* Ensure enough contiguous space is available */
188 VkResult
189 anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
190 {
191 if (batch->next + size > batch->end) {
192 VkResult result = anv_extend_batch(batch, size);
193 if (result != VK_SUCCESS)
194 return result;
195 }
196
197 assert(batch->next + size <= batch->end);
198
199 return VK_SUCCESS;
200 }
201
202 void
203 anv_batch_advance(struct anv_batch *batch, uint32_t size)
204 {
205 assert(batch->next + size <= batch->end);
206
207 batch->next += size;
208 }
209
210 struct anv_address
211 anv_batch_address(struct anv_batch *batch, void *batch_location)
212 {
213 assert(batch->start <= batch_location);
214
215 /* Allow a jump at the current location of the batch. */
216 assert(batch->next >= batch_location);
217
218 return anv_address_add(batch->start_addr, batch_location - batch->start);
219 }
220
221 void
222 anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
223 {
224 uint32_t size = other->next - other->start;
225 assert(size % 4 == 0);
226
227 if (batch->next + size > batch->end) {
228 if (anv_extend_batch(batch, size) != VK_SUCCESS)
229 return;
230 }
231
232 assert(batch->next + size <= batch->end);
233
234 VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
235 memcpy(batch->next, other->start, size);
236
237 VkResult result = anv_reloc_list_append(batch->relocs, other->relocs);
238 if (result != VK_SUCCESS) {
239 anv_batch_set_error(batch, result);
240 return;
241 }
242
243 batch->next += size;
244 }
245
246 /*-----------------------------------------------------------------------*
247 * Functions related to anv_batch_bo
248 *-----------------------------------------------------------------------*/
249
250 static VkResult
251 anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
252 uint32_t size,
253 struct anv_batch_bo **bbo_out)
254 {
255 VkResult result;
256
257 struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
258 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
259 if (bbo == NULL)
260 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
261
262 result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
263 size, &bbo->bo);
264 if (result != VK_SUCCESS)
265 goto fail_alloc;
266
267 const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
268 result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc, uses_relocs);
269 if (result != VK_SUCCESS)
270 goto fail_bo_alloc;
271
272 *bbo_out = bbo;
273
274 return VK_SUCCESS;
275
276 fail_bo_alloc:
277 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
278 fail_alloc:
279 vk_free(&cmd_buffer->vk.pool->alloc, bbo);
280
281 return result;
282 }
283
284 static VkResult
285 anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
286 const struct anv_batch_bo *other_bbo,
287 struct anv_batch_bo **bbo_out)
288 {
289 VkResult result;
290
291 struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
292 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
293 if (bbo == NULL)
294 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
295
296 result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
297 other_bbo->bo->size, &bbo->bo);
298 if (result != VK_SUCCESS)
299 goto fail_alloc;
300
301 result = anv_reloc_list_init_clone(&bbo->relocs, &other_bbo->relocs);
302 if (result != VK_SUCCESS)
303 goto fail_bo_alloc;
304
305 bbo->length = other_bbo->length;
306 memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length);
307 *bbo_out = bbo;
308
309 return VK_SUCCESS;
310
311 fail_bo_alloc:
312 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
313 fail_alloc:
314 vk_free(&cmd_buffer->vk.pool->alloc, bbo);
315
316 return result;
317 }
318
319 static void
320 anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
321 size_t batch_padding)
322 {
323 anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, },
324 bbo->bo->map, bbo->bo->size - batch_padding);
325 batch->relocs = &bbo->relocs;
326 anv_reloc_list_clear(&bbo->relocs);
327 }
328
329 static void
330 anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
331 size_t batch_padding)
332 {
333 batch->start_addr = (struct anv_address) { .bo = bbo->bo, };
334 batch->start = bbo->bo->map;
335 batch->next = bbo->bo->map + bbo->length;
336 batch->end = bbo->bo->map + bbo->bo->size - batch_padding;
337 batch->relocs = &bbo->relocs;
338 }
339
340 static void
341 anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
342 {
343 assert(batch->start == bbo->bo->map);
344 bbo->length = batch->next - batch->start;
345 VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
346 }
347
348 static void
349 anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
350 struct anv_batch_bo *prev_bbo,
351 struct anv_batch_bo *next_bbo,
352 uint32_t next_bbo_offset)
353 {
354 const uint32_t bb_start_offset =
355 prev_bbo->length - GFX9_MI_BATCH_BUFFER_START_length * 4;
356 ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;
357
358 /* Make sure we're looking at a MI_BATCH_BUFFER_START */
359 assert(((*bb_start >> 29) & 0x07) == 0);
360 assert(((*bb_start >> 23) & 0x3f) == 49);
361
362 uint64_t *map = prev_bbo->bo->map + bb_start_offset + 4;
363 *map = intel_canonical_address(next_bbo->bo->offset + next_bbo_offset);
364
365 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
366 if (cmd_buffer->device->physical->memory.need_flush &&
367 anv_bo_needs_host_cache_flush(prev_bbo->bo->alloc_flags))
368 intel_flush_range(map, sizeof(uint64_t));
369 #endif
370 }
371
372 static void
373 anv_batch_bo_destroy(struct anv_batch_bo *bbo,
374 struct anv_cmd_buffer *cmd_buffer)
375 {
376 anv_reloc_list_finish(&bbo->relocs);
377 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
378 vk_free(&cmd_buffer->vk.pool->alloc, bbo);
379 }
380
381 static VkResult
382 anv_batch_bo_list_clone(const struct list_head *list,
383 struct anv_cmd_buffer *cmd_buffer,
384 struct list_head *new_list)
385 {
386 VkResult result = VK_SUCCESS;
387
388 list_inithead(new_list);
389
390 struct anv_batch_bo *prev_bbo = NULL;
391 list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
392 struct anv_batch_bo *new_bbo = NULL;
393 result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
394 if (result != VK_SUCCESS)
395 break;
396 list_addtail(&new_bbo->link, new_list);
397
398 if (prev_bbo)
399 anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0);
400
401 prev_bbo = new_bbo;
402 }
403
404 if (result != VK_SUCCESS) {
405 list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) {
406 list_del(&bbo->link);
407 anv_batch_bo_destroy(bbo, cmd_buffer);
408 }
409 }
410
411 return result;
412 }
413
414 /*-----------------------------------------------------------------------*
415 * Functions related to anv_cmd_buffer
416 *-----------------------------------------------------------------------*/
417
418 static struct anv_batch_bo *
419 anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
420 {
421 return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link);
422 }
423
424 static struct anv_batch_bo *
425 anv_cmd_buffer_current_generation_batch_bo(struct anv_cmd_buffer *cmd_buffer)
426 {
427 return list_entry(cmd_buffer->generation.batch_bos.prev, struct anv_batch_bo, link);
428 }
429
430 struct anv_address
431 anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
432 {
433 /* Only graphics & compute queues need binding tables. */
434 if (!(cmd_buffer->queue_family->queueFlags & (VK_QUEUE_GRAPHICS_BIT |
435 VK_QUEUE_COMPUTE_BIT)))
436 return ANV_NULL_ADDRESS;
437
438 /* If we've never allocated a binding table block, do it now. Allocating
439 * one later would trigger another STATE_BASE_ADDRESS emission, which would
440 * require an additional round of flushes/stalls.
441 */
442 if (u_vector_length(&cmd_buffer->bt_block_states) == 0) {
443 VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
444 if (result != VK_SUCCESS) {
445 anv_batch_set_error(&cmd_buffer->batch, result);
446 return ANV_NULL_ADDRESS;
447 }
448 }
449
450 struct anv_state_pool *pool = &cmd_buffer->device->binding_table_pool;
451 struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
452 return (struct anv_address) {
453 .bo = pool->block_pool.bo,
454 .offset = bt_block->offset - pool->start_offset,
455 };
456 }
457
458 static void
459 emit_batch_buffer_start(struct anv_batch *batch,
460 struct anv_bo *bo, uint32_t offset)
461 {
462 anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
463 bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
464 GFX9_MI_BATCH_BUFFER_START_length_bias;
465 bbs.SecondLevelBatchBuffer = Firstlevelbatch;
466 bbs.AddressSpaceIndicator = ASI_PPGTT;
467 bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };
468 }
469 }
470
471 enum anv_cmd_buffer_batch {
472 ANV_CMD_BUFFER_BATCH_MAIN,
473 ANV_CMD_BUFFER_BATCH_GENERATION,
474 };
475
476 static void
477 cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
478 struct anv_batch_bo *bbo,
479 enum anv_cmd_buffer_batch batch_type)
480 {
481 struct anv_batch *batch =
482 batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
483 &cmd_buffer->generation.batch : &cmd_buffer->batch;
484 struct anv_batch_bo *current_bbo =
485 batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
486 anv_cmd_buffer_current_generation_batch_bo(cmd_buffer) :
487 anv_cmd_buffer_current_batch_bo(cmd_buffer);
488
489 /* We set the end of the batch a little short so we would be sure we
490 * have room for the chaining command. Since we're about to emit the
491 * chaining command, let's set it back where it should go.
492 */
493 batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;
494 assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);
495
496 emit_batch_buffer_start(batch, bbo->bo, 0);
497
498 anv_batch_bo_finish(current_bbo, batch);
499
500 /* Add the current amount of data written in the current_bbo to the command
501 * buffer.
502 */
503 cmd_buffer->total_batch_size += current_bbo->length;
504 }
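
/* Sketch of the padding bookkeeping above (illustrative only): while a
 * batch_bo is being recorded, anv_batch_bo_start() keeps
 *
 *    batch->end == bo->map + bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length
 *
 * so there is always room left for a chaining command.  Right before the
 * chain is emitted, the end pointer is moved back to bo->map + bo->size and
 * the reserved space is consumed by the MI_BATCH_BUFFER_START itself.
 */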
505
506 static void
507 anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
508 struct anv_cmd_buffer *cmd_buffer_to)
509 {
510 uint32_t *bb_start = cmd_buffer_from->batch_end;
511
512 struct anv_batch_bo *last_bbo =
513 list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
514 struct anv_batch_bo *first_bbo =
515 list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
516
517 struct GFX9_MI_BATCH_BUFFER_START gen_bb_start = {
518 __anv_cmd_header(GFX9_MI_BATCH_BUFFER_START),
519 .SecondLevelBatchBuffer = Firstlevelbatch,
520 .AddressSpaceIndicator = ASI_PPGTT,
521 .BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
522 };
523 struct anv_batch local_batch = {
524 .start = last_bbo->bo->map,
525 .end = last_bbo->bo->map + last_bbo->bo->size,
526 .relocs = &last_bbo->relocs,
527 .alloc = &cmd_buffer_from->vk.pool->alloc,
528 };
529
530 __anv_cmd_pack(GFX9_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
531
532 last_bbo->chained = true;
533 }
534
535 static void
536 anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
537 {
538 struct anv_batch_bo *last_bbo =
539 list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
540 last_bbo->chained = false;
541
542 uint32_t *batch = cmd_buffer->batch_end;
543 anv_pack_struct(batch, GFX9_MI_BATCH_BUFFER_END,
544 __anv_cmd_header(GFX9_MI_BATCH_BUFFER_END));
545 }
546
547 static VkResult
548 anv_cmd_buffer_chain_batch(struct anv_batch *batch, uint32_t size, void *_data)
549 {
550 /* The caller should not need that much space. Otherwise it should split
551 * its commands.
552 */
553 assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
554
555 struct anv_cmd_buffer *cmd_buffer = _data;
556 struct anv_batch_bo *new_bbo = NULL;
557 /* Amount of reserved space at the end of the batch to account for the
558 * chaining instruction.
559 */
560 const uint32_t batch_padding = GFX9_MI_BATCH_BUFFER_START_length * 4;
561 /* Cap each new allocation to the maximum batch chunk size. */
562 uint32_t alloc_size = MIN2(
563 MAX2(batch->allocated_batch_size, size + batch_padding),
564 ANV_MAX_CMD_BUFFER_BATCH_SIZE);
565
566 VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
567 if (result != VK_SUCCESS)
568 return result;
569
570 batch->allocated_batch_size += alloc_size;
571
572 struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
573 if (seen_bbo == NULL) {
574 anv_batch_bo_destroy(new_bbo, cmd_buffer);
575 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
576 }
577 *seen_bbo = new_bbo;
578
579 cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo, ANV_CMD_BUFFER_BATCH_MAIN);
580
581 list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
582
583 anv_batch_bo_start(new_bbo, batch, batch_padding);
584
585 return VK_SUCCESS;
586 }
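
/* Illustrative note on the sizing above (not from the original source): each
 * new batch_bo is at least as large as everything allocated so far, so the
 * total batch capacity roughly doubles on every chain until it hits the
 * ANV_MAX_CMD_BUFFER_BATCH_SIZE cap.  Starting from
 * ANV_MIN_CMD_BUFFER_BATCH_SIZE, the per-BO sizes grow roughly as
 *
 *    min, min, 2*min, 4*min, ...  (capped at ANV_MAX_CMD_BUFFER_BATCH_SIZE)
 */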
587
588 static VkResult
589 anv_cmd_buffer_chain_generation_batch(struct anv_batch *batch, uint32_t size, void *_data)
590 {
591 /* The caller should not need that much space. Otherwise it should split
592 * its commands.
593 */
594 assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
595
596 struct anv_cmd_buffer *cmd_buffer = _data;
597 struct anv_batch_bo *new_bbo = NULL;
598 /* Cap each new allocation to the maximum batch chunk size. */
599 uint32_t alloc_size = MIN2(
600 MAX2(batch->allocated_batch_size, size),
601 ANV_MAX_CMD_BUFFER_BATCH_SIZE);
602
603 VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
604 if (result != VK_SUCCESS)
605 return result;
606
607 batch->allocated_batch_size += alloc_size;
608
609 struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
610 if (seen_bbo == NULL) {
611 anv_batch_bo_destroy(new_bbo, cmd_buffer);
612 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
613 }
614 *seen_bbo = new_bbo;
615
616 if (!list_is_empty(&cmd_buffer->generation.batch_bos)) {
617 cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo,
618 ANV_CMD_BUFFER_BATCH_GENERATION);
619 }
620
621 list_addtail(&new_bbo->link, &cmd_buffer->generation.batch_bos);
622
623 anv_batch_bo_start(new_bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
624
625 return VK_SUCCESS;
626 }
627
628 /** Allocate a binding table
629 *
630 * This function allocates a binding table. This is a bit more complicated
631 * than one would think due to a combination of Vulkan driver design and some
632 * unfortunate hardware restrictions.
633 *
634 * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for
635 * the binding table pointer which means that all binding tables need to live
636 * in the bottom 64k of surface state base address. The way the GL driver has
637 * classically dealt with this restriction is to emit all surface states
638 * on-the-fly into the batch and have a batch buffer smaller than 64k. This
639 * isn't really an option in Vulkan for a couple of reasons:
640 *
641 * 1) In Vulkan, we have growing (or chaining) batches so surface states have
642 * to live in their own buffer and we have to be able to re-emit
643 * STATE_BASE_ADDRESS as needed which requires a full pipeline stall. In
644 * order to avoid emitting STATE_BASE_ADDRESS any more often than needed
645 * (it's not that hard to hit 64k of just binding tables), we allocate
646 * surface state objects up-front when VkImageView is created. In order
647 * for this to work, surface state objects need to be allocated from a
648 * global buffer.
649 *
650 * 2) We tried to design the surface state system in such a way that it's
651 * already ready for bindless texturing. The way bindless texturing works
652 * on our hardware is that you have a big pool of surface state objects
653 * (with its own state base address) and the bindless handles are simply
654 * offsets into that pool. With the architecture we chose, we already
655 * have that pool and it's exactly the same pool that we use for regular
656 * surface states so we should already be ready for bindless.
657 *
658 * 3) For render targets, we need to be able to fill out the surface states
659 * later in vkBeginRenderPass so that we can assign clear colors
660 * correctly. One way to do this would be to just create the surface
661 * state data and then repeatedly copy it into the surface state BO every
662 * time we have to re-emit STATE_BASE_ADDRESS. While this works, it's
663 * rather annoying; it's much simpler to just allocate them up-front and
664 * re-use them for the entire render pass.
665 *
666 * While none of these are technically blockers for emitting state on the fly
667 * like we do in GL, the ability to have a single surface state pool
668 * simplifies things greatly. Unfortunately, it comes at a cost...
669 *
670 * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
671 * place the binding tables just anywhere in surface state base address.
672 * Because 64k isn't a whole lot of space, we can't simply restrict the
673 * surface state buffer to 64k; we have to be more clever. The solution we've
674 * chosen is to have a block pool with a maximum size of 2G that starts at
675 * zero and grows in both directions. All surface states are allocated from
676 * the top of the pool (positive offsets) and we allocate blocks (< 64k) of
677 * binding tables from the bottom of the pool (negative offsets). Every time
678 * we allocate a new binding table block, we set surface state base address to
679 * point to the bottom of the binding table block. This way all of the
680 * binding tables in the block are in the bottom 64k of surface state base
681 * address. When we fill out the binding table, we add the distance between
682 * the bottom of our binding table block and zero of the block pool to the
683 * surface state offsets so that they are correct relative to our new surface
684 * state base address at the bottom of the binding table block.
685 *
686 * \param[in] entries The number of surface state entries the binding
687 * table should be able to hold.
688 *
689 * \param[out] state_offset The offset from surface state base address
690 * where the surface states live. This must be
691 * added to the surface state offset when it is
692 * written into the binding table entry.
693 *
694 * \return An anv_state representing the binding table
695 */
696 struct anv_state
697 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
698 uint32_t entries, uint32_t *state_offset)
699 {
700 if (u_vector_length(&cmd_buffer->bt_block_states) == 0)
701 return (struct anv_state) { 0 };
702
703 struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
704
705 uint32_t bt_size = align(entries * 4, 32);
706
707 struct anv_state state = cmd_buffer->bt_next;
708 if (bt_size > state.alloc_size)
709 return (struct anv_state) { 0 };
710
711 state.alloc_size = bt_size;
712 cmd_buffer->bt_next.offset += bt_size;
713 cmd_buffer->bt_next.map += bt_size;
714 cmd_buffer->bt_next.alloc_size -= bt_size;
715
716 if (cmd_buffer->device->info->verx10 >= 125) {
717 /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding
718 * table address independently from surface state base address. We no
719 * longer need any sort of offsetting.
720 */
721 *state_offset = 0;
722 } else {
723 assert(bt_block->offset < 0);
724 *state_offset = -bt_block->offset;
725 }
726
727 return state;
728 }
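
/* Worked example of the offsetting above (illustrative, pre-verx10-125 path):
 * suppose the current binding table block was allocated at
 * bt_block->offset == -65536 in the block pool.  Surface state base address
 * then points 64k below the pool's zero point and *state_offset comes back as
 * 65536.  A surface state that lives at pool offset 8192 is therefore written
 * into a binding table entry as 8192 + 65536 == 73728, which is its offset
 * from the new surface state base address.
 */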
729
730 struct anv_state
731 anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
732 uint32_t count)
733 {
734 if (count == 0)
735 return ANV_STATE_NULL;
736 struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
737 struct anv_state state =
738 anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
739 count * isl_dev->ss.size,
740 isl_dev->ss.align);
741 if (state.map == NULL)
742 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
743 return state;
744 }
745
746 struct anv_state
747 anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
748 uint32_t size, uint32_t alignment)
749 {
750 if (size == 0)
751 return ANV_STATE_NULL;
752 struct anv_state state =
753 anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
754 size, alignment);
755 if (state.map == NULL)
756 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
757 return state;
758 }
759
760 struct anv_state
761 anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
762 uint32_t size, uint32_t alignment)
763 {
764 if (size == 0)
765 return ANV_STATE_NULL;
766 struct anv_state state =
767 anv_state_stream_alloc(&cmd_buffer->general_state_stream,
768 size, alignment);
769 if (state.map == NULL)
770 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
771 return state;
772 }
773
774 /** Allocate space associated with a command buffer
775 *
776 * Some commands like vkCmdBuildAccelerationStructuresKHR() can end up needing
777 * large amounts of temporary buffer space. This function is here to deal with
778 * those potentially larger allocations, using a side BO if needed.
779 *
780 */
781 struct anv_cmd_alloc
782 anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
783 size_t size, uint32_t alignment,
784 bool mapped)
785 {
786 /* Below 16k, source the memory from the dynamic state stream; otherwise allocate a dedicated BO. */
787 if (size < 16 * 1024) {
788 struct anv_state state =
789 anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
790 size, alignment);
791 if (state.map == NULL) {
792 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
793 return (struct anv_cmd_alloc) {
794 .address = ANV_NULL_ADDRESS,
795 };
796 }
797
798 return (struct anv_cmd_alloc) {
799 .address = anv_state_pool_state_address(
800 &cmd_buffer->device->dynamic_state_pool,
801 state),
802 .map = state.map,
803 .size = size,
804 };
805 }
806
807 assert(alignment <= 4096);
808
809 struct anv_bo *bo = NULL;
810 VkResult result =
811 anv_bo_pool_alloc(mapped ?
812 &cmd_buffer->device->batch_bo_pool :
813 &cmd_buffer->device->bvh_bo_pool,
814 align(size, 4096), &bo);
815 if (result != VK_SUCCESS) {
816 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
817 return ANV_EMPTY_ALLOC;
818 }
819
820 struct anv_bo **bo_entry =
821 u_vector_add(&cmd_buffer->dynamic_bos);
822 if (bo_entry == NULL) {
823 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
824 anv_bo_pool_free(bo->map != NULL ?
825 &cmd_buffer->device->batch_bo_pool :
826 &cmd_buffer->device->bvh_bo_pool, bo);
827 return ANV_EMPTY_ALLOC;
828 }
829 *bo_entry = bo;
830
831 return (struct anv_cmd_alloc) {
832 .address = (struct anv_address) { .bo = bo },
833 .map = bo->map,
834 .size = size,
835 };
836 }
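
/* Hypothetical usage sketch (the caller, scratch_size and alignment below are
 * made up for illustration, not taken from this file):
 *
 *    struct anv_cmd_alloc scratch =
 *       anv_cmd_buffer_alloc_space(cmd_buffer, scratch_size, 64, true);
 *    if (anv_address_is_null(scratch.address))
 *       return;   // the error has already been recorded on the batch
 *
 * Requests under 16k are carved out of the dynamic state stream; anything
 * larger gets its own BO from batch_bo_pool (mapped == true) or bvh_bo_pool
 * (mapped == false), which is then tracked in cmd_buffer->dynamic_bos.
 */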
837
838 VkResult
839 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
840 {
841 struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
842 if (bt_block == NULL) {
843 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
844 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
845 }
846
847 *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
848
849 /* The bt_next state is a rolling state (we update it as we suballocate
850 * from it) which is relative to the start of the binding table block.
851 */
852 cmd_buffer->bt_next = *bt_block;
853 cmd_buffer->bt_next.offset = 0;
854
855 return VK_SUCCESS;
856 }
857
858 VkResult
859 anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
860 {
861 struct anv_batch_bo *batch_bo = NULL;
862 VkResult result;
863
864 list_inithead(&cmd_buffer->batch_bos);
865
866 cmd_buffer->total_batch_size = 0;
867
868 result = anv_batch_bo_create(cmd_buffer,
869 ANV_MIN_CMD_BUFFER_BATCH_SIZE,
870 &batch_bo);
871 if (result != VK_SUCCESS)
872 return result;
873
874 list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
875
876 cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc;
877 cmd_buffer->batch.user_data = cmd_buffer;
878 cmd_buffer->batch.allocated_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
879
880 cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
881 cmd_buffer->batch.engine_class = cmd_buffer->queue_family->engine_class;
882 cmd_buffer->batch.trace = &cmd_buffer->trace;
883
884 anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
885 GFX9_MI_BATCH_BUFFER_START_length * 4);
886
887 /* Generation batch is initialized empty since it's possible it won't be
888 * used.
889 */
890 list_inithead(&cmd_buffer->generation.batch_bos);
891
892 cmd_buffer->generation.batch.alloc = &cmd_buffer->vk.pool->alloc;
893 cmd_buffer->generation.batch.user_data = cmd_buffer;
894 cmd_buffer->generation.batch.allocated_batch_size = 0;
895 cmd_buffer->generation.batch.extend_cb = anv_cmd_buffer_chain_generation_batch;
896 cmd_buffer->generation.batch.engine_class =
897 cmd_buffer->queue_family->engine_class;
898
899 int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8,
900 sizeof(struct anv_bo *));
901 if (!success)
902 goto fail_batch_bo;
903
904 *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
905
906 success = u_vector_init(&cmd_buffer->bt_block_states, 8,
907 sizeof(struct anv_state));
908 if (!success)
909 goto fail_seen_bbos;
910
911 const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
912 result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
913 &cmd_buffer->vk.pool->alloc, uses_relocs);
914 if (result != VK_SUCCESS)
915 goto fail_bt_blocks;
916
917 return VK_SUCCESS;
918
919 fail_bt_blocks:
920 u_vector_finish(&cmd_buffer->bt_block_states);
921 fail_seen_bbos:
922 u_vector_finish(&cmd_buffer->seen_bbos);
923 fail_batch_bo:
924 anv_batch_bo_destroy(batch_bo, cmd_buffer);
925
926 return result;
927 }
928
929 void
930 anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
931 {
932 struct anv_state *bt_block;
933 u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
934 anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
935 u_vector_finish(&cmd_buffer->bt_block_states);
936
937 anv_reloc_list_finish(&cmd_buffer->surface_relocs);
938
939 u_vector_finish(&cmd_buffer->seen_bbos);
940
941 /* Destroy all of the batch buffers */
942 list_for_each_entry_safe(struct anv_batch_bo, bbo,
943 &cmd_buffer->batch_bos, link) {
944 list_del(&bbo->link);
945 anv_batch_bo_destroy(bbo, cmd_buffer);
946 }
947 /* Also destroy all generation batch buffers */
948 list_for_each_entry_safe(struct anv_batch_bo, bbo,
949 &cmd_buffer->generation.batch_bos, link) {
950 list_del(&bbo->link);
951 anv_batch_bo_destroy(bbo, cmd_buffer);
952 }
953
954 if (cmd_buffer->generation.ring_bo) {
955 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
956 cmd_buffer->generation.ring_bo);
957 }
958 }
959
960 void
961 anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
962 {
963 /* Delete all but the first batch bo */
964 assert(!list_is_empty(&cmd_buffer->batch_bos));
965 while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
966 struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
967 list_del(&bbo->link);
968 anv_batch_bo_destroy(bbo, cmd_buffer);
969 }
970 assert(!list_is_empty(&cmd_buffer->batch_bos));
971
972 anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
973 &cmd_buffer->batch,
974 GFX9_MI_BATCH_BUFFER_START_length * 4);
975
976 while (u_vector_length(&cmd_buffer->bt_block_states) > 0) {
977 struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
978 anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
979 }
980 cmd_buffer->bt_next = ANV_STATE_NULL;
981
982 anv_reloc_list_clear(&cmd_buffer->surface_relocs);
983
984 /* Reset the list of seen buffers */
985 cmd_buffer->seen_bbos.head = 0;
986 cmd_buffer->seen_bbos.tail = 0;
987
988 struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
989
990 *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;
991
992 assert(first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
993 cmd_buffer->batch.allocated_batch_size = first_bbo->bo->size;
994
995 /* Delete all generation batch bos */
996 list_for_each_entry_safe(struct anv_batch_bo, bbo,
997 &cmd_buffer->generation.batch_bos, link) {
998 list_del(&bbo->link);
999 anv_batch_bo_destroy(bbo, cmd_buffer);
1000 }
1001
1002 /* And reset generation batch */
1003 cmd_buffer->generation.batch.allocated_batch_size = 0;
1004 cmd_buffer->generation.batch.start = NULL;
1005 cmd_buffer->generation.batch.end = NULL;
1006 cmd_buffer->generation.batch.next = NULL;
1007
1008 if (cmd_buffer->generation.ring_bo) {
1009 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
1010 cmd_buffer->generation.ring_bo);
1011 cmd_buffer->generation.ring_bo = NULL;
1012 }
1013
1014 cmd_buffer->total_batch_size = 0;
1015 }
1016
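/* Finish recording the current batch.
 *
 * For secondary command buffers this is also where the vkCmdExecuteCommands
 * execution mode is picked.  Summary of the logic below, added as a reading
 * aid:
 *
 *    - CALL_AND_RETURN  if the physical device supports calling secondaries
 *                       (use_call_secondary),
 *    - EMIT             if the secondary is a single, small batch_bo that is
 *                       cheap to copy straight into the primary,
 *    - CHAIN            if the secondary is not SIMULTANEOUS_USE and can be
 *                       jumped into and out of directly,
 *    - COPY_AND_CHAIN   otherwise (the batch_bo list gets cloned at execute
 *                       time).
 */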
1017 void
1018 anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
1019 {
1020 const struct intel_device_info *devinfo = cmd_buffer->device->info;
1021 struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1022
1023 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1024 /* When we start a batch buffer, we subtract a certain amount of
1025 * padding from the end to ensure that we always have room to emit a
1026 * BATCH_BUFFER_START to chain to the next BO. We need to remove
1027 * that padding before we end the batch; otherwise, we may end up
1028 * with our BATCH_BUFFER_END in another BO.
1029 */
1030 cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1031 assert(cmd_buffer->batch.start == batch_bo->bo->map);
1032 assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1033
1034 /* Save end instruction location to override it later. */
1035 cmd_buffer->batch_end = cmd_buffer->batch.next;
1036
1037 /* If we can chain this command buffer to another one, leave some room
1038 * for the jump instruction.
1039 */
1040 batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);
1041 if (batch_bo->chained)
1042 emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1043 else
1044 anv_batch_emit(&cmd_buffer->batch, GFX9_MI_BATCH_BUFFER_END, bbe);
1045
1046 /* Round batch up to an even number of dwords. */
1047 if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
1048 anv_batch_emit(&cmd_buffer->batch, GFX9_MI_NOOP, noop);
1049
1050 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
1051 } else {
1052 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1053 /* If this is a secondary command buffer, we need to determine the
1054 * mode in which it will be executed with vkCmdExecuteCommands. We
1055 * determine this statically here so that this stays in sync with the
1056 * actual ExecuteCommands implementation.
1057 */
1058 const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
1059 if (cmd_buffer->device->physical->use_call_secondary) {
1060 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;
1061
1062 void *jump_addr =
1063 anv_genX(devinfo, batch_emit_return)(&cmd_buffer->batch) +
1064 (GFX9_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
1065 cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
1066
1067 /* The emit above may have caused us to chain batch buffers which
1068 * would mean that batch_bo is no longer valid.
1069 */
1070 batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1071 } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
1072 (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {
1073 /* If the secondary has exactly one batch buffer in its list *and*
1074 * that batch buffer is less than half of the minimum batch size, we're
1075 * probably better off simply copying it into our batch.
1076 */
1077 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
1078 } else if (!(cmd_buffer->usage_flags &
1079 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
1080 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
1081
1082 /* In order to chain, we need this command buffer to contain an
1083 * MI_BATCH_BUFFER_START which will jump back to the calling batch.
1084 * It doesn't matter where it points now so long as it has a valid
1085 * relocation. We'll adjust it later as part of the chaining
1086 * process.
1087 *
1088 * We set the end of the batch a little short so we would be sure we
1089 * have room for the chaining command. Since we're about to emit the
1090 * chaining command, let's set it back where it should go.
1091 */
1092 cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1093 assert(cmd_buffer->batch.start == batch_bo->bo->map);
1094 assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1095
1096 emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1097 assert(cmd_buffer->batch.start == batch_bo->bo->map);
1098 } else {
1099 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
1100 }
1101 }
1102
1103 anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
1104
1105 /* Add the current amount of data written in the current_bbo to the command
1106 * buffer.
1107 */
1108 cmd_buffer->total_batch_size += batch_bo->length;
1109 }
1110
1111 static VkResult
1112 anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
1113 struct list_head *list)
1114 {
1115 list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
1116 struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
1117 if (bbo_ptr == NULL)
1118 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
1119
1120 *bbo_ptr = bbo;
1121 }
1122
1123 return VK_SUCCESS;
1124 }
1125
1126 void
1127 anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
1128 struct anv_cmd_buffer *secondary)
1129 {
1130 anv_measure_add_secondary(primary, secondary);
1131 switch (secondary->exec_mode) {
1132 case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
1133 anv_batch_emit_batch(&primary->batch, &secondary->batch);
1134 break;
1135 case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
1136 struct anv_batch_bo *first_bbo =
1137 list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1138 struct anv_batch_bo *last_bbo =
1139 list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1140
1141 emit_batch_buffer_start(&primary->batch, first_bbo->bo, 0);
1142
1143 struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
1144 assert(primary->batch.start == this_bbo->bo->map);
1145 uint32_t offset = primary->batch.next - primary->batch.start;
1146
1147 /* Make the tail of the secondary point back to right after the
1148 * MI_BATCH_BUFFER_START in the primary batch.
1149 */
1150 anv_batch_bo_link(primary, last_bbo, this_bbo, offset);
1151
1152 anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1153 break;
1154 }
1155 case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
1156 struct list_head copy_list;
1157 VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
1158 secondary,
1159 &copy_list);
1160 if (result != VK_SUCCESS)
1161 return; /* FIXME */
1162
1163 anv_cmd_buffer_add_seen_bbos(primary, &copy_list);
1164
1165 struct anv_batch_bo *first_bbo =
1166 list_first_entry(&copy_list, struct anv_batch_bo, link);
1167 struct anv_batch_bo *last_bbo =
1168 list_last_entry(&copy_list, struct anv_batch_bo, link);
1169
1170 cmd_buffer_chain_to_batch_bo(primary, first_bbo,
1171 ANV_CMD_BUFFER_BATCH_MAIN);
1172
1173 list_splicetail(&copy_list, &primary->batch_bos);
1174
1175 anv_batch_bo_continue(last_bbo, &primary->batch,
1176 GFX9_MI_BATCH_BUFFER_START_length * 4);
1177 break;
1178 }
1179 case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {
1180 struct anv_batch_bo *first_bbo =
1181 list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1182
1183 anv_genX(primary->device->info, batch_emit_secondary_call)(
1184 &primary->batch, primary->device,
1185 (struct anv_address) { .bo = first_bbo->bo },
1186 secondary->return_addr);
1187
1188 anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1189 break;
1190 }
1191 default:
1192 assert(!"Invalid execution mode");
1193 }
1194
1195 anv_reloc_list_append(&primary->surface_relocs, &secondary->surface_relocs);
1196
1197 /* Add the amount of data written in the secondary buffer to the primary
1198 * command buffer.
1199 */
1200 primary->total_batch_size += secondary->total_batch_size;
1201 }
1202
1203 void
1204 anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
1205 uint32_t num_cmd_buffers)
1206 {
1207 if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
1208 assert(num_cmd_buffers == 1);
1209 return;
1210 }
1211
1212 /* Chain the N-1 first batch buffers */
1213 for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) {
1214 assert(cmd_buffers[i]->companion_rcs_cmd_buffer == NULL);
1215 anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
1216 }
1217
1218 /* Put an end to the last one */
1219 anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
1220 }
1221
1222 static void
1223 anv_print_batch(struct anv_device *device,
1224 struct anv_queue *queue,
1225 struct anv_cmd_buffer *cmd_buffer)
1226 {
1227 struct anv_batch_bo *bbo =
1228 list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
1229 device->cmd_buffer_being_decoded = cmd_buffer;
1230 struct intel_batch_decode_ctx *ctx = queue->decoder;
1231
1232 if (cmd_buffer->is_companion_rcs_cmd_buffer) {
1233 int render_queue_idx =
1234 anv_get_first_render_queue_index(device->physical);
1235 ctx = &device->decoder[render_queue_idx];
1236 }
1237
1238 if (INTEL_DEBUG(DEBUG_BATCH)) {
1239 intel_print_batch(ctx, bbo->bo->map,
1240 bbo->bo->size, bbo->bo->offset, false);
1241 }
1242 if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
1243 intel_batch_stats(ctx, bbo->bo->map,
1244 bbo->bo->size, bbo->bo->offset, false);
1245 }
1246 device->cmd_buffer_being_decoded = NULL;
1247 }
1248
1249 void
1250 anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
1251 uint32_t cmd_buffer_count,
1252 struct anv_cmd_buffer **cmd_buffers,
1253 struct anv_query_pool *perf_query_pool,
1254 uint32_t perf_query_pass)
1255 {
1256 if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
1257 return;
1258
1259 struct anv_device *device = queue->device;
1260 const bool has_perf_query = perf_query_pool && cmd_buffer_count;
1261 uint64_t frame_id = device->debug_frame_desc->frame_id;
1262
1263 if (!intel_debug_batch_in_range(device->debug_frame_desc->frame_id))
1264 return;
1265 fprintf(stderr, "Batch for frame %"PRIu64" on queue %d\n",
1266 frame_id, (int)(queue - device->queues));
1267
1268 if (cmd_buffer_count) {
1269 if (has_perf_query) {
1270 struct anv_bo *pass_batch_bo = perf_query_pool->bo;
1271 uint64_t pass_batch_offset =
1272 khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
1273
1274 if (INTEL_DEBUG(DEBUG_BATCH)) {
1275 intel_print_batch(queue->decoder,
1276 pass_batch_bo->map + pass_batch_offset, 64,
1277 pass_batch_bo->offset + pass_batch_offset, false);
1278 }
1279 }
1280
1281 for (uint32_t i = 0; i < cmd_buffer_count; i++)
1282 anv_print_batch(device, queue, cmd_buffers[i]);
1283 } else if (INTEL_DEBUG(DEBUG_BATCH)) {
1284 intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
1285 device->trivial_batch_bo->size,
1286 device->trivial_batch_bo->offset, false);
1287 }
1288 }
1289
1290 /* We lock around execbuf for two main reasons:
1291 *
1292 * 1) When a block pool is resized, we create a new gem handle with a
1293 * different size and, in the case of surface states, possibly a different
1294 * center offset but we re-use the same anv_bo struct when we do so. If
1295 * this happens in the middle of setting up an execbuf, we could end up
1296 * with our list of BOs out of sync with our list of gem handles.
1297 *
1298 * 2) The algorithm we use for building the list of unique buffers isn't
1299 * thread-safe. While the client is supposed to synchronize around
1300 * QueueSubmit, this would be extremely difficult to debug if it ever came
1301 * up in the wild due to a broken app. It's better to play it safe and
1302 * just lock around QueueSubmit.
1303 *
1304 * Since the only other things that ever take the device lock, such as block
1305 * pool resizes, happen only rarely, the lock will almost never be contended,
1306 * so taking it isn't really an expensive operation in this case.
1307 */
1308 static inline VkResult
1309 anv_queue_exec_locked(struct anv_queue *queue,
1310 uint32_t wait_count,
1311 const struct vk_sync_wait *waits,
1312 uint32_t cmd_buffer_count,
1313 struct anv_cmd_buffer **cmd_buffers,
1314 uint32_t signal_count,
1315 const struct vk_sync_signal *signals,
1316 struct anv_query_pool *perf_query_pool,
1317 uint32_t perf_query_pass,
1318 struct anv_utrace_submit *utrace_submit)
1319 {
1320 struct anv_device *device = queue->device;
1321 VkResult result = VK_SUCCESS;
1322
1323 /* We only need to synchronize the main & companion command buffers if we
1324 * have a companion command buffer somewhere in the list of command
1325 * buffers.
1326 */
1327 bool needs_companion_sync = false;
1328 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
1329 if (cmd_buffers[i]->companion_rcs_cmd_buffer != NULL) {
1330 needs_companion_sync = true;
1331 break;
1332 }
1333 }
1334
1335 if (perf_query_pool && device->perf_queue != queue)
1336 debug_warn_once("Mismatch between the queue on which the OA stream was "
1337 "opened and the queue on which the query will be executed.");
1338
1339 result =
1340 device->kmd_backend->queue_exec_locked(
1341 queue,
1342 wait_count, waits,
1343 cmd_buffer_count, cmd_buffers,
1344 needs_companion_sync ? 0 : signal_count, signals,
1345 perf_query_pool,
1346 perf_query_pass,
1347 utrace_submit);
1348 if (result != VK_SUCCESS)
1349 return result;
1350
1351 if (needs_companion_sync) {
1352 struct vk_sync_wait companion_sync = {
1353 .sync = queue->companion_sync,
1354 };
1355 /* If any of the command buffers had a companion batch, the submission
1356 * backend will signal queue->companion_sync, so to ensure completion,
1357 * we just need to wait on that fence.
1358 */
1359 result =
1360 device->kmd_backend->queue_exec_locked(queue,
1361 1, &companion_sync,
1362 0, NULL,
1363 signal_count, signals,
1364 NULL, 0,
1365 NULL);
1366 }
1367
1368 return result;
1369 }
1370
1371 static inline bool
1372 can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2)
1373 {
1374 return (!p1 || !p2 || p1 == p2);
1375 }
1376
1377 static VkResult
1378 anv_queue_submit_sparse_bind_locked(struct anv_queue *queue,
1379 struct vk_queue_submit *submit)
1380 {
1381 struct anv_device *device = queue->device;
1382 VkResult result;
1383
1384 /* When fake sparse is enabled, while we do accept creating "sparse"
1385 * resources we can't really handle sparse submission. Fake sparse is
1386 * supposed to be used by applications that request sparse to be enabled
1387 * but don't actually *use* it.
1388 */
1389 if (device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) {
1390 if (INTEL_DEBUG(DEBUG_SPARSE))
1391 fprintf(stderr, "=== application submitting sparse operations: "
1392 "buffer_bind:%d image_opaque_bind:%d image_bind:%d\n",
1393 submit->buffer_bind_count, submit->image_opaque_bind_count,
1394 submit->image_bind_count);
1395 return vk_queue_set_lost(&queue->vk, "Sparse binding not supported");
1396 }
1397
1398 assert(submit->command_buffer_count == 0);
1399
1400 if (INTEL_DEBUG(DEBUG_SPARSE)) {
1401 fprintf(stderr, "[sparse submission, buffers:%u opaque_images:%u "
1402 "images:%u waits:%u signals:%u]\n",
1403 submit->buffer_bind_count,
1404 submit->image_opaque_bind_count,
1405 submit->image_bind_count,
1406 submit->wait_count, submit->signal_count);
1407 }
1408
1409 struct anv_sparse_submission sparse_submit = {
1410 .queue = queue,
1411 .binds = NULL,
1412 .binds_len = 0,
1413 .binds_capacity = 0,
1414 .wait_count = submit->wait_count,
1415 .signal_count = submit->signal_count,
1416 .waits = submit->waits,
1417 .signals = submit->signals,
1418 };
1419
1420 for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
1421 VkSparseBufferMemoryBindInfo *bind_info = &submit->buffer_binds[i];
1422 ANV_FROM_HANDLE(anv_buffer, buffer, bind_info->buffer);
1423
1424 assert(anv_buffer_is_sparse(buffer));
1425
1426 for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1427 result = anv_sparse_bind_buffer(device, buffer,
1428 &bind_info->pBinds[j],
1429 &sparse_submit);
1430 if (result != VK_SUCCESS)
1431 goto out_free_submit;
1432 }
1433 }
1434
1435 for (uint32_t i = 0; i < submit->image_bind_count; i++) {
1436 VkSparseImageMemoryBindInfo *bind_info = &submit->image_binds[i];
1437 ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1438
1439 assert(anv_image_is_sparse(image));
1440 assert(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
1441
1442 for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1443 result = anv_sparse_bind_image_memory(queue, image,
1444 &bind_info->pBinds[j],
1445 &sparse_submit);
1446 if (result != VK_SUCCESS)
1447 goto out_free_submit;
1448 }
1449 }
1450
1451 for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
1452 VkSparseImageOpaqueMemoryBindInfo *bind_info =
1453 &submit->image_opaque_binds[i];
1454 ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1455
1456 assert(anv_image_is_sparse(image));
1457
1458 for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1459 result = anv_sparse_bind_image_opaque(device, image,
1460 &bind_info->pBinds[j],
1461 &sparse_submit);
1462 if (result != VK_SUCCESS)
1463 goto out_free_submit;
1464 }
1465 }
1466
1467 result = anv_sparse_bind(device, &sparse_submit);
1468
1469 out_free_submit:
1470 vk_free(&device->vk.alloc, sparse_submit.binds);
1471 return result;
1472 }
1473
1474 static VkResult
1475 anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
1476 struct vk_queue_submit *submit,
1477 struct anv_utrace_submit *utrace_submit)
1478 {
1479 VkResult result;
1480
1481 if (submit->command_buffer_count == 0) {
1482 result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits,
1483 0 /* cmd_buffer_count */,
1484 NULL /* cmd_buffers */,
1485 submit->signal_count, submit->signals,
1486 NULL /* perf_query_pool */,
1487 0 /* perf_query_pass */,
1488 utrace_submit);
1489 if (result != VK_SUCCESS)
1490 return result;
1491 } else {
1492 /* Everything's easier if we don't have to bother with container_of() */
1493 STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0);
1494 struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers;
1495 struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
1496 uint32_t start = 0;
1497 uint32_t end = submit->command_buffer_count;
1498 struct anv_query_pool *perf_query_pool =
1499 cmd_buffers[start]->perf_query_pool;
1500 for (uint32_t n = 0; n < end; n++) {
1501 bool can_chain = false;
1502 uint32_t next = n + 1;
1503 /* Can we chain the last buffer into the next one? */
1504 if (next < end &&
1505 anv_cmd_buffer_is_chainable(cmd_buffers[n]) &&
1506 anv_cmd_buffer_is_chainable(cmd_buffers[next]) &&
1507 can_chain_query_pools
1508 (cmd_buffers[next]->perf_query_pool, perf_query_pool)) {
1509 can_chain = true;
1510 perf_query_pool =
1511 perf_query_pool ? perf_query_pool :
1512 cmd_buffers[next]->perf_query_pool;
1513 }
1514 if (!can_chain) {
1515          /* The next buffer cannot be chained, or we have reached the
1516           * last buffer; submit what has been chained so far.
1517           */
1518 VkResult result =
1519 anv_queue_exec_locked(queue,
1520 start == 0 ? submit->wait_count : 0,
1521 start == 0 ? submit->waits : NULL,
1522 next - start, &cmd_buffers[start],
1523 next == end ? submit->signal_count : 0,
1524 next == end ? submit->signals : NULL,
1525 perf_query_pool,
1526 submit->perf_pass_index,
1527 next == end ? utrace_submit : NULL);
1528 if (result != VK_SUCCESS)
1529 return result;
1530 if (next < end) {
1531 start = next;
1532 perf_query_pool = cmd_buffers[start]->perf_query_pool;
1533 }
1534 }
1535 }
1536 }
1537 for (uint32_t i = 0; i < submit->signal_count; i++) {
1538 if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync))
1539 continue;
1540
1541 struct anv_bo_sync *bo_sync =
1542 container_of(submit->signals[i].sync, struct anv_bo_sync, sync);
1543
1544 /* Once the execbuf has returned, we need to set the fence state to
1545 * SUBMITTED. We can't do this before calling execbuf because
1546 * anv_GetFenceStatus does take the global device lock before checking
1547 * fence->state.
1548 *
1549 * We set the fence state to SUBMITTED regardless of whether or not the
1550 * execbuf succeeds because we need to ensure that vkWaitForFences() and
1551 * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
1552 * VK_SUCCESS) in a finite amount of time even if execbuf fails.
1553 */
1554 assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET);
1555 bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED;
1556 }
1557
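        /* Wake up anyone blocked on the device's queue_submit condition
         * variable so they can re-check the state of the syncs updated above.
         */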
1558 pthread_cond_broadcast(&queue->device->queue_submit);
1559
1560 return VK_SUCCESS;
1561 }
1562
1563 static inline void
1564 anv_queue_free_initial_submission(struct anv_queue *queue)
1565 {
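        /* Reclaim the queue initialization batches lazily once the GPU has
         * finished executing them; this runs at the start of every queue
         * submission.
         */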
1566 if (queue->init_submit &&
1567 anv_async_submit_done(queue->init_submit)) {
1568 anv_async_submit_destroy(queue->init_submit);
1569 queue->init_submit = NULL;
1570 }
1571 if (queue->init_companion_submit &&
1572 anv_async_submit_done(queue->init_companion_submit)) {
1573 anv_async_submit_destroy(queue->init_companion_submit);
1574 queue->init_companion_submit = NULL;
1575 }
1576 }
1577
1578 VkResult
1579 anv_queue_submit(struct vk_queue *vk_queue,
1580 struct vk_queue_submit *submit)
1581 {
1582 struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk);
1583 struct anv_device *device = queue->device;
1584 VkResult result;
1585
1586 anv_queue_free_initial_submission(queue);
1587
1588 if (queue->device->info->no_hw) {
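           /* With no_hw there is nothing to execute, so just signal
            * everything to let waiters make forward progress.
            */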
1589 for (uint32_t i = 0; i < submit->signal_count; i++) {
1590 result = vk_sync_signal(&device->vk,
1591 submit->signals[i].sync,
1592 submit->signals[i].signal_value);
1593 if (result != VK_SUCCESS)
1594 return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed");
1595 }
1596 return VK_SUCCESS;
1597 }
1598
1599    /* Flush the trace points before taking the device lock, as the
1600     * flushing might itself try to take that same lock.
1601     */
1602 struct anv_utrace_submit *utrace_submit = NULL;
1603 result = anv_device_utrace_flush_cmd_buffers(
1604 queue,
1605 submit->command_buffer_count,
1606 (struct anv_cmd_buffer **)submit->command_buffers,
1607 &utrace_submit);
1608 if (result != VK_SUCCESS)
1609 return result;
1610
1611 pthread_mutex_lock(&device->mutex);
1612
1613 uint64_t start_ts = intel_ds_begin_submit(&queue->ds);
1614
1615 if (submit->buffer_bind_count ||
1616 submit->image_opaque_bind_count ||
1617 submit->image_bind_count) {
1618 result = anv_queue_submit_sparse_bind_locked(queue, submit);
1619 } else {
1620 result = anv_queue_submit_cmd_buffers_locked(queue, submit,
1621 utrace_submit);
1622 }
1623
1624 /* Take submission ID under lock */
1625 intel_ds_end_submit(&queue->ds, start_ts);
1626
1627 pthread_mutex_unlock(&device->mutex);
1628
1629 intel_ds_device_process(&device->ds, false);
1630
1631 return result;
1632 }
1633
1634 void
1635 anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
1636 uint32_t num_cmd_buffers)
1637 {
1638 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1639 struct anv_batch_bo **bbo;
1640
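        /* The batch BOs may be mapped through cacheable CPU mappings on
         * integrated GPUs, so flush the CPU cache lines covering every batch
         * BO before submission; the fences order the flushes against
         * surrounding writes.
         */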
1641 __builtin_ia32_mfence();
1642
1643 for (uint32_t i = 0; i < num_cmd_buffers; i++) {
1644 u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
1645 intel_flush_range_no_fence((*bbo)->bo->map, (*bbo)->length);
1646 }
1647 }
1648
1649 __builtin_ia32_mfence();
1650 #endif
1651 }
1652
1653 static VkResult
1654 anv_async_submit_extend_batch(struct anv_batch *batch, uint32_t size,
1655 void *user_data)
1656 {
1657 struct anv_async_submit *submit = user_data;
1658
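        /* Grow geometrically: allocate a new batch BO roughly twice the size
         * of everything allocated so far (8 KiB minimum), emit an
         * MI_BATCH_BUFFER_START into the space reserved at the end of the
         * current batch to jump into it, then point the batch at the new BO,
         * again reserving room at its end for the next chain.
         */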
1659 uint32_t alloc_size = 0;
1660 util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
1661 alloc_size += (*bo)->size;
1662 alloc_size = MAX2(alloc_size * 2, 8192);
1663
1664 struct anv_bo *bo;
1665 VkResult result = anv_bo_pool_alloc(submit->bo_pool,
1666 align(alloc_size, 4096),
1667 &bo);
1668 if (result != VK_SUCCESS)
1669 return result;
1670
1671 util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);
1672
1673 batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;
1674
1675 anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
1676 bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
1677 GFX9_MI_BATCH_BUFFER_START_length_bias;
1678 bbs.SecondLevelBatchBuffer = Firstlevelbatch;
1679 bbs.AddressSpaceIndicator = ASI_PPGTT;
1680 bbs.BatchBufferStartAddress = (struct anv_address) { bo, 0 };
1681 }
1682
1683 anv_batch_set_storage(batch,
1684 (struct anv_address) { .bo = bo, },
1685 bo->map,
1686 bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);
1687
1688 return VK_SUCCESS;
1689 }
1690
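     /*
      * anv_async_submit is a helper for batches that are built and submitted
      * outside of a command buffer (e.g. queue initialization).  A rough
      * usage sketch, with bo_pool standing in for whichever anv_bo_pool the
      * caller owns:
      *
      *    struct anv_async_submit *submit;
      *    anv_async_submit_create(queue, bo_pool,
      *                            false,      (use_companion_rcs)
      *                            true,       (create_signal_sync)
      *                            &submit);
      *    ... emit commands into submit->batch ...
      *    ... hand the submit to the kernel backend for execution ...
      *    anv_async_submit_wait(submit);
      *    anv_async_submit_destroy(submit);
      */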
1691 VkResult
1692 anv_async_submit_init(struct anv_async_submit *submit,
1693 struct anv_queue *queue,
1694 struct anv_bo_pool *bo_pool,
1695 bool use_companion_rcs,
1696 bool create_signal_sync)
1697 {
1698 struct anv_device *device = queue->device;
1699
1700 memset(submit, 0, sizeof(*submit));
1701
1702 submit->use_companion_rcs = use_companion_rcs;
1703 submit->queue = queue;
1704 submit->bo_pool = bo_pool;
1705
1706 const bool uses_relocs = device->physical->uses_relocs;
1707 VkResult result =
1708 anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
1709 if (result != VK_SUCCESS)
1710 return result;
1711
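        /* The batch starts with no storage; the first emit triggers the
         * extend callback, which pulls batch BOs from bo_pool on demand.
         */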
1712 submit->batch = (struct anv_batch) {
1713 .alloc = &device->vk.alloc,
1714 .relocs = &submit->relocs,
1715 .user_data = submit,
1716 .extend_cb = anv_async_submit_extend_batch,
1717 };
1718
1719 util_dynarray_init(&submit->batch_bos, NULL);
1720
1721 if (create_signal_sync) {
1722 result = vk_sync_create(&device->vk,
1723 &device->physical->sync_syncobj_type,
1724 0, 0, &submit->signal.sync);
1725 if (result != VK_SUCCESS) {
1726 anv_reloc_list_finish(&submit->relocs);
1727 util_dynarray_fini(&submit->batch_bos);
1728 return result;
1729 }
1730 submit->owns_sync = true;
1731 }
1732
1733 return VK_SUCCESS;
1734 }
1735
1736 void
1737 anv_async_submit_fini(struct anv_async_submit *submit)
1738 {
1739 struct anv_device *device = submit->queue->device;
1740
1741 if (submit->owns_sync)
1742 vk_sync_destroy(&device->vk, submit->signal.sync);
1743
1744 util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
1745 anv_bo_pool_free(submit->bo_pool, *bo);
1746 util_dynarray_fini(&submit->batch_bos);
1747 anv_reloc_list_finish(&submit->relocs);
1748 }
1749
1750 VkResult
1751 anv_async_submit_create(struct anv_queue *queue,
1752 struct anv_bo_pool *bo_pool,
1753 bool use_companion_rcs,
1754 bool create_signal_sync,
1755 struct anv_async_submit **out_submit)
1756 {
1757 struct anv_device *device = queue->device;
1758
1759 *out_submit =
1760 vk_alloc(&device->vk.alloc, sizeof(struct anv_async_submit), 8,
1761 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1762 if (*out_submit == NULL)
1763 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1764
1765 VkResult result = anv_async_submit_init(*out_submit, queue,
1766 bo_pool,
1767 use_companion_rcs,
1768 create_signal_sync);
1769 if (result != VK_SUCCESS)
1770 vk_free(&device->vk.alloc, *out_submit);
1771
1772 return result;
1773 }
1774
1775 void
1776 anv_async_submit_destroy(struct anv_async_submit *submit)
1777 {
1778 struct anv_device *device = submit->queue->device;
1779 anv_async_submit_fini(submit);
1780 vk_free(&device->vk.alloc, submit);
1781 }
1782
1783 bool
1784 anv_async_submit_done(struct anv_async_submit *submit)
1785 {
1786 struct anv_device *device = submit->queue->device;
1787
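        /* A timeout of 0 makes this a non-blocking poll of the signal sync. */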
1788 return vk_sync_wait(&device->vk,
1789 submit->signal.sync,
1790 submit->signal.signal_value,
1791 VK_SYNC_WAIT_COMPLETE, 0) == VK_SUCCESS;
1792 }
1793
1794 bool
1795 anv_async_submit_wait(struct anv_async_submit *submit)
1796 {
1797 struct anv_device *device = submit->queue->device;
1798
1799 return vk_sync_wait(&device->vk,
1800 submit->signal.sync,
1801 submit->signal.signal_value,
1802 VK_SYNC_WAIT_COMPLETE,
1803 os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE)) == VK_SUCCESS;
1804 }
1805