xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/anv_batch_chain.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 
30 #include <xf86drm.h>
31 
32 #include "anv_private.h"
33 #include "anv_measure.h"
34 
35 #include "common/intel_debug_identifier.h"
36 
37 #include "genxml/gen9_pack.h"
38 #include "genxml/genX_bits.h"
39 
40 #include "util/perf/u_trace.h"
41 
42 /** \file anv_batch_chain.c
43  *
44  * This file contains functions related to anv_cmd_buffer as a data
45  * structure.  This involves everything required to create and destroy
46  * the actual batch buffers as well as link them together.
47  *
48  * It specifically does *not* contain any handling of actual vkCmd calls
49  * beyond vkCmdExecuteCommands.
50  */
51 
52 /*-----------------------------------------------------------------------*
53  * Functions related to anv_reloc_list
54  *-----------------------------------------------------------------------*/
55 
56 VkResult
57 anv_reloc_list_init(struct anv_reloc_list *list,
58                     const VkAllocationCallbacks *alloc,
59                     bool uses_relocs)
60 {
61    assert(alloc != NULL);
62    memset(list, 0, sizeof(*list));
63    list->uses_relocs = uses_relocs;
64    list->alloc = alloc;
65    return VK_SUCCESS;
66 }
67 
68 static VkResult
69 anv_reloc_list_init_clone(struct anv_reloc_list *list,
70                           const struct anv_reloc_list *other_list)
71 {
72    list->dep_words = other_list->dep_words;
73 
74    if (list->dep_words > 0) {
75       list->deps =
76          vk_alloc(list->alloc, list->dep_words * sizeof(BITSET_WORD), 8,
77                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      /* Propagate allocation failure instead of copying through NULL. */
      if (list->deps == NULL)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
78       memcpy(list->deps, other_list->deps,
79              list->dep_words * sizeof(BITSET_WORD));
80    } else {
81       list->deps = NULL;
82    }
83 
84    return VK_SUCCESS;
85 }
86 
87 void
88 anv_reloc_list_finish(struct anv_reloc_list *list)
89 {
90    vk_free(list->alloc, list->deps);
91 }
92 
93 static VkResult
94 anv_reloc_list_grow_deps(struct anv_reloc_list *list,
95                          uint32_t min_num_words)
96 {
97    if (min_num_words <= list->dep_words)
98       return VK_SUCCESS;
99 
100    uint32_t new_length = MAX2(32, list->dep_words * 2);
101    while (new_length < min_num_words)
102       new_length *= 2;
103 
104    BITSET_WORD *new_deps =
105       vk_realloc(list->alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
106                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
107    if (new_deps == NULL)
108       return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
109    list->deps = new_deps;
110 
111    /* Zero out the new data */
112    memset(list->deps + list->dep_words, 0,
113           (new_length - list->dep_words) * sizeof(BITSET_WORD));
114    list->dep_words = new_length;
115 
116    return VK_SUCCESS;
117 }
118 
119 VkResult
120 anv_reloc_list_add_bo_impl(struct anv_reloc_list *list,
121                            struct anv_bo *target_bo)
122 {
123    /* This can happen with sparse resources. */
124    if (!target_bo)
125       return VK_SUCCESS;
126 
127    uint32_t idx = target_bo->gem_handle;
128    VkResult result = anv_reloc_list_grow_deps(list,
129                                               (idx / BITSET_WORDBITS) + 1);
130    if (unlikely(result != VK_SUCCESS))
131       return result;
132 
133    BITSET_SET(list->deps, idx);
134 
135    return VK_SUCCESS;
136 }
137 
138 static void
139 anv_reloc_list_clear(struct anv_reloc_list *list)
140 {
141    if (list->dep_words > 0)
142       memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
143 }
144 
145 VkResult
146 anv_reloc_list_append(struct anv_reloc_list *list,
147                       struct anv_reloc_list *other)
148 {
149    VkResult result = anv_reloc_list_grow_deps(list, other->dep_words);
   if (unlikely(result != VK_SUCCESS))
      return result;
150    for (uint32_t w = 0; w < other->dep_words; w++)
151       list->deps[w] |= other->deps[w];
152 
153    return VK_SUCCESS;
154 }
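/* Illustrative sketch (hypothetical usage, not part of the original file):
 * the reloc list is effectively a bitset of GEM handles this batch depends
 * on.  A BO used by the batch is recorded by setting the bit for its
 * handle, and a secondary command buffer's dependencies are folded into
 * the primary by OR-ing the two bitsets together:
 *
 *    anv_reloc_list_add_bo_impl(batch->relocs, some_bo);
 *    ...
 *    anv_reloc_list_append(&primary->surface_relocs,
 *                          &secondary->surface_relocs);
 */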
155 
156 /*-----------------------------------------------------------------------*
157  * Functions related to anv_batch
158  *-----------------------------------------------------------------------*/
159 
160 static VkResult
161 anv_extend_batch(struct anv_batch *batch, uint32_t size)
162 {
163    assert(batch->extend_cb != NULL);
164    VkResult result = batch->extend_cb(batch, size, batch->user_data);
165    if (result != VK_SUCCESS)
166       return anv_batch_set_error(batch, result);
167    return result;
168 }
169 
170 void *
171 anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
172 {
173    uint32_t size = num_dwords * 4;
174    if (batch->next + size > batch->end) {
175       if (anv_extend_batch(batch, size) != VK_SUCCESS)
176          return NULL;
177    }
178 
179    void *p = batch->next;
180 
181    batch->next += num_dwords * 4;
182    assert(batch->next <= batch->end);
183 
184    return p;
185 }
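/* Illustrative sketch (hypothetical caller, not part of the original file):
 * raw dwords can be reserved directly with anv_batch_emit_dwords().  A NULL
 * return means the batch could not be extended; in that case the error has
 * already been recorded on the batch by anv_extend_batch().
 *
 *    uint32_t *dw = anv_batch_emit_dwords(batch, 2);
 *    if (dw != NULL) {
 *       dw[0] = 0;   (MI_NOOP)
 *       dw[1] = 0;   (MI_NOOP)
 *    }
 */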
186 
187 /* Ensure enough contiguous space is available */
188 VkResult
189 anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
190 {
191    if (batch->next + size > batch->end) {
192       VkResult result = anv_extend_batch(batch, size);
193       if (result != VK_SUCCESS)
194          return result;
195    }
196 
197    assert(batch->next + size <= batch->end);
198 
199    return VK_SUCCESS;
200 }
201 
202 void
203 anv_batch_advance(struct anv_batch *batch, uint32_t size)
204 {
205    assert(batch->next + size <= batch->end);
206 
207    batch->next += size;
208 }
209 
210 struct anv_address
211 anv_batch_address(struct anv_batch *batch, void *batch_location)
212 {
213    assert(batch->start <= batch_location);
214 
215    /* Allow a jump at the current location of the batch. */
216    assert(batch->next >= batch_location);
217 
218    return anv_address_add(batch->start_addr, batch_location - batch->start);
219 }
220 
221 void
222 anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
223 {
224    uint32_t size = other->next - other->start;
225    assert(size % 4 == 0);
226 
227    if (batch->next + size > batch->end) {
228       if (anv_extend_batch(batch, size) != VK_SUCCESS)
229          return;
230    }
231 
232    assert(batch->next + size <= batch->end);
233 
234    VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
235    memcpy(batch->next, other->start, size);
236 
237    VkResult result = anv_reloc_list_append(batch->relocs, other->relocs);
238    if (result != VK_SUCCESS) {
239       anv_batch_set_error(batch, result);
240       return;
241    }
242 
243    batch->next += size;
244 }
245 
246 /*-----------------------------------------------------------------------*
247  * Functions related to anv_batch_bo
248  *-----------------------------------------------------------------------*/
249 
250 static VkResult
251 anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
252                     uint32_t size,
253                     struct anv_batch_bo **bbo_out)
254 {
255    VkResult result;
256 
257    struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
258                                         8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
259    if (bbo == NULL)
260       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
261 
262    result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
263                               size, &bbo->bo);
264    if (result != VK_SUCCESS)
265       goto fail_alloc;
266 
267    const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
268    result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc, uses_relocs);
269    if (result != VK_SUCCESS)
270       goto fail_bo_alloc;
271 
272    *bbo_out = bbo;
273 
274    return VK_SUCCESS;
275 
276  fail_bo_alloc:
277    anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
278  fail_alloc:
279    vk_free(&cmd_buffer->vk.pool->alloc, bbo);
280 
281    return result;
282 }
283 
284 static VkResult
285 anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
286                    const struct anv_batch_bo *other_bbo,
287                    struct anv_batch_bo **bbo_out)
288 {
289    VkResult result;
290 
291    struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
292                                         8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
293    if (bbo == NULL)
294       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
295 
296    result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
297                               other_bbo->bo->size, &bbo->bo);
298    if (result != VK_SUCCESS)
299       goto fail_alloc;
300 
301    result = anv_reloc_list_init_clone(&bbo->relocs, &other_bbo->relocs);
302    if (result != VK_SUCCESS)
303       goto fail_bo_alloc;
304 
305    bbo->length = other_bbo->length;
306    memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length);
307    *bbo_out = bbo;
308 
309    return VK_SUCCESS;
310 
311  fail_bo_alloc:
312    anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
313  fail_alloc:
314    vk_free(&cmd_buffer->vk.pool->alloc, bbo);
315 
316    return result;
317 }
318 
319 static void
320 anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
321                    size_t batch_padding)
322 {
323    anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, },
324                          bbo->bo->map, bbo->bo->size - batch_padding);
325    batch->relocs = &bbo->relocs;
326    anv_reloc_list_clear(&bbo->relocs);
327 }
328 
329 static void
330 anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
331                       size_t batch_padding)
332 {
333    batch->start_addr = (struct anv_address) { .bo = bbo->bo, };
334    batch->start = bbo->bo->map;
335    batch->next = bbo->bo->map + bbo->length;
336    batch->end = bbo->bo->map + bbo->bo->size - batch_padding;
337    batch->relocs = &bbo->relocs;
338 }
339 
340 static void
341 anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
342 {
343    assert(batch->start == bbo->bo->map);
344    bbo->length = batch->next - batch->start;
345    VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
346 }
347 
348 static void
349 anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
350                   struct anv_batch_bo *prev_bbo,
351                   struct anv_batch_bo *next_bbo,
352                   uint32_t next_bbo_offset)
353 {
354    const uint32_t bb_start_offset =
355       prev_bbo->length - GFX9_MI_BATCH_BUFFER_START_length * 4;
356    ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;
357 
358    /* Make sure we're looking at a MI_BATCH_BUFFER_START */
359    assert(((*bb_start >> 29) & 0x07) == 0);
360    assert(((*bb_start >> 23) & 0x3f) == 49);
361 
362    uint64_t *map = prev_bbo->bo->map + bb_start_offset + 4;
363    *map = intel_canonical_address(next_bbo->bo->offset + next_bbo_offset);
364 
365 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
366    if (cmd_buffer->device->physical->memory.need_flush &&
367        anv_bo_needs_host_cache_flush(prev_bbo->bo->alloc_flags))
368       intel_flush_range(map, sizeof(uint64_t));
369 #endif
370 }
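/* Descriptive note (derived from the code above, not upstream text): the
 * last packet in prev_bbo is expected to be the MI_BATCH_BUFFER_START that
 * closes that batch.  Its 64-bit address operand starts 4 bytes into the
 * packet, and anv_batch_bo_link() rewrites just that qword so the jump
 * lands at next_bbo->bo->offset + next_bbo_offset, flushing the CPU cache
 * range when the platform requires it.
 */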
371 
372 static void
373 anv_batch_bo_destroy(struct anv_batch_bo *bbo,
374                      struct anv_cmd_buffer *cmd_buffer)
375 {
376    anv_reloc_list_finish(&bbo->relocs);
377    anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
378    vk_free(&cmd_buffer->vk.pool->alloc, bbo);
379 }
380 
381 static VkResult
382 anv_batch_bo_list_clone(const struct list_head *list,
383                         struct anv_cmd_buffer *cmd_buffer,
384                         struct list_head *new_list)
385 {
386    VkResult result = VK_SUCCESS;
387 
388    list_inithead(new_list);
389 
390    struct anv_batch_bo *prev_bbo = NULL;
391    list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
392       struct anv_batch_bo *new_bbo = NULL;
393       result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
394       if (result != VK_SUCCESS)
395          break;
396       list_addtail(&new_bbo->link, new_list);
397 
398       if (prev_bbo)
399          anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0);
400 
401       prev_bbo = new_bbo;
402    }
403 
404    if (result != VK_SUCCESS) {
405       list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) {
406          list_del(&bbo->link);
407          anv_batch_bo_destroy(bbo, cmd_buffer);
408       }
409    }
410 
411    return result;
412 }
413 
414 /*-----------------------------------------------------------------------*
415  * Functions related to anv_cmd_buffer
416  *-----------------------------------------------------------------------*/
417 
418 static struct anv_batch_bo *
419 anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
420 {
421    return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link);
422 }
423 
424 static struct anv_batch_bo *
425 anv_cmd_buffer_current_generation_batch_bo(struct anv_cmd_buffer *cmd_buffer)
426 {
427    return list_entry(cmd_buffer->generation.batch_bos.prev, struct anv_batch_bo, link);
428 }
429 
430 struct anv_address
431 anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
432 {
433    /* Only graphics & compute queues need binding tables. */
434    if (!(cmd_buffer->queue_family->queueFlags & (VK_QUEUE_GRAPHICS_BIT |
435                                                  VK_QUEUE_COMPUTE_BIT)))
436       return ANV_NULL_ADDRESS;
437 
438    /* If we've never allocated a binding table block, do it now. Doing it later
439     * would trigger another STATE_BASE_ADDRESS emission, which would require an
440     * additional bunch of flushes/stalls.
441     */
442    if (u_vector_length(&cmd_buffer->bt_block_states) == 0) {
443       VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
444       if (result != VK_SUCCESS) {
445          anv_batch_set_error(&cmd_buffer->batch, result);
446          return ANV_NULL_ADDRESS;
447       }
448    }
449 
450    struct anv_state_pool *pool = &cmd_buffer->device->binding_table_pool;
451    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
452    return (struct anv_address) {
453       .bo = pool->block_pool.bo,
454       .offset = bt_block->offset - pool->start_offset,
455    };
456 }
457 
458 static void
459 emit_batch_buffer_start(struct anv_batch *batch,
460                         struct anv_bo *bo, uint32_t offset)
461 {
462    anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
463       bbs.DWordLength               = GFX9_MI_BATCH_BUFFER_START_length -
464                                       GFX9_MI_BATCH_BUFFER_START_length_bias;
465       bbs.SecondLevelBatchBuffer    = Firstlevelbatch;
466       bbs.AddressSpaceIndicator     = ASI_PPGTT;
467       bbs.BatchBufferStartAddress   = (struct anv_address) { bo, offset };
468    }
469 }
470 
471 enum anv_cmd_buffer_batch {
472    ANV_CMD_BUFFER_BATCH_MAIN,
473    ANV_CMD_BUFFER_BATCH_GENERATION,
474 };
475 
476 static void
477 cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
478                              struct anv_batch_bo *bbo,
479                              enum anv_cmd_buffer_batch batch_type)
480 {
481    struct anv_batch *batch =
482       batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
483       &cmd_buffer->generation.batch : &cmd_buffer->batch;
484    struct anv_batch_bo *current_bbo =
485       batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
486       anv_cmd_buffer_current_generation_batch_bo(cmd_buffer) :
487       anv_cmd_buffer_current_batch_bo(cmd_buffer);
488 
489    /* We set the end of the batch a little short so we would be sure we
490     * have room for the chaining command.  Since we're about to emit the
491     * chaining command, let's set it back where it should go.
492     */
493    batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;
494    assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);
495 
496    emit_batch_buffer_start(batch, bbo->bo, 0);
497 
498    anv_batch_bo_finish(current_bbo, batch);
499 
500    /* Add the current amount of data written in the current_bbo to the command
501     * buffer.
502     */
503    cmd_buffer->total_batch_size += current_bbo->length;
504 }
505 
506 static void
507 anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
508                                    struct anv_cmd_buffer *cmd_buffer_to)
509 {
510    uint32_t *bb_start = cmd_buffer_from->batch_end;
511 
512    struct anv_batch_bo *last_bbo =
513       list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
514    struct anv_batch_bo *first_bbo =
515       list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
516 
517    struct GFX9_MI_BATCH_BUFFER_START gen_bb_start = {
518       __anv_cmd_header(GFX9_MI_BATCH_BUFFER_START),
519       .SecondLevelBatchBuffer    = Firstlevelbatch,
520       .AddressSpaceIndicator     = ASI_PPGTT,
521       .BatchBufferStartAddress   = (struct anv_address) { first_bbo->bo, 0 },
522    };
523    struct anv_batch local_batch = {
524       .start  = last_bbo->bo->map,
525       .end    = last_bbo->bo->map + last_bbo->bo->size,
526       .relocs = &last_bbo->relocs,
527       .alloc  = &cmd_buffer_from->vk.pool->alloc,
528    };
529 
530    __anv_cmd_pack(GFX9_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
531 
532    last_bbo->chained = true;
533 }
534 
535 static void
536 anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
537 {
538    struct anv_batch_bo *last_bbo =
539       list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
540    last_bbo->chained = false;
541 
542    uint32_t *batch = cmd_buffer->batch_end;
543    anv_pack_struct(batch, GFX9_MI_BATCH_BUFFER_END,
544                    __anv_cmd_header(GFX9_MI_BATCH_BUFFER_END));
545 }
546 
547 static VkResult
548 anv_cmd_buffer_chain_batch(struct anv_batch *batch, uint32_t size, void *_data)
549 {
550    /* The caller should not need that much space. Otherwise it should split
551     * its commands.
552     */
553    assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
554 
555    struct anv_cmd_buffer *cmd_buffer = _data;
556    struct anv_batch_bo *new_bbo = NULL;
557    /* Amount of reserved space at the end of the batch to account for the
558     * chaining instruction.
559     */
560    const uint32_t batch_padding = GFX9_MI_BATCH_BUFFER_START_length * 4;
561    /* Cap reallocation to chunk. */
562    uint32_t alloc_size = MIN2(
563       MAX2(batch->allocated_batch_size, size + batch_padding),
564       ANV_MAX_CMD_BUFFER_BATCH_SIZE);
565 
566    VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
567    if (result != VK_SUCCESS)
568       return result;
569 
570    batch->allocated_batch_size += alloc_size;
571 
572    struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
573    if (seen_bbo == NULL) {
574       anv_batch_bo_destroy(new_bbo, cmd_buffer);
575       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
576    }
577    *seen_bbo = new_bbo;
578 
579    cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo, ANV_CMD_BUFFER_BATCH_MAIN);
580 
581    list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
582 
583    anv_batch_bo_start(new_bbo, batch, batch_padding);
584 
585    return VK_SUCCESS;
586 }
587 
588 static VkResult
589 anv_cmd_buffer_chain_generation_batch(struct anv_batch *batch, uint32_t size, void *_data)
590 {
591    /* The caller should not need that much space. Otherwise it should split
592     * its commands.
593     */
594    assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
595 
596    struct anv_cmd_buffer *cmd_buffer = _data;
597    struct anv_batch_bo *new_bbo = NULL;
598    /* Cap reallocation to chunk. */
599    uint32_t alloc_size = MIN2(
600       MAX2(batch->allocated_batch_size, size),
601       ANV_MAX_CMD_BUFFER_BATCH_SIZE);
602 
603    VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
604    if (result != VK_SUCCESS)
605       return result;
606 
607    batch->allocated_batch_size += alloc_size;
608 
609    struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
610    if (seen_bbo == NULL) {
611       anv_batch_bo_destroy(new_bbo, cmd_buffer);
612       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
613    }
614    *seen_bbo = new_bbo;
615 
616    if (!list_is_empty(&cmd_buffer->generation.batch_bos)) {
617       cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo,
618                                    ANV_CMD_BUFFER_BATCH_GENERATION);
619    }
620 
621    list_addtail(&new_bbo->link, &cmd_buffer->generation.batch_bos);
622 
623    anv_batch_bo_start(new_bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
624 
625    return VK_SUCCESS;
626 }
627 
628 /** Allocate a binding table
629  *
630  * This function allocates a binding table.  This is a bit more complicated
631  * than one would think due to a combination of Vulkan driver design and some
632  * unfortunate hardware restrictions.
633  *
634  * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for
635  * the binding table pointer which means that all binding tables need to live
636  * in the bottom 64k of surface state base address.  The way the GL driver has
637  * classically dealt with this restriction is to emit all surface states
638  * on-the-fly into the batch and have a batch buffer smaller than 64k.  This
639  * isn't really an option in Vulkan for a couple of reasons:
640  *
641  *  1) In Vulkan, we have growing (or chaining) batches so surface states have
642  *     to live in their own buffer and we have to be able to re-emit
643  *     STATE_BASE_ADDRESS as needed which requires a full pipeline stall.  In
644  *     order to avoid emitting STATE_BASE_ADDRESS any more often than needed
645  *     (it's not that hard to hit 64k of just binding tables), we allocate
646  *     surface state objects up-front when VkImageView is created.  In order
647  *     for this to work, surface state objects need to be allocated from a
648  *     global buffer.
649  *
650  *  2) We tried to design the surface state system in such a way that it's
651  *     already ready for bindless texturing.  The way bindless texturing works
652  *     on our hardware is that you have a big pool of surface state objects
653  *     (with its own state base address) and the bindless handles are simply
654  *     offsets into that pool.  With the architecture we chose, we already
655  *     have that pool and it's exactly the same pool that we use for regular
656  *     surface states so we should already be ready for bindless.
657  *
658  *  3) For render targets, we need to be able to fill out the surface states
659  *     later in vkBeginRenderPass so that we can assign clear colors
660  *     correctly.  One way to do this would be to just create the surface
661  *     state data and then repeatedly copy it into the surface state BO every
662  *     time we have to re-emit STATE_BASE_ADDRESS.  While this works, it's
663  *     rather annoying, and it's much nicer to just allocate them up-front
664  *     and re-use them for the entire render pass.
665  *
666  * While none of these are technically blockers for emitting state on the fly
667  * like we do in GL, the ability to have a single surface state pool is
668  * simplifies things greatly.  Unfortunately, it comes at a cost...
669  *
670  * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
671  * place the binding tables just anywhere in surface state base address.
672  * Because 64k isn't a whole lot of space, we can't simply restrict the
673  * surface state buffer to 64k; we have to be more clever.  The solution we've
674  * chosen is to have a block pool with a maximum size of 2G that starts at
675  * zero and grows in both directions.  All surface states are allocated from
676  * the top of the pool (positive offsets) and we allocate blocks (< 64k) of
677  * binding tables from the bottom of the pool (negative offsets).  Every time
678  * we allocate a new binding table block, we set surface state base address to
679  * point to the bottom of the binding table block.  This way all of the
680  * binding tables in the block are in the bottom 64k of surface state base
681  * address.  When we fill out the binding table, we add the distance between
682  * the bottom of our binding table block and zero of the block pool to the
683  * surface state offsets so that they are correct relative to our new surface
684  * state base address at the bottom of the binding table block.
685  *
686  * \param[in]  entries        The number of surface state entries the binding
687  *                            table should be able to hold.
688  *
689  * \param[out] state_offset   The offset from surface state base address
690  *                            where the surface states live.  This must be
691  *                            added to the surface state offset when it is
692  *                            written into the binding table entry.
693  *
694  * \return                    An anv_state representing the binding table
695  */
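/* Worked example with illustrative numbers (an assumption, not taken from
 * the code): suppose the current binding table block was allocated at
 * bt_block->offset = -8192 in the block pool and an image view's surface
 * state lives at +4096.  Surface state base address is programmed to point
 * at the bottom of the binding table block (pool base - 8192), so
 * *state_offset = 8192 and the binding table entry for that surface is
 * written as 4096 + 8192 = 12288, i.e. the surface state's offset relative
 * to the re-based surface state base address.  The binding table itself
 * then sits within the bottom 64k of that base, which is what the 16-bit
 * 3DSTATE_BINDING_TABLE_POINTERS_* field requires.
 */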
696 struct anv_state
697 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
698                                    uint32_t entries, uint32_t *state_offset)
699 {
700    if (u_vector_length(&cmd_buffer->bt_block_states) == 0)
701       return (struct anv_state) { 0 };
702 
703    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
704 
705    uint32_t bt_size = align(entries * 4, 32);
706 
707    struct anv_state state = cmd_buffer->bt_next;
708    if (bt_size > state.alloc_size)
709       return (struct anv_state) { 0 };
710 
711    state.alloc_size = bt_size;
712    cmd_buffer->bt_next.offset += bt_size;
713    cmd_buffer->bt_next.map += bt_size;
714    cmd_buffer->bt_next.alloc_size -= bt_size;
715 
716    if (cmd_buffer->device->info->verx10 >= 125) {
717       /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding
718        * table address independently from surface state base address.  We no
719        * longer need any sort of offsetting.
720        */
721       *state_offset = 0;
722    } else {
723       assert(bt_block->offset < 0);
724       *state_offset = -bt_block->offset;
725    }
726 
727    return state;
728 }
729 
730 struct anv_state
731 anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
732                                     uint32_t count)
733 {
734    if (count == 0)
735       return ANV_STATE_NULL;
736    struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
737    struct anv_state state =
738       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
739                              count * isl_dev->ss.size,
740                              isl_dev->ss.align);
741    if (state.map == NULL)
742       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
743    return state;
744 }
745 
746 struct anv_state
747 anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
748                                    uint32_t size, uint32_t alignment)
749 {
750    if (size == 0)
751       return ANV_STATE_NULL;
752    struct anv_state state =
753       anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
754                              size, alignment);
755    if (state.map == NULL)
756       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
757    return state;
758 }
759 
760 struct anv_state
761 anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
762                                    uint32_t size, uint32_t alignment)
763 {
764    if (size == 0)
765       return ANV_STATE_NULL;
766    struct anv_state state =
767       anv_state_stream_alloc(&cmd_buffer->general_state_stream,
768                              size, alignment);
769    if (state.map == NULL)
770       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
771    return state;
772 }
773 
774 /** Allocate space associated with a command buffer
775  *
776  * Some commands like vkCmdBuildAccelerationStructuresKHR() can end up needing
777  * large amounts of temporary memory. This function is here to deal with those
778  * potentially larger allocations, using a side BO if needed.
779  *
780  */
781 struct anv_cmd_alloc
782 anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
783                            size_t size, uint32_t alignment,
784                            bool mapped)
785 {
786    /* Below 16k, source memory from dynamic state, otherwise allocate a BO. */
787    if (size < 16 * 1024) {
788       struct anv_state state =
789          anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
790                                 size, alignment);
791       if (state.map == NULL) {
792          anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
793          return (struct anv_cmd_alloc) {
794             .address = ANV_NULL_ADDRESS,
795          };
796       }
797 
798       return (struct anv_cmd_alloc) {
799          .address = anv_state_pool_state_address(
800             &cmd_buffer->device->dynamic_state_pool,
801             state),
802          .map = state.map,
803          .size = size,
804       };
805    }
806 
807    assert(alignment <= 4096);
808 
809    struct anv_bo *bo = NULL;
810    VkResult result =
811       anv_bo_pool_alloc(mapped ?
812                         &cmd_buffer->device->batch_bo_pool :
813                         &cmd_buffer->device->bvh_bo_pool,
814                         align(size, 4096), &bo);
815    if (result != VK_SUCCESS) {
816       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
817       return ANV_EMPTY_ALLOC;
818    }
819 
820    struct anv_bo **bo_entry =
821       u_vector_add(&cmd_buffer->dynamic_bos);
822    if (bo_entry == NULL) {
823       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
824       anv_bo_pool_free(bo->map != NULL ?
825                        &cmd_buffer->device->batch_bo_pool :
826                        &cmd_buffer->device->bvh_bo_pool, bo);
827       return ANV_EMPTY_ALLOC;
828    }
829    *bo_entry = bo;
830 
831    return (struct anv_cmd_alloc) {
832       .address = (struct anv_address) { .bo = bo },
833       .map = bo->map,
834       .size = size,
835    };
836 }
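/* Illustrative usage sketch (hypothetical caller and sizes, not part of the
 * original file): callers should check for a failed allocation before
 * writing through the returned map; on failure the error has already been
 * recorded on the batch.
 *
 *    struct anv_cmd_alloc alloc =
 *       anv_cmd_buffer_alloc_space(cmd_buffer, 64 * 1024, 64, true);
 *    if (alloc.map == NULL)
 *       return;
 *    memset(alloc.map, 0, alloc.size);
 */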
837 
838 VkResult
839 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
840 {
841    struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
842    if (bt_block == NULL) {
843       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
844       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
845    }
846 
847    *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
848 
849    /* The bt_next state is a rolling state (we update it as we suballocate
850     * from it) which is relative to the start of the binding table block.
851     */
852    cmd_buffer->bt_next = *bt_block;
853    cmd_buffer->bt_next.offset = 0;
854 
855    return VK_SUCCESS;
856 }
857 
858 VkResult
859 anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
860 {
861    struct anv_batch_bo *batch_bo = NULL;
862    VkResult result;
863 
864    list_inithead(&cmd_buffer->batch_bos);
865 
866    cmd_buffer->total_batch_size = 0;
867 
868    result = anv_batch_bo_create(cmd_buffer,
869                                 ANV_MIN_CMD_BUFFER_BATCH_SIZE,
870                                 &batch_bo);
871    if (result != VK_SUCCESS)
872       return result;
873 
874    list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
875 
876    cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc;
877    cmd_buffer->batch.user_data = cmd_buffer;
878    cmd_buffer->batch.allocated_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
879 
880    cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
881    cmd_buffer->batch.engine_class = cmd_buffer->queue_family->engine_class;
882    cmd_buffer->batch.trace = &cmd_buffer->trace;
883 
884    anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
885                       GFX9_MI_BATCH_BUFFER_START_length * 4);
886 
887    /* Generation batch is initialized empty since it's possible it won't be
888     * used.
889     */
890    list_inithead(&cmd_buffer->generation.batch_bos);
891 
892    cmd_buffer->generation.batch.alloc = &cmd_buffer->vk.pool->alloc;
893    cmd_buffer->generation.batch.user_data = cmd_buffer;
894    cmd_buffer->generation.batch.allocated_batch_size = 0;
895    cmd_buffer->generation.batch.extend_cb = anv_cmd_buffer_chain_generation_batch;
896    cmd_buffer->generation.batch.engine_class =
897       cmd_buffer->queue_family->engine_class;
898 
899    int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8,
900                                     sizeof(struct anv_bo *));
901    if (!success)
902       goto fail_batch_bo;
903 
904    *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
905 
906    success = u_vector_init(&cmd_buffer->bt_block_states, 8,
907                            sizeof(struct anv_state));
908    if (!success)
909       goto fail_seen_bbos;
910 
911    const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
912    result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
913                                 &cmd_buffer->vk.pool->alloc, uses_relocs);
914    if (result != VK_SUCCESS)
915       goto fail_bt_blocks;
916 
917    return VK_SUCCESS;
918 
919  fail_bt_blocks:
920    u_vector_finish(&cmd_buffer->bt_block_states);
921  fail_seen_bbos:
922    u_vector_finish(&cmd_buffer->seen_bbos);
923  fail_batch_bo:
924    anv_batch_bo_destroy(batch_bo, cmd_buffer);
925 
926    return result;
927 }
928 
929 void
930 anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
931 {
932    struct anv_state *bt_block;
933    u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
934       anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
935    u_vector_finish(&cmd_buffer->bt_block_states);
936 
937    anv_reloc_list_finish(&cmd_buffer->surface_relocs);
938 
939    u_vector_finish(&cmd_buffer->seen_bbos);
940 
941    /* Destroy all of the batch buffers */
942    list_for_each_entry_safe(struct anv_batch_bo, bbo,
943                             &cmd_buffer->batch_bos, link) {
944       list_del(&bbo->link);
945       anv_batch_bo_destroy(bbo, cmd_buffer);
946    }
947    /* Also destroy all generation batch buffers */
948    list_for_each_entry_safe(struct anv_batch_bo, bbo,
949                             &cmd_buffer->generation.batch_bos, link) {
950       list_del(&bbo->link);
951       anv_batch_bo_destroy(bbo, cmd_buffer);
952    }
953 
954    if (cmd_buffer->generation.ring_bo) {
955       anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
956                        cmd_buffer->generation.ring_bo);
957    }
958 }
959 
960 void
961 anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
962 {
963    /* Delete all but the first batch bo */
964    assert(!list_is_empty(&cmd_buffer->batch_bos));
965    while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
966       struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
967       list_del(&bbo->link);
968       anv_batch_bo_destroy(bbo, cmd_buffer);
969    }
970    assert(!list_is_empty(&cmd_buffer->batch_bos));
971 
972    anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
973                       &cmd_buffer->batch,
974                       GFX9_MI_BATCH_BUFFER_START_length * 4);
975 
976    while (u_vector_length(&cmd_buffer->bt_block_states) > 0) {
977       struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
978       anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
979    }
980    cmd_buffer->bt_next = ANV_STATE_NULL;
981 
982    anv_reloc_list_clear(&cmd_buffer->surface_relocs);
983 
984    /* Reset the list of seen buffers */
985    cmd_buffer->seen_bbos.head = 0;
986    cmd_buffer->seen_bbos.tail = 0;
987 
988    struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
989 
990    *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;
991 
992    assert(first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
993    cmd_buffer->batch.allocated_batch_size = first_bbo->bo->size;
994 
995    /* Delete all generation batch bos */
996    list_for_each_entry_safe(struct anv_batch_bo, bbo,
997                             &cmd_buffer->generation.batch_bos, link) {
998       list_del(&bbo->link);
999       anv_batch_bo_destroy(bbo, cmd_buffer);
1000    }
1001 
1002    /* And reset generation batch */
1003    cmd_buffer->generation.batch.allocated_batch_size = 0;
1004    cmd_buffer->generation.batch.start = NULL;
1005    cmd_buffer->generation.batch.end   = NULL;
1006    cmd_buffer->generation.batch.next  = NULL;
1007 
1008    if (cmd_buffer->generation.ring_bo) {
1009       anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
1010                        cmd_buffer->generation.ring_bo);
1011       cmd_buffer->generation.ring_bo = NULL;
1012    }
1013 
1014    cmd_buffer->total_batch_size = 0;
1015 }
1016 
1017 void
1018 anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
1019 {
1020    const struct intel_device_info *devinfo = cmd_buffer->device->info;
1021    struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1022 
1023    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1024       /* When we start a batch buffer, we subtract a certain amount of
1025        * padding from the end to ensure that we always have room to emit a
1026        * BATCH_BUFFER_START to chain to the next BO.  We need to remove
1027        * that padding before we end the batch; otherwise, we may end up
1028        * with our BATCH_BUFFER_END in another BO.
1029        */
1030       cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1031       assert(cmd_buffer->batch.start == batch_bo->bo->map);
1032       assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1033 
1034       /* Save end instruction location to override it later. */
1035       cmd_buffer->batch_end = cmd_buffer->batch.next;
1036 
1037       /* If we can chain this command buffer to another one, leave some room
1038        * for the jump instruction.
1039        */
1040       batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);
1041       if (batch_bo->chained)
1042          emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1043       else
1044          anv_batch_emit(&cmd_buffer->batch, GFX9_MI_BATCH_BUFFER_END, bbe);
1045 
1046       /* Round batch up to an even number of dwords. */
1047       if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
1048          anv_batch_emit(&cmd_buffer->batch, GFX9_MI_NOOP, noop);
1049 
1050       cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
1051    } else {
1052       assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1053       /* If this is a secondary command buffer, we need to determine the
1054        * mode in which it will be executed with vkCmdExecuteCommands.  We
1055        * determine this statically here so that this stays in sync with the
1056        * actual ExecuteCommands implementation.
1057        */
1058       const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
1059       if (cmd_buffer->device->physical->use_call_secondary) {
1060          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;
1061 
1062          void *jump_addr =
1063             anv_genX(devinfo, batch_emit_return)(&cmd_buffer->batch) +
1064             (GFX9_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
1065          cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
1066 
1067          /* The emit above may have caused us to chain batch buffers which
1068           * would mean that batch_bo is no longer valid.
1069           */
1070          batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1071       } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
1072                  (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {
1073          /* If the secondary has exactly one batch buffer in its list *and*
1074           * that batch buffer is less than half of the minimum batch size, we're
1075           * probably better off simply copying it into our batch.
1076           */
1077          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
1078       } else if (!(cmd_buffer->usage_flags &
1079                    VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
1080          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
1081 
1082          /* In order to chain, we need this command buffer to contain an
1083           * MI_BATCH_BUFFER_START which will jump back to the calling batch.
1084           * It doesn't matter where it points now so long as it has a valid
1085           * relocation.  We'll adjust it later as part of the chaining
1086           * process.
1087           *
1088           * We set the end of the batch a little short so we would be sure we
1089           * have room for the chaining command.  Since we're about to emit the
1090           * chaining command, let's set it back where it should go.
1091           */
1092          cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1093          assert(cmd_buffer->batch.start == batch_bo->bo->map);
1094          assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1095 
1096          emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1097          assert(cmd_buffer->batch.start == batch_bo->bo->map);
1098       } else {
1099          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
1100       }
1101    }
1102 
1103    anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
1104 
1105    /* Add the current amount of data written in the current_bbo to the command
1106     * buffer.
1107     */
1108    cmd_buffer->total_batch_size += batch_bo->length;
1109 }
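/* Descriptive summary of the secondary execution modes chosen above
 * (derived from the code in this file, not upstream documentation):
 *
 *  - CALL_AND_RETURN: the secondary ends with a return jump and the primary
 *    later calls into it (requires use_call_secondary).
 *  - EMIT: a small, single-BO secondary is simply memcpy'd into the primary
 *    batch.
 *  - CHAIN: the primary jumps into the secondary and the secondary's
 *    trailing MI_BATCH_BUFFER_START is patched to jump back.
 *  - COPY_AND_CHAIN: like CHAIN, but the secondary's batch BOs are cloned
 *    first, which is needed for SIMULTANEOUS_USE command buffers.
 */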
1110 
1111 static VkResult
1112 anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
1113                              struct list_head *list)
1114 {
1115    list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
1116       struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
1117       if (bbo_ptr == NULL)
1118          return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
1119 
1120       *bbo_ptr = bbo;
1121    }
1122 
1123    return VK_SUCCESS;
1124 }
1125 
1126 void
1127 anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
1128                              struct anv_cmd_buffer *secondary)
1129 {
1130    anv_measure_add_secondary(primary, secondary);
1131    switch (secondary->exec_mode) {
1132    case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
1133       anv_batch_emit_batch(&primary->batch, &secondary->batch);
1134       break;
1135    case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
1136       struct anv_batch_bo *first_bbo =
1137          list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1138       struct anv_batch_bo *last_bbo =
1139          list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1140 
1141       emit_batch_buffer_start(&primary->batch, first_bbo->bo, 0);
1142 
1143       struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
1144       assert(primary->batch.start == this_bbo->bo->map);
1145       uint32_t offset = primary->batch.next - primary->batch.start;
1146 
1147       /* Make the tail of the secondary point back to right after the
1148        * MI_BATCH_BUFFER_START in the primary batch.
1149        */
1150       anv_batch_bo_link(primary, last_bbo, this_bbo, offset);
1151 
1152       anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1153       break;
1154    }
1155    case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
1156       struct list_head copy_list;
1157       VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
1158                                                 secondary,
1159                                                 &copy_list);
1160       if (result != VK_SUCCESS)
1161          return; /* FIXME */
1162 
1163       anv_cmd_buffer_add_seen_bbos(primary, &copy_list);
1164 
1165       struct anv_batch_bo *first_bbo =
1166          list_first_entry(&copy_list, struct anv_batch_bo, link);
1167       struct anv_batch_bo *last_bbo =
1168          list_last_entry(&copy_list, struct anv_batch_bo, link);
1169 
1170       cmd_buffer_chain_to_batch_bo(primary, first_bbo,
1171                                    ANV_CMD_BUFFER_BATCH_MAIN);
1172 
1173       list_splicetail(&copy_list, &primary->batch_bos);
1174 
1175       anv_batch_bo_continue(last_bbo, &primary->batch,
1176                             GFX9_MI_BATCH_BUFFER_START_length * 4);
1177       break;
1178    }
1179    case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {
1180       struct anv_batch_bo *first_bbo =
1181          list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1182 
1183       anv_genX(primary->device->info, batch_emit_secondary_call)(
1184          &primary->batch, primary->device,
1185          (struct anv_address) { .bo = first_bbo->bo },
1186          secondary->return_addr);
1187 
1188       anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1189       break;
1190    }
1191    default:
1192       assert(!"Invalid execution mode");
1193    }
1194 
1195    anv_reloc_list_append(&primary->surface_relocs, &secondary->surface_relocs);
1196 
1197    /* Add the amount of data written in the secondary buffer to the primary
1198     * command buffer.
1199     */
1200    primary->total_batch_size += secondary->total_batch_size;
1201 }
1202 
1203 void
1204 anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
1205                                      uint32_t num_cmd_buffers)
1206 {
1207    if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
1208       assert(num_cmd_buffers == 1);
1209       return;
1210    }
1211 
1212    /* Chain each of the first N-1 command buffers to the next one */
1213    for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) {
1214       assert(cmd_buffers[i]->companion_rcs_cmd_buffer == NULL);
1215       anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
1216    }
1217 
1218    /* Put an end to the last one */
1219    anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
1220 }
1221 
1222 static void
1223 anv_print_batch(struct anv_device *device,
1224                 struct anv_queue *queue,
1225                 struct anv_cmd_buffer *cmd_buffer)
1226 {
1227    struct anv_batch_bo *bbo =
1228       list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
1229    device->cmd_buffer_being_decoded = cmd_buffer;
1230    struct intel_batch_decode_ctx *ctx = queue->decoder;
1231 
1232    if (cmd_buffer->is_companion_rcs_cmd_buffer) {
1233       int render_queue_idx =
1234          anv_get_first_render_queue_index(device->physical);
1235       ctx = &device->decoder[render_queue_idx];
1236    }
1237 
1238    if (INTEL_DEBUG(DEBUG_BATCH)) {
1239       intel_print_batch(ctx, bbo->bo->map,
1240                         bbo->bo->size, bbo->bo->offset, false);
1241    }
1242    if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
1243       intel_batch_stats(ctx, bbo->bo->map,
1244                         bbo->bo->size, bbo->bo->offset, false);
1245    }
1246    device->cmd_buffer_being_decoded = NULL;
1247 }
1248 
1249 void
1250 anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
1251                                 uint32_t cmd_buffer_count,
1252                                 struct anv_cmd_buffer **cmd_buffers,
1253                                 struct anv_query_pool *perf_query_pool,
1254                                 uint32_t perf_query_pass)
1255 {
1256    if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
1257       return;
1258 
1259    struct anv_device *device = queue->device;
1260    const bool has_perf_query = perf_query_pool && cmd_buffer_count;
1261    uint64_t frame_id = device->debug_frame_desc->frame_id;
1262 
1263    if (!intel_debug_batch_in_range(device->debug_frame_desc->frame_id))
1264       return;
1265    fprintf(stderr, "Batch for frame %"PRIu64" on queue %d\n",
1266       frame_id, (int)(queue - device->queues));
1267 
1268    if (cmd_buffer_count) {
1269       if (has_perf_query) {
1270          struct anv_bo *pass_batch_bo = perf_query_pool->bo;
1271          uint64_t pass_batch_offset =
1272             khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
1273 
1274          if (INTEL_DEBUG(DEBUG_BATCH)) {
1275             intel_print_batch(queue->decoder,
1276                               pass_batch_bo->map + pass_batch_offset, 64,
1277                               pass_batch_bo->offset + pass_batch_offset, false);
1278          }
1279       }
1280 
1281       for (uint32_t i = 0; i < cmd_buffer_count; i++)
1282          anv_print_batch(device, queue, cmd_buffers[i]);
1283    } else if (INTEL_DEBUG(DEBUG_BATCH)) {
1284       intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
1285                         device->trivial_batch_bo->size,
1286                         device->trivial_batch_bo->offset, false);
1287    }
1288 }
1289 
1290 /* We lock around execbuf for three main reasons:
1291  *
1292  *  1) When a block pool is resized, we create a new gem handle with a
1293  *     different size and, in the case of surface states, possibly a different
1294  *     center offset but we re-use the same anv_bo struct when we do so. If
1295  *     this happens in the middle of setting up an execbuf, we could end up
1296  *     with our list of BOs out of sync with our list of gem handles.
1297  *
1298  *  2) The algorithm we use for building the list of unique buffers isn't
1299  *     thread-safe. While the client is supposed to synchronize around
1300  *     QueueSubmit, this would be extremely difficult to debug if it ever came
1301  *     up in the wild due to a broken app. It's better to play it safe and
1302  *     just lock around QueueSubmit.
1303  *
1304  * Since the only other things that ever take the device lock, such as block
1305  * pool resize, happen only rarely, this will almost never be contended, so
1306  * taking a lock isn't really an expensive operation in this case.
1307  */
1308 static inline VkResult
1309 anv_queue_exec_locked(struct anv_queue *queue,
1310                       uint32_t wait_count,
1311                       const struct vk_sync_wait *waits,
1312                       uint32_t cmd_buffer_count,
1313                       struct anv_cmd_buffer **cmd_buffers,
1314                       uint32_t signal_count,
1315                       const struct vk_sync_signal *signals,
1316                       struct anv_query_pool *perf_query_pool,
1317                       uint32_t perf_query_pass,
1318                       struct anv_utrace_submit *utrace_submit)
1319 {
1320    struct anv_device *device = queue->device;
1321    VkResult result = VK_SUCCESS;
1322 
1323    /* We only need to synchronize the main & companion command buffers if we
1324     * have a companion command buffer somewhere in the list of command
1325     * buffers.
1326     */
1327    bool needs_companion_sync = false;
1328    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
1329       if (cmd_buffers[i]->companion_rcs_cmd_buffer != NULL) {
1330          needs_companion_sync = true;
1331          break;
1332       }
1333    }
1334 
1335    if (perf_query_pool && device->perf_queue != queue)
1336       debug_warn_once("Mismatch between the queue the OA stream was opened "
1337                       "on and the queue the query will be executed on.");
1338 
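   /* Note: when a companion batch is present, the client signals are withheld
    * from this first submission (signal_count is forced to 0 below) and only
    * attached to the follow-up submission that waits on companion_sync, so
    * they cannot fire before the companion RCS work has completed.
    */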
1339    result =
1340       device->kmd_backend->queue_exec_locked(
1341          queue,
1342          wait_count, waits,
1343          cmd_buffer_count, cmd_buffers,
1344          needs_companion_sync ? 0 : signal_count, signals,
1345          perf_query_pool,
1346          perf_query_pass,
1347          utrace_submit);
1348    if (result != VK_SUCCESS)
1349       return result;
1350 
1351    if (needs_companion_sync) {
1352       struct vk_sync_wait companion_sync = {
1353          .sync = queue->companion_sync,
1354       };
1355       /* If any of the command buffers had a companion batch, the submission
1356        * backend will signal queue->companion_sync, so to ensure completion,
1357        * we just need to wait on that fence.
1358        */
1359       result =
1360          device->kmd_backend->queue_exec_locked(queue,
1361                                                 1, &companion_sync,
1362                                                 0, NULL,
1363                                                 signal_count, signals,
1364                                                 NULL, 0,
1365                                                 NULL);
1366    }
1367 
1368    return result;
1369 }
1370 
1371 static inline bool
1372 can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2)
1373 {
1374    return (!p1 || !p2 || p1 == p2);
1375 }
1376 
1377 static VkResult
1378 anv_queue_submit_sparse_bind_locked(struct anv_queue *queue,
1379                                     struct vk_queue_submit *submit)
1380 {
1381    struct anv_device *device = queue->device;
1382    VkResult result;
1383 
1384    /* When fake sparse is enabled, we accept creating "sparse" resources but
1385     * we can't really handle sparse submission. Fake sparse is meant for
1386     * applications that request the sparse feature to be enabled but don't
1387     * actually *use* it.
1388     */
1389    if (device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) {
1390       if (INTEL_DEBUG(DEBUG_SPARSE))
1391          fprintf(stderr, "=== application submitting sparse operations: "
1392                "buffer_bind:%u image_opaque_bind:%u image_bind:%u\n",
1393                submit->buffer_bind_count, submit->image_opaque_bind_count,
1394                submit->image_bind_count);
1395       return vk_queue_set_lost(&queue->vk, "Sparse binding not supported");
1396    }
1397 
1398    assert(submit->command_buffer_count == 0);
1399 
1400    if (INTEL_DEBUG(DEBUG_SPARSE)) {
1401       fprintf(stderr, "[sparse submission, buffers:%u opaque_images:%u "
1402               "images:%u waits:%u signals:%u]\n",
1403               submit->buffer_bind_count,
1404               submit->image_opaque_bind_count,
1405               submit->image_bind_count,
1406               submit->wait_count, submit->signal_count);
1407    }
1408 
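   /* All of the buffer/image binds below are accumulated into this single
    * anv_sparse_submission (the anv_sparse_bind_* helpers grow
    * sparse_submit.binds as needed) and then flushed with one
    * anv_sparse_bind() call; the binds array is freed at out_free_submit
    * whether or not we succeed.
    */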
1409    struct anv_sparse_submission sparse_submit = {
1410       .queue = queue,
1411       .binds = NULL,
1412       .binds_len = 0,
1413       .binds_capacity = 0,
1414       .wait_count = submit->wait_count,
1415       .signal_count = submit->signal_count,
1416       .waits = submit->waits,
1417       .signals = submit->signals,
1418    };
1419 
1420    for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
1421       VkSparseBufferMemoryBindInfo *bind_info = &submit->buffer_binds[i];
1422       ANV_FROM_HANDLE(anv_buffer, buffer, bind_info->buffer);
1423 
1424       assert(anv_buffer_is_sparse(buffer));
1425 
1426       for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1427          result = anv_sparse_bind_buffer(device, buffer,
1428                                          &bind_info->pBinds[j],
1429                                          &sparse_submit);
1430          if (result != VK_SUCCESS)
1431             goto out_free_submit;
1432       }
1433    }
1434 
1435    for (uint32_t i = 0; i < submit->image_bind_count; i++) {
1436       VkSparseImageMemoryBindInfo *bind_info = &submit->image_binds[i];
1437       ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1438 
1439       assert(anv_image_is_sparse(image));
1440       assert(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
1441 
1442       for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1443          result = anv_sparse_bind_image_memory(queue, image,
1444                                                &bind_info->pBinds[j],
1445                                                &sparse_submit);
1446          if (result != VK_SUCCESS)
1447             goto out_free_submit;
1448       }
1449    }
1450 
1451    for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
1452       VkSparseImageOpaqueMemoryBindInfo *bind_info =
1453          &submit->image_opaque_binds[i];
1454       ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1455 
1456       assert(anv_image_is_sparse(image));
1457 
1458       for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1459          result = anv_sparse_bind_image_opaque(device, image,
1460                                                &bind_info->pBinds[j],
1461                                                &sparse_submit);
1462          if (result != VK_SUCCESS)
1463             goto out_free_submit;
1464       }
1465    }
1466 
1467    result = anv_sparse_bind(device, &sparse_submit);
1468 
1469 out_free_submit:
1470    vk_free(&device->vk.alloc, sparse_submit.binds);
1471    return result;
1472 }
1473 
1474 static VkResult
1475 anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
1476                                     struct vk_queue_submit *submit,
1477                                     struct anv_utrace_submit *utrace_submit)
1478 {
1479    VkResult result;
1480 
1481    if (submit->command_buffer_count == 0) {
1482       result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits,
1483                                      0 /* cmd_buffer_count */,
1484                                      NULL /* cmd_buffers */,
1485                                      submit->signal_count, submit->signals,
1486                                      NULL /* perf_query_pool */,
1487                                      0 /* perf_query_pass */,
1488                                      utrace_submit);
1489       if (result != VK_SUCCESS)
1490          return result;
1491    } else {
1492       /* Everything's easier if we don't have to bother with container_of() */
1493       STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0);
1494       struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers;
1495       struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
1496       uint32_t start = 0;
1497       uint32_t end = submit->command_buffer_count;
1498       struct anv_query_pool *perf_query_pool =
1499          cmd_buffers[start]->perf_query_pool;
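      /* Walk the command buffers and submit them in contiguous chainable
       * groups.  Illustrative example: with buffers [A, B, C] where C is not
       * chainable, two execbufs are issued, [A, B] and then [C].  The waits
       * are only attached to the first execbuf and the signals (plus the
       * utrace submit) only to the last one.
       */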
1500       for (uint32_t n = 0; n < end; n++) {
1501          bool can_chain = false;
1502          uint32_t next = n + 1;
1503          /* Can we chain this buffer into the next one? */
1504          if (next < end &&
1505              anv_cmd_buffer_is_chainable(cmd_buffers[n]) &&
1506              anv_cmd_buffer_is_chainable(cmd_buffers[next]) &&
1507              can_chain_query_pools
1508              (cmd_buffers[next]->perf_query_pool, perf_query_pool)) {
1509             can_chain = true;
1510             perf_query_pool =
1511                perf_query_pool ? perf_query_pool :
1512                cmd_buffers[next]->perf_query_pool;
1513          }
1514          if (!can_chain) {
1515             /* The next buffer cannot be chained, or we have reached the
1516              * last buffer; submit what has been chained so far.
1517              */
1518             VkResult result =
1519                anv_queue_exec_locked(queue,
1520                                      start == 0 ? submit->wait_count : 0,
1521                                      start == 0 ? submit->waits : NULL,
1522                                      next - start, &cmd_buffers[start],
1523                                      next == end ? submit->signal_count : 0,
1524                                      next == end ? submit->signals : NULL,
1525                                      perf_query_pool,
1526                                      submit->perf_pass_index,
1527                                      next == end ? utrace_submit : NULL);
1528             if (result != VK_SUCCESS)
1529                return result;
1530             if (next < end) {
1531                start = next;
1532                perf_query_pool = cmd_buffers[start]->perf_query_pool;
1533             }
1534          }
1535       }
1536    }
1537    for (uint32_t i = 0; i < submit->signal_count; i++) {
1538       if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync))
1539          continue;
1540 
1541       struct anv_bo_sync *bo_sync =
1542          container_of(submit->signals[i].sync, struct anv_bo_sync, sync);
1543 
1544       /* Once the execbuf has returned, we need to set the fence state to
1545        * SUBMITTED.  We can't do this before calling execbuf because
1546        * anv_GetFenceStatus does take the global device lock before checking
1547        * fence->state.
1548        *
1549        * We set the fence state to SUBMITTED regardless of whether or not the
1550        * execbuf succeeds because we need to ensure that vkWaitForFences() and
1551        * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
1552        * VK_SUCCESS) in a finite amount of time even if execbuf fails.
1553        */
1554       assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET);
1555       bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED;
1556    }
1557 
1558    pthread_cond_broadcast(&queue->device->queue_submit);
1559 
1560    return VK_SUCCESS;
1561 }
1562 
1563 static inline void
1564 anv_queue_free_initial_submission(struct anv_queue *queue)
1565 {
1566    if (queue->init_submit &&
1567        anv_async_submit_done(queue->init_submit)) {
1568       anv_async_submit_destroy(queue->init_submit);
1569       queue->init_submit = NULL;
1570    }
1571    if (queue->init_companion_submit &&
1572        anv_async_submit_done(queue->init_companion_submit)) {
1573       anv_async_submit_destroy(queue->init_companion_submit);
1574       queue->init_companion_submit = NULL;
1575    }
1576 }
1577 
1578 VkResult
1579 anv_queue_submit(struct vk_queue *vk_queue,
1580                  struct vk_queue_submit *submit)
1581 {
1582    struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk);
1583    struct anv_device *device = queue->device;
1584    VkResult result;
1585 
1586    anv_queue_free_initial_submission(queue);
1587 
1588    if (queue->device->info->no_hw) {
1589       for (uint32_t i = 0; i < submit->signal_count; i++) {
1590          result = vk_sync_signal(&device->vk,
1591                                  submit->signals[i].sync,
1592                                  submit->signals[i].signal_value);
1593          if (result != VK_SUCCESS)
1594             return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed");
1595       }
1596       return VK_SUCCESS;
1597    }
1598 
1599    /* Flush the trace points before taking the lock, as the flushing might
1600     * try to take that same lock.
1601     */
1602    struct anv_utrace_submit *utrace_submit = NULL;
1603    result = anv_device_utrace_flush_cmd_buffers(
1604       queue,
1605       submit->command_buffer_count,
1606       (struct anv_cmd_buffer **)submit->command_buffers,
1607       &utrace_submit);
1608    if (result != VK_SUCCESS)
1609       return result;
1610 
1611    pthread_mutex_lock(&device->mutex);
1612 
1613    uint64_t start_ts = intel_ds_begin_submit(&queue->ds);
1614 
1615    if (submit->buffer_bind_count ||
1616        submit->image_opaque_bind_count ||
1617        submit->image_bind_count) {
1618       result = anv_queue_submit_sparse_bind_locked(queue, submit);
1619    } else {
1620       result = anv_queue_submit_cmd_buffers_locked(queue, submit,
1621                                                    utrace_submit);
1622    }
1623 
1624    /* Take submission ID under lock */
1625    intel_ds_end_submit(&queue->ds, start_ts);
1626 
1627    pthread_mutex_unlock(&device->mutex);
1628 
1629    intel_ds_device_process(&device->ds, false);
1630 
1631    return result;
1632 }
1633 
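/* Flush the CPU cache lines backing every batch BO of the given command
 * buffers so the GPU sees the fully written batches.  This is only compiled
 * in for integrated GPUs, where the batch mappings may not be coherent with
 * the GPU; the mfence before and after bounds the flushes, since
 * intel_flush_range_no_fence() does not issue a fence of its own.
 */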
1634 void
1635 anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
1636                        uint32_t num_cmd_buffers)
1637 {
1638 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1639    struct anv_batch_bo **bbo;
1640 
1641    __builtin_ia32_mfence();
1642 
1643    for (uint32_t i = 0; i < num_cmd_buffers; i++) {
1644       u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
1645          intel_flush_range_no_fence((*bbo)->bo->map, (*bbo)->length);
1646       }
1647    }
1648 
1649    __builtin_ia32_mfence();
1650 #endif
1651 }
1652 
1653 static VkResult
1654 anv_async_submit_extend_batch(struct anv_batch *batch, uint32_t size,
1655                               void *user_data)
1656 {
1657    struct anv_async_submit *submit = user_data;
1658 
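   /* Size the new BO at a minimum of 8k, or twice the total size of the batch
    * BOs allocated so far, so that repeated extensions grow the batch
    * geometrically.
    */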
1659    uint32_t alloc_size = 0;
1660    util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
1661       alloc_size += (*bo)->size;
1662    alloc_size = MAX2(alloc_size * 2, 8192);
1663 
1664    struct anv_bo *bo;
1665    VkResult result = anv_bo_pool_alloc(submit->bo_pool,
1666                                        align(alloc_size, 4096),
1667                                        &bo);
1668    if (result != VK_SUCCESS)
1669       return result;
1670 
1671    util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);
1672 
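   /* anv_batch_set_storage() below advertises each BO as
    * 4 * GFX9_MI_BATCH_BUFFER_START_length bytes smaller than it really is,
    * so the current storage always holds that much space in reserve.  Push
    * batch->end out over the reserved bytes and chain into the freshly
    * allocated BO with an MI_BATCH_BUFFER_START.
    */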
1673    batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;
1674 
1675    anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
1676       bbs.DWordLength               = GFX9_MI_BATCH_BUFFER_START_length -
1677                                       GFX9_MI_BATCH_BUFFER_START_length_bias;
1678       bbs.SecondLevelBatchBuffer    = Firstlevelbatch;
1679       bbs.AddressSpaceIndicator     = ASI_PPGTT;
1680       bbs.BatchBufferStartAddress   = (struct anv_address) { bo, 0 };
1681    }
1682 
1683    anv_batch_set_storage(batch,
1684                          (struct anv_address) { .bo = bo, },
1685                          bo->map,
1686                          bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);
1687 
1688    return VK_SUCCESS;
1689 }
1690 
1691 VkResult
1692 anv_async_submit_init(struct anv_async_submit *submit,
1693                       struct anv_queue *queue,
1694                       struct anv_bo_pool *bo_pool,
1695                       bool use_companion_rcs,
1696                       bool create_signal_sync)
1697 {
1698    struct anv_device *device = queue->device;
1699 
1700    memset(submit, 0, sizeof(*submit));
1701 
1702    submit->use_companion_rcs = use_companion_rcs;
1703    submit->queue = queue;
1704    submit->bo_pool = bo_pool;
1705 
1706    const bool uses_relocs = device->physical->uses_relocs;
1707    VkResult result =
1708       anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
1709    if (result != VK_SUCCESS)
1710       return result;
1711 
1712    submit->batch = (struct anv_batch) {
1713       .alloc = &device->vk.alloc,
1714       .relocs = &submit->relocs,
1715       .user_data = submit,
1716       .extend_cb = anv_async_submit_extend_batch,
1717    };
1718 
1719    util_dynarray_init(&submit->batch_bos, NULL);
1720 
1721    if (create_signal_sync) {
1722       result = vk_sync_create(&device->vk,
1723                               &device->physical->sync_syncobj_type,
1724                               0, 0, &submit->signal.sync);
1725       if (result != VK_SUCCESS) {
1726          anv_reloc_list_finish(&submit->relocs);
1727          util_dynarray_fini(&submit->batch_bos);
1728          return result;
1729       }
1730       submit->owns_sync = true;
1731    }
1732 
1733    return VK_SUCCESS;
1734 }
1735 
1736 void
1737 anv_async_submit_fini(struct anv_async_submit *submit)
1738 {
1739    struct anv_device *device = submit->queue->device;
1740 
1741    if (submit->owns_sync)
1742       vk_sync_destroy(&device->vk, submit->signal.sync);
1743 
1744    util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
1745       anv_bo_pool_free(submit->bo_pool, *bo);
1746    util_dynarray_fini(&submit->batch_bos);
1747    anv_reloc_list_finish(&submit->relocs);
1748 }
1749 
1750 VkResult
1751 anv_async_submit_create(struct anv_queue *queue,
1752                         struct anv_bo_pool *bo_pool,
1753                         bool use_companion_rcs,
1754                         bool create_signal_sync,
1755                         struct anv_async_submit **out_submit)
1756 {
1757    struct anv_device *device = queue->device;
1758 
1759    *out_submit =
1760       vk_alloc(&device->vk.alloc, sizeof(struct anv_async_submit), 8,
1761                VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1762    if (*out_submit == NULL)
1763       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1764 
1765    VkResult result = anv_async_submit_init(*out_submit, queue,
1766                                            bo_pool,
1767                                            use_companion_rcs,
1768                                            create_signal_sync);
1769    if (result != VK_SUCCESS)
1770       vk_free(&device->vk.alloc, *out_submit);
1771 
1772    return result;
1773 }
1774 
1775 void
1776 anv_async_submit_destroy(struct anv_async_submit *submit)
1777 {
1778    struct anv_device *device = submit->queue->device;
1779    anv_async_submit_fini(submit);
1780    vk_free(&device->vk.alloc, submit);
1781 }
1782 
1783 bool
1784 anv_async_submit_done(struct anv_async_submit *submit)
1785 {
1786    struct anv_device *device = submit->queue->device;
1787 
1788    return vk_sync_wait(&device->vk,
1789                        submit->signal.sync,
1790                        submit->signal.signal_value,
1791                        VK_SYNC_WAIT_COMPLETE, 0) == VK_SUCCESS;
1792 }
1793 
1794 bool
1795 anv_async_submit_wait(struct anv_async_submit *submit)
1796 {
1797    struct anv_device *device = submit->queue->device;
1798 
1799    return vk_sync_wait(&device->vk,
1800                        submit->signal.sync,
1801                        submit->signal.signal_value,
1802                        VK_SYNC_WAIT_COMPLETE,
1803                        os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE)) == VK_SUCCESS;
1804 }
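
/* Illustrative sketch only, not part of the driver: a minimal lifetime for an
 * anv_async_submit using just the helpers defined above.  The function name
 * anv_async_submit_usage_sketch is hypothetical, and the actual kernel
 * submission step is elided because it goes through the kmd backend, whose
 * interface lives elsewhere; without that step the wait below would never
 * complete.
 */
static VkResult
anv_async_submit_usage_sketch(struct anv_queue *queue,
                              struct anv_bo_pool *bo_pool)
{
   struct anv_async_submit *submit;
   VkResult result = anv_async_submit_create(queue, bo_pool,
                                             false /* use_companion_rcs */,
                                             true /* create_signal_sync */,
                                             &submit);
   if (result != VK_SUCCESS)
      return result;

   /* Emit commands into submit->batch here (anv_async_submit_extend_batch
    * grows the storage on demand), then hand the submit to the kernel backend
    * so that submit->signal gets signaled on completion.
    */

   if (!anv_async_submit_wait(submit))
      result = VK_ERROR_DEVICE_LOST;

   anv_async_submit_destroy(submit);
   return result;
}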
1805