/**************************************************************************
 *
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/* This is a wrapper for pipe_context that executes all pipe_context calls
 * in another thread.
 *
 *
 * Guidelines for adopters and deviations from Gallium
 * ---------------------------------------------------
 *
 * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
 *    driver functions that take a context (fence_finish, texture_get_handle)
 *    should manually unwrap pipe_context by doing:
 *      pipe = threaded_context_unwrap_sync(pipe);
 *
 *    pipe_context::priv is used to unwrap the context, so drivers and state
 *    trackers shouldn't use it.
 *
 *    No other objects are wrapped.
 *
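 *    For illustration, a pipe_screen::fence_finish implementation could
 *    unwrap the incoming context roughly like this (a minimal sketch;
 *    "mydrv_fence_finish" and "mydrv_fence_wait" are hypothetical names):
 *
 *      static bool mydrv_fence_finish(struct pipe_screen *screen,
 *                                     struct pipe_context *ctx,
 *                                     struct pipe_fence_handle *fence,
 *                                     uint64_t timeout)
 *      {
 *         if (ctx)
 *            ctx = threaded_context_unwrap_sync(ctx);
 *         return mydrv_fence_wait(screen, ctx, fence, timeout);
 *      }
 *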
 * 2) Drivers must subclass and initialize these structures:
 *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
 *    - threaded_query for pipe_query (zero memory)
 *    - threaded_transfer for pipe_transfer (zero memory)
 *
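 *    For illustration, a driver resource subclass and its initialization
 *    might look like this (a minimal sketch; "mydrv_resource" and
 *    "mydrv_resource_create" are hypothetical names):
 *
 *      struct mydrv_resource {
 *         struct threaded_resource b;   // must be the first member
 *         // ... driver-specific fields ...
 *      };
 *
 *      static struct pipe_resource *
 *      mydrv_resource_create(struct pipe_screen *screen,
 *                            const struct pipe_resource *templ)
 *      {
 *         struct mydrv_resource *res = CALLOC_STRUCT(mydrv_resource);
 *         res->b.b = *templ;
 *         pipe_reference_init(&res->b.b.reference, 1);
 *         res->b.b.screen = screen;
 *         threaded_resource_init(&res->b.b, false);
 *         return &res->b.b;
 *      }
 *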
 * 3) The threaded context must not be enabled for contexts that can use video
 *    codecs.
 *
 * 4) Changes in driver behavior:
 *    - begin_query and end_query always return true; return values from
 *      the driver are ignored.
 *    - generate_mipmap uses is_format_supported to determine success;
 *      the return value from the driver is ignored.
 *    - resource_commit always returns true; failures are ignored.
 *    - set_debug_callback is skipped if the callback is synchronous.
 *
 *
 * Thread-safety requirements on context functions
 * -----------------------------------------------
 *
 * These pipe_context functions are executed directly, so they shouldn't use
 * pipe_context in an unsafe way. They are de-facto screen functions now:
 * - create_query
 * - create_batch_query
 * - create_*_state (all CSOs and shaders)
 *     - Make sure the shader compiler doesn't use any per-context stuff.
 *       (e.g. LLVM target machine)
 *     - Only pipe_context's debug callback for shader dumps is guaranteed to
 *       be up to date, because set_debug_callback synchronizes execution.
 * - create_surface
 * - surface_destroy
 * - create_sampler_view
 * - sampler_view_destroy
 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
 * - get_query_result (when threaded_query::flushed == true)
 * - create_stream_output_target
 * - get_sample_position
 *
 *
 * Transfer_map rules for buffer mappings
 * --------------------------------------
 *
 * 1) If transfer_map has PIPE_MAP_UNSYNCHRONIZED, the call is made
 *    in the non-driver thread without flushing the queue. The driver will
 *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to
 *    PIPE_MAP_UNSYNCHRONIZED to indicate this.
 *    Note that transfer_unmap is always enqueued and called from the driver
 *    thread.
 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
 *    the valid buffer range. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring
 *    the flag will lead to failures.
 *    The threaded context does its own detection of unsynchronized mappings.
 *
 * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary for unsynchronized maps to map the
 *    latest version of the buffer. (because invalidations can be queued,
 *    while unsynchronized maps are not queued and they should return the
 *    latest storage after invalidation). The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
 *    Do NOT use pipe_buffer_write, as this may trigger invalidation;
 *    use tc_buffer_write instead.
 *
 * 4) PIPE_MAP_ONCE can no longer be used to infer that a buffer will not be
 *    mapped a second time before it is unmapped.
 *
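 * For illustration, a driver's buffer transfer_map could react to these flags
 * roughly as follows (a sketch only; the "mydrv_*" helpers are hypothetical):
 *
 *   static void *
 *   mydrv_buffer_transfer_map(struct pipe_context *ctx,
 *                             struct pipe_resource *res,
 *                             unsigned level, unsigned usage,
 *                             const struct pipe_box *box,
 *                             struct pipe_transfer **transfer)
 *   {
 *      // Rule 2: only infer unsynchronized access when TC doesn't forbid it.
 *      if (!(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED) &&
 *          mydrv_range_is_unused(res, box))
 *         usage |= PIPE_MAP_UNSYNCHRONIZED;
 *
 *      // Rule 3: never invalidate storage when TC forbids it.
 *      if (!(usage & TC_TRANSFER_MAP_NO_INVALIDATE) &&
 *          (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE))
 *         mydrv_invalidate_storage(res);
 *
 *      // Rule 1: with TC_TRANSFER_MAP_THREADED_UNSYNC this runs in the
 *      // frontend thread, so only thread-safe work is allowed here.
 *      return mydrv_map_storage(ctx, res, level, box, usage, transfer);
 *   }
 *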
 *
 * Rules for fences
 * ----------------
 *
 * Flushes will be executed asynchronously in the driver thread if a
 * create_fence callback is provided. This affects fence semantics as follows.
 *
 * When the threaded context wants to perform an asynchronous flush, it will
 * use the create_fence callback to pre-create the fence from the calling
 * thread. This pre-created fence will be passed to pipe_context::flush
 * together with the TC_FLUSH_ASYNC flag.
 *
 * The callback receives the unwrapped context as a parameter, but must use it
 * in a thread-safe way because it is called from a non-driver thread.
 *
 * If the threaded_context does not immediately flush the current batch, the
 * callback also receives a tc_unflushed_batch_token. If fence_finish is called
 * on the returned fence in the context that created the fence,
 * threaded_context_flush must be called.
 *
 * The driver must implement pipe_context::fence_server_sync properly, since
 * the threaded context handles PIPE_FLUSH_ASYNC.
 *
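 * For illustration, the create_fence callback could look roughly like this
 * (a sketch; "mydrv_fence_create_deferred" is a hypothetical helper):
 *
 *   static struct pipe_fence_handle *
 *   mydrv_create_fence(struct pipe_context *ctx,
 *                      struct tc_unflushed_batch_token *token)
 *   {
 *      // Runs in the frontend thread: allocate the fence object in a
 *      // thread-safe way and keep the token so fence_finish can decide
 *      // whether threaded_context_flush is needed.
 *      return mydrv_fence_create_deferred(ctx->screen, token);
 *   }
 *
 * In pipe_context::flush, the TC_FLUSH_ASYNC flag then indicates that the
 * fence argument was pre-created by this callback.
 *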
 *
 * Additional requirements
 * -----------------------
 *
 * get_query_result:
 *    If threaded_query::flushed == true, get_query_result should assume that
 *    it's called from a non-driver thread, in which case the driver shouldn't
 *    use the context in an unsafe way.
 *
 * replace_buffer_storage:
 *    The driver has to implement this callback, which will be called when
 *    the threaded context wants to replace a resource's backing storage with
 *    another resource's backing storage. The threaded context uses it to
 *    implement buffer invalidation. This call is always queued.
 *    Note that 'minimum_num_rebinds' specifies only the minimum number of
 *    rebinds which must be managed by the driver; if a buffer is bound
 *    multiple times in the same binding point (e.g., vertex buffer slots
 *    0,1,2), this will be counted as a single rebind.
 *    A buffer which has had its backing storage replaced may have its backing
 *    storage accessed through multiple pipe_resources.
 *
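 * For illustration, a minimal replace_buffer_storage callback could be shaped
 * like this (a sketch; "mydrv_rebind_buffer" and "mydrv_screen" are
 * hypothetical):
 *
 *   static void
 *   mydrv_replace_buffer_storage(struct pipe_context *ctx,
 *                                struct pipe_resource *dst,
 *                                struct pipe_resource *src,
 *                                unsigned minimum_num_rebinds,
 *                                uint32_t rebind_mask,
 *                                uint32_t delete_buffer_id)
 *   {
 *      // Point dst at src's backing storage and rebind it where needed.
 *      mydrv_rebind_buffer(ctx, dst, src, minimum_num_rebinds, rebind_mask);
 *      // Release the old unique buffer ID handed back by the threaded context.
 *      util_idalloc_mt_free(&mydrv_screen(ctx->screen)->buffer_ids, delete_buffer_id);
 *   }
 *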
 *
 * Optional resource busy callbacks for better performance
 * --------------------------------------------------------
 *
 * This adds checking whether a resource is used by the GPU and whether
 * a resource is referenced by an unflushed command buffer. If neither is true,
 * the threaded context will map the buffer as UNSYNCHRONIZED without flushing
 * or synchronizing the thread and will skip any buffer invalidations
 * (reallocations) because invalidating an idle buffer has no benefit.
 *
 * There is one driver callback and one TC callback:
 *
 * 1) is_resource_busy: It returns true when a resource is busy. If this is
 *    NULL, the resource is considered always busy.
 *
 * 2) tc_driver_internal_flush_notify: If the driver sets
 *    driver_calls_flush_notify = true in threaded_context_create, it should
 *    call this after every internal driver flush. The threaded context uses it
 *    to track internal driver flushes for the purpose of tracking which
 *    buffers are referenced by an unflushed command buffer.
 *
 * If is_resource_busy is set, threaded_resource::buffer_id_unique must be
 * generated by the driver, and the replace_buffer_storage callback should
 * delete the buffer ID passed to it. The driver should use
 * util_idalloc_mt_init_tc.
 *
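 * For illustration (a sketch; the "mydrv_*" names are hypothetical):
 *
 *   static bool
 *   mydrv_is_resource_busy(struct pipe_screen *screen,
 *                          struct pipe_resource *resource,
 *                          unsigned usage)
 *   {
 *      // Return false only when the GPU and all unflushed command buffers
 *      // are provably done with the resource.
 *      return mydrv_resource_is_busy(screen, resource, usage);
 *   }
 *
 * Buffer IDs would then come from a util_idalloc_mt initialized with
 * util_idalloc_mt_init_tc, e.g.:
 *
 *   res->b.buffer_id_unique = util_idalloc_mt_alloc(&mydrv_screen->buffer_ids);
 *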
 *
 * How it works (queue architecture)
 * ---------------------------------
 *
 * There is a multithreaded queue consisting of batches, each batch containing
 * 8-byte slots. Calls can occupy 1 or more slots.
 *
 * Once a batch is full and there is no space for the next call, it's flushed,
 * meaning that it's added to the queue for execution in the other thread.
 * The batches are ordered in a ring and reused once they are idle again.
 * The batching is necessary for low queue/mutex overhead.
 */

#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H

#include "c11/threads.h"
#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/bitset.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/u_thread.h"
#include "util/slab.h"
#include "util/u_dynarray.h"

#ifdef __cplusplus
extern "C" {
#endif

struct threaded_context;
struct tc_unflushed_batch_token;

/* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
#define TC_DEBUG 0

/* This is an internal flag not sent to the driver. */
#define TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE       (1u << 28)
/* These are map flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED  (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE            (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC          (1u << 31)

/* Custom flush flags sent to drivers. */
/* fence is pre-populated with a fence created by the create_fence callback */
#define TC_FLUSH_ASYNC        (1u << 31)

/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage but large enough
 * so that the queue isn't stalled too often for not having enough idle batch
 * slots.
 */
#define TC_MAX_BATCHES        10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_SLOTS_PER_BATCH    1536

/* The buffer list queue is much deeper than the batch queue because buffer
 * lists need to stay around until the driver internally flushes its command
 * buffer.
 */
#define TC_MAX_BUFFER_LISTS   (TC_MAX_BATCHES * 4)

/* This mask is used to get a hash of a buffer ID. It's also the bit size of
 * the buffer list minus 1. It must be 2^n - 1. The size should be as low as
 * possible to minimize memory usage, but high enough to minimize hash
 * collisions.
 */
#define TC_BUFFER_ID_MASK     BITFIELD_MASK(14)

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES  512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, it will instead do:
 * - for buffers: DISCARD_RANGE is done by the threaded context
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES        320

enum tc_call_id {
#define CALL(name) TC_CALL_##name,
#include "u_threaded_context_calls.h"
#undef CALL
   TC_NUM_CALLS,
};

enum tc_binding_type {
   TC_BINDING_VERTEX_BUFFER,
   TC_BINDING_STREAMOUT_BUFFER,
   TC_BINDING_UBO_VS,
   TC_BINDING_UBO_FS,
   TC_BINDING_UBO_GS,
   TC_BINDING_UBO_TCS,
   TC_BINDING_UBO_TES,
   TC_BINDING_UBO_CS,
   TC_BINDING_SAMPLERVIEW_VS,
   TC_BINDING_SAMPLERVIEW_FS,
   TC_BINDING_SAMPLERVIEW_GS,
   TC_BINDING_SAMPLERVIEW_TCS,
   TC_BINDING_SAMPLERVIEW_TES,
   TC_BINDING_SAMPLERVIEW_CS,
   TC_BINDING_SSBO_VS,
   TC_BINDING_SSBO_FS,
   TC_BINDING_SSBO_GS,
   TC_BINDING_SSBO_TCS,
   TC_BINDING_SSBO_TES,
   TC_BINDING_SSBO_CS,
   TC_BINDING_IMAGE_VS,
   TC_BINDING_IMAGE_FS,
   TC_BINDING_IMAGE_GS,
   TC_BINDING_IMAGE_TCS,
   TC_BINDING_IMAGE_TES,
   TC_BINDING_IMAGE_CS,
};

typedef uint16_t (*tc_execute)(struct pipe_context *pipe, void *call);

typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src,
                                               unsigned minimum_num_rebinds,
                                               uint32_t rebind_mask,
                                               uint32_t delete_buffer_id);
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);
typedef bool (*tc_is_resource_busy)(struct pipe_screen *screen,
                                    struct pipe_resource *resource,
                                    unsigned usage);

struct threaded_resource {
   struct pipe_resource b;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for
    * unsynchronized mappings in the non-driver thread. Initially it's set
    * to &b.
    */
   struct pipe_resource *latest;

   /* Optional CPU storage of the buffer. When we get a partial glBufferSubData
    * (implemented by copy_buffer) + glDrawElements, we don't want to drain the
    * gfx pipeline before executing the copy. For ideal pipelining, we upload
    * to this CPU storage and then reallocate the GPU storage completely and
    * reupload everything without copy_buffer.
    */
   void *cpu_storage;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;
   bool allow_cpu_storage;

   /* Internal tag for TC indicating which batch last touched this resource. */
   int8_t last_batch_usage;
   /* For disambiguating last_batch_usage across batch cycles. */
   uint32_t batch_generation;

   /* Unique buffer ID. Drivers must set it to a unique non-zero value for
    * buffers. Textures must set it to 0. The low bits are used as a hash of
    * the ID. Use util_idalloc_mt to generate these IDs.
    */
   uint32_t buffer_id_unique;

   /* If positive, a staging transfer is in progress.
    */
   int pending_staging_uploads;

   /* If staging uploads are pending, this will hold the union of the mapped
    * ranges.
    */
   struct util_range pending_staging_uploads_range;
};

struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* If b.resource is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &b.resource->valid_buffer_range.
    */
   struct util_range *valid_buffer_range;

   bool cpu_storage_mapped;
};

struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query. */
   bool flushed;
};

struct tc_call_base {
#if !defined(NDEBUG) && TC_DEBUG >= 1
   uint32_t sentinel;
#endif
   uint16_t num_slots;
   uint16_t call_id;
};

struct tc_draw_single {
   struct tc_call_base base;
   unsigned index_bias;
   struct pipe_draw_info info;
};

/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   struct pipe_reference ref;
   struct threaded_context *tc;
};

struct tc_renderpass_info {
   union {
      struct {
         /* bitmask of full-cleared color buffers */
         uint8_t cbuf_clear;
         /* bitmask of not-full-cleared color buffers */
         uint8_t cbuf_load;
         /* bitmask of color buffers that have their stores invalidated */
         uint8_t cbuf_invalidate;
         /* whether the zsbuf is full-cleared */
         bool zsbuf_clear : 1;
         /* whether the zsbuf is partial-cleared */
         bool zsbuf_clear_partial : 1;
         /* whether the zsbuf is not-full-cleared */
         bool zsbuf_load : 1;
         /* whether the zsbuf is invalidated */
         bool zsbuf_invalidate : 1;
         /* whether a draw occurs */
         bool has_draw : 1;
         /* whether a framebuffer resolve occurs on cbuf[0] */
         bool has_resolve : 1;
         /* whether queries are ended during this renderpass */
         bool has_query_ends : 1;
         uint8_t pad : 1;
         /* 32 bits offset */
         /* bitmask of color buffers using fbfetch */
         uint8_t cbuf_fbfetch;
         /* whether the fragment shader writes to the zsbuf */
         bool zsbuf_write_fs : 1;
         /* whether the DSA state writes to the zsbuf */
         bool zsbuf_write_dsa : 1;
         /* whether the DSA state reads the zsbuf */
         bool zsbuf_read_dsa : 1;
         /* whether the zsbuf is used for fbfetch */
         bool zsbuf_fbfetch : 1;
         uint8_t pad2 : 4;
         uint16_t pad3;
      };
      uint64_t data;
      /* fb info is in data32[0] */
      uint32_t data32[2];
      /* cso info is in data16[2] */
      uint16_t data16[4];
      /* zsbuf fb info is in data8[3] */
      uint8_t data8[8];
   };
};

static inline bool
tc_renderpass_info_is_zsbuf_used(const struct tc_renderpass_info *info)
{
   return info->zsbuf_clear ||
          info->zsbuf_clear_partial ||
          info->zsbuf_write_fs ||
          info->zsbuf_write_dsa ||
          info->zsbuf_read_dsa ||
          info->zsbuf_fbfetch;
}

/* If a driver ends a renderpass early for some reason, this function can be
 * called to reset any stored renderpass info to a "safe" state that will
 * avoid data loss on framebuffer attachments.
 *
 * Note: ending a renderpass early while invalidate hints are applied will
 * result in data loss.
 */
static inline void
tc_renderpass_info_reset(struct tc_renderpass_info *info)
{
   info->data32[0] = 0;
   info->cbuf_load = BITFIELD_MASK(8);
   info->zsbuf_clear_partial = true;
   info->has_draw = true;
   info->has_query_ends = true;
}

struct tc_batch {
   struct threaded_context *tc;
#if !defined(NDEBUG) && TC_DEBUG >= 1
   unsigned sentinel;
#endif
   uint16_t num_total_slots;
   uint16_t buffer_list_index;
   /* the index of the current renderpass info for recording */
   int16_t renderpass_info_idx;
   uint16_t max_renderpass_info_idx;

   /* The last mergeable call that was added to this batch (i.e.
    * buffer subdata). This might be out-of-date or NULL.
    */
   struct tc_call_base *last_mergeable_call;

   struct util_queue_fence fence;
   /* whether the first set_framebuffer_state call has been seen by this batch */
   bool first_set_fb;
   uint8_t batch_idx;
   struct tc_unflushed_batch_token *token;
   uint64_t slots[TC_SLOTS_PER_BATCH];
   struct util_dynarray renderpass_infos;
};

struct tc_buffer_list {
   /* Signalled by the driver after it flushes its internal command buffer. */
   struct util_queue_fence driver_flushed_fence;

   /* Buffer list where bit N means whether ID hash N is in the list. */
   BITSET_DECLARE(buffer_list, TC_BUFFER_ID_MASK + 1);
};

/**
 * Optional TC parameters/callbacks.
 */
struct threaded_context_options {
   tc_create_fence_func create_fence;
   tc_is_resource_busy is_resource_busy;
   bool driver_calls_flush_notify;

   /**
    * If true, ctx->get_device_reset_status() will be called without
    * synchronizing with the driver thread. Drivers can enable this to avoid
    * TC syncs if their implementation of get_device_reset_status() is
    * safe to call without synchronizing with the driver thread.
    */
   bool unsynchronized_get_device_reset_status;

   /* If true, create_fence_fd doesn't access the context in the driver. */
   bool unsynchronized_create_fence_fd;
   /* If true, texture_subdata calls may occur unsynchronized with PIPE_MAP_UNSYNCHRONIZED. */
   bool unsynchronized_texture_subdata;
   /* If true, parse and track renderpass info during execution. */
   bool parse_renderpass_info;
   /* Callbacks for drivers to read their DSA/FS state and update renderpass
    * info accordingly.
    * Note: drivers must ONLY append to renderpass info using |=
    */
   void (*dsa_parse)(void *state, struct tc_renderpass_info *info);
   void (*fs_parse)(void *state, struct tc_renderpass_info *info);
};

struct tc_vertex_buffers {
   struct tc_call_base base;
   uint8_t count;
   struct pipe_vertex_buffer slot[0]; /* more will be allocated if needed */
};

struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage;
   struct threaded_context_options options;
   unsigned map_buffer_alignment;
   unsigned ubo_alignment;

   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   bool use_forced_staging_uploads;
   bool add_all_gfx_bindings_to_buffer_list;
   bool add_all_compute_bindings_to_buffer_list;
   uint8_t num_queries_active;

   /* Estimate of how many vram/gtt bytes are mmap'd in
    * the current tc_batch.
    */
   uint64_t bytes_mapped_estimate;
   uint64_t bytes_mapped_limit;

   /* Estimate of how many replaced-buffer bytes are in
    * the current tc_batch.
    */
   uint64_t bytes_replaced_estimate;
   uint64_t bytes_replaced_limit;

   struct util_queue queue;
   struct util_queue_fence *fence;

#ifndef NDEBUG
   /**
    * The driver thread is normally the queue thread, but
    * there are cases where the queue is flushed directly
    * from the frontend thread.
    */
   thrd_t driver_thread;
#endif

   bool seen_tcs;
   bool seen_tes;
   bool seen_gs;
   /* whether the current renderpass has seen a set_framebuffer_state call */
   bool seen_fb_state;
   /* whether a renderpass is currently active */
   bool in_renderpass;
   /* whether a query has ended more recently than a draw */
   bool query_ended;
   /* whether pipe_context::flush has been called */
   bool flushing;

   bool seen_streamout_buffers;
   bool seen_shader_buffers[PIPE_SHADER_TYPES];
   bool seen_image_buffers[PIPE_SHADER_TYPES];
   bool seen_sampler_buffers[PIPE_SHADER_TYPES];

   int8_t last_completed;

   uint8_t num_vertex_buffers;
   unsigned max_const_buffers;
   unsigned max_shader_buffers;
   unsigned max_images;
   unsigned max_samplers;
   unsigned nr_cbufs;

   unsigned last, next, next_buf_list, batch_generation;

   /* The list of fences that the driver should signal after the next flush.
    * If this is empty, all driver command buffers have been flushed.
    */
   struct util_queue_fence *signal_fences_next_flush[TC_MAX_BUFFER_LISTS];
   unsigned num_signal_fences_next_flush;

   /* Bound buffers are tracked here using threaded_resource::buffer_id_unique.
    * 0 means unbound.
    */
   uint32_t vertex_buffers[PIPE_MAX_ATTRIBS];
   uint32_t streamout_buffers[PIPE_MAX_SO_BUFFERS];
   uint32_t const_buffers[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t shader_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
   uint32_t image_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
   uint32_t shader_buffers_writeable_mask[PIPE_SHADER_TYPES];
   uint64_t image_buffers_writeable_mask[PIPE_SHADER_TYPES];
   uint32_t sampler_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];

   struct tc_batch batch_slots[TC_MAX_BATCHES];
   struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
   /* the current framebuffer attachments; [PIPE_MAX_COLOR_BUFS] is the zsbuf */
   struct pipe_resource *fb_resources[PIPE_MAX_COLOR_BUFS + 1];
   struct pipe_resource *fb_resolve;
   /* accessed by main thread; preserves info across batches */
   struct tc_renderpass_info *renderpass_info_recording;
   /* accessed by driver thread */
   struct tc_renderpass_info *renderpass_info;

   /* Callbacks that call pipe_context functions. */
   tc_execute execute_func[TC_NUM_CALLS];
};


void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
void tc_driver_internal_flush_notify(struct threaded_context *tc);

/** Function for getting the current renderpass info:
 * - renderpass info is always non-null
 *
 * Rules:
 * - threaded context must have been created with parse_renderpass_info=true
 * - must be called after the driver receives a pipe_context::set_framebuffer_state callback
 * - must be called after the driver receives a non-deferrable pipe_context::flush callback
 * - renderpass info must not be used during any internal driver operations (e.g., u_blitter)
 * - must not be called before the driver receives its first pipe_context::set_framebuffer_state callback
 * - renderpass info is invalidated only for non-deferrable flushes and new framebuffer states
 */
const struct tc_renderpass_info *
threaded_context_get_renderpass_info(struct threaded_context *tc);

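/* For illustration, a driver created with parse_renderpass_info=true might
 * consult this when it starts a renderpass (a sketch; "mydrv_ctx",
 * "cbuf_store_op" and "STORE_OP_DONT_CARE" are hypothetical):
 *
 *    const struct tc_renderpass_info *info =
 *       threaded_context_get_renderpass_info(mydrv_ctx->tc);
 *    if (info->cbuf_invalidate & BITFIELD_BIT(i))
 *       cbuf_store_op[i] = STORE_OP_DONT_CARE;
 */
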
struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        const struct threaded_context_options *options,
                        struct threaded_context **out);

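/* For illustration, a driver typically creates the wrapper right after its
 * real context (a sketch; "mydrv_ctx", "mydrv_screen" and "options" are
 * hypothetical):
 *
 *    struct threaded_context *tc;
 *    struct pipe_context *wrapped =
 *       threaded_context_create(&mydrv_ctx->base, &mydrv_screen->transfer_pool,
 *                               mydrv_replace_buffer_storage, &options, &tc);
 *
 * The frontend should use the returned context either way; if wrapping is
 * disabled or fails, the original context is expected to be returned as-is.
 */
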
void
threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned divisor);

void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);

struct tc_draw_single *
tc_add_draw_single_call(struct pipe_context *_pipe,
                        struct pipe_resource *index_bo);
struct pipe_vertex_buffer *
tc_add_set_vertex_buffers_call(struct pipe_context *_pipe, unsigned count);

void
tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
            unsigned drawid_offset,
            const struct pipe_draw_indirect_info *indirect,
            const struct pipe_draw_start_count_bias *draws,
            unsigned num_draws);

static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   return (struct threaded_context*)pipe;
}

static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   return (struct threaded_resource*)res;
}

static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   return (struct threaded_query*)q;
}

static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   return (struct threaded_transfer*)transfer;
}

static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
      free(*dst);
   *dst = src;
}

/**
 * Helper for !NDEBUG builds to assert that it is called from the driver
 * thread. This is to help drivers ensure that various code-paths
 * are not hit indirectly from pipe entry points that are called from
 * the front-end/state-tracker thread.
 */
static inline void
tc_assert_driver_thread(struct threaded_context *tc)
{
   if (!tc)
      return;
#ifndef NDEBUG
   assert(u_thread_is_self(tc->driver_thread));
#endif
}

/**
 * This is called before GPU stores to disable the CPU storage because
 * the CPU storage doesn't mirror the GPU storage.
 *
 * Drivers should also call it before exporting a DMABUF of a buffer.
 */
static inline void
tc_buffer_disable_cpu_storage(struct pipe_resource *buf)
{
   struct threaded_resource *tres = threaded_resource(buf);

   if (tres->cpu_storage) {
      align_free(tres->cpu_storage);
      tres->cpu_storage = NULL;
   }
   tres->allow_cpu_storage = false;
}

static inline void
tc_buffer_write(struct pipe_context *pipe,
                struct pipe_resource *buf,
                unsigned offset,
                unsigned size,
                const void *data)
{
   pipe->buffer_subdata(pipe, buf,
                        PIPE_MAP_WRITE | TC_TRANSFER_MAP_NO_INVALIDATE,
                        offset, size, data);
}

static inline struct tc_buffer_list *
tc_get_next_buffer_list(struct pipe_context *_pipe)
{
   struct threaded_context *tc = threaded_context(_pipe);

   return &tc->buffer_lists[tc->next_buf_list];
}

/* Set a buffer binding and add it to the buffer list. */
static inline void
tc_bind_buffer(uint32_t *binding, struct tc_buffer_list *next, struct pipe_resource *buf)
{
   uint32_t id = threaded_resource(buf)->buffer_id_unique;
   *binding = id;
   BITSET_SET(next->buffer_list, id & TC_BUFFER_ID_MASK);
}

/* Reset a buffer binding. */
static inline void
tc_unbind_buffer(uint32_t *binding)
{
   *binding = 0;
}

static inline void
tc_track_vertex_buffer(struct pipe_context *_pipe, unsigned index,
                       struct pipe_resource *buf,
                       struct tc_buffer_list *next_buffer_list)
{
   struct threaded_context *tc = threaded_context(_pipe);

   if (buf) {
      tc_bind_buffer(&tc->vertex_buffers[index], next_buffer_list, buf);
   } else {
      tc_unbind_buffer(&tc->vertex_buffers[index]);
   }
}

#ifdef __cplusplus
}
#endif

#endif