1 /*
2  * Copyright 2010 Jerome Glisse <[email protected]>
3  * Copyright 2014 Marek Olšák <[email protected]>
4  * Copyright 2018 Advanced Micro Devices, Inc.
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "si_query.h"
10 #include "si_build_pm4.h"
11 
12 #include "amd/common/sid.h"
13 #include "si_pipe.h"
14 #include "util/os_time.h"
15 #include "util/u_memory.h"
16 #include "util/u_suballoc.h"
17 #include "util/u_upload_mgr.h"
18 
19 static const struct si_query_ops hw_query_ops;
20 static const struct si_query_ops sw_query_ops;
21 
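/* Offsets of the begin/end values and the result fence within one hardware
 * query result sample; pair_stride/pair_count describe the per-RB or
 * per-stream repetition of that begin/end pair.
 */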
22 struct si_hw_query_params {
23    unsigned start_offset;
24    unsigned end_offset;
25    unsigned fence_offset;
26    unsigned pair_stride;
27    unsigned pair_count;
28 };
29 
30 /* Queries without buffer handling or suspend/resume. */
31 struct si_query_sw {
32    struct si_query b;
33 
34    uint64_t begin_result;
35    uint64_t end_result;
36 
37    uint64_t begin_time;
38    uint64_t end_time;
39 
40    /* Fence for GPU_FINISHED. */
41    struct pipe_fence_handle *fence;
42 };
43 
44 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
45 {
46    struct si_query_sw *query = (struct si_query_sw *)squery;
47 
48    sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
49    FREE(query);
50 }
51 
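/* Translate a driver-specific software query type to the winsys counter ID
 * read through the winsys query_value callback.
 */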
52 static enum radeon_value_id winsys_id_from_type(unsigned type)
53 {
54    switch (type) {
55    case SI_QUERY_REQUESTED_VRAM:
56       return RADEON_REQUESTED_VRAM_MEMORY;
57    case SI_QUERY_REQUESTED_GTT:
58       return RADEON_REQUESTED_GTT_MEMORY;
59    case SI_QUERY_MAPPED_VRAM:
60       return RADEON_MAPPED_VRAM;
61    case SI_QUERY_MAPPED_GTT:
62       return RADEON_MAPPED_GTT;
63    case SI_QUERY_SLAB_WASTED_VRAM:
64       return RADEON_SLAB_WASTED_VRAM;
65    case SI_QUERY_SLAB_WASTED_GTT:
66       return RADEON_SLAB_WASTED_GTT;
67    case SI_QUERY_BUFFER_WAIT_TIME:
68       return RADEON_BUFFER_WAIT_TIME_NS;
69    case SI_QUERY_NUM_MAPPED_BUFFERS:
70       return RADEON_NUM_MAPPED_BUFFERS;
71    case SI_QUERY_NUM_GFX_IBS:
72       return RADEON_NUM_GFX_IBS;
73    case SI_QUERY_GFX_BO_LIST_SIZE:
74       return RADEON_GFX_BO_LIST_COUNTER;
75    case SI_QUERY_GFX_IB_SIZE:
76       return RADEON_GFX_IB_SIZE_COUNTER;
77    case SI_QUERY_NUM_BYTES_MOVED:
78       return RADEON_NUM_BYTES_MOVED;
79    case SI_QUERY_NUM_EVICTIONS:
80       return RADEON_NUM_EVICTIONS;
81    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
82       return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
83    case SI_QUERY_VRAM_USAGE:
84       return RADEON_VRAM_USAGE;
85    case SI_QUERY_VRAM_VIS_USAGE:
86       return RADEON_VRAM_VIS_USAGE;
87    case SI_QUERY_GTT_USAGE:
88       return RADEON_GTT_USAGE;
89    case SI_QUERY_GPU_TEMPERATURE:
90       return RADEON_GPU_TEMPERATURE;
91    case SI_QUERY_CURRENT_GPU_SCLK:
92       return RADEON_CURRENT_SCLK;
93    case SI_QUERY_CURRENT_GPU_MCLK:
94       return RADEON_CURRENT_MCLK;
95    case SI_QUERY_CS_THREAD_BUSY:
96       return RADEON_CS_THREAD_TIME;
97    default:
98       unreachable("query type does not correspond to winsys id");
99    }
100 }
101 
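/* Begin a software query: snapshot the current value of the selected context,
 * screen, winsys, or GPU-load counter into begin_result (and begin_time for
 * rate-style queries).
 */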
102 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
103 {
104    struct si_query_sw *query = (struct si_query_sw *)squery;
105    enum radeon_value_id ws_id;
106 
107    switch (query->b.type) {
108    case PIPE_QUERY_TIMESTAMP_DISJOINT:
109    case PIPE_QUERY_GPU_FINISHED:
110       break;
111    case SI_QUERY_DRAW_CALLS:
112       query->begin_result = sctx->num_draw_calls;
113       break;
114    case SI_QUERY_DECOMPRESS_CALLS:
115       query->begin_result = sctx->num_decompress_calls;
116       break;
117    case SI_QUERY_COMPUTE_CALLS:
118       query->begin_result = sctx->num_compute_calls;
119       break;
120    case SI_QUERY_CP_DMA_CALLS:
121       query->begin_result = sctx->num_cp_dma_calls;
122       break;
123    case SI_QUERY_NUM_VS_FLUSHES:
124       query->begin_result = sctx->num_vs_flushes;
125       break;
126    case SI_QUERY_NUM_PS_FLUSHES:
127       query->begin_result = sctx->num_ps_flushes;
128       break;
129    case SI_QUERY_NUM_CS_FLUSHES:
130       query->begin_result = sctx->num_cs_flushes;
131       break;
132    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
133       query->begin_result = sctx->num_cb_cache_flushes;
134       break;
135    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
136       query->begin_result = sctx->num_db_cache_flushes;
137       break;
138    case SI_QUERY_NUM_L2_INVALIDATES:
139       query->begin_result = sctx->num_L2_invalidates;
140       break;
141    case SI_QUERY_NUM_L2_WRITEBACKS:
142       query->begin_result = sctx->num_L2_writebacks;
143       break;
144    case SI_QUERY_NUM_RESIDENT_HANDLES:
145       query->begin_result = sctx->num_resident_handles;
146       break;
147    case SI_QUERY_TC_OFFLOADED_SLOTS:
148       query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
149       break;
150    case SI_QUERY_TC_DIRECT_SLOTS:
151       query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
152       break;
153    case SI_QUERY_TC_NUM_SYNCS:
154       query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
155       break;
156    case SI_QUERY_REQUESTED_VRAM:
157    case SI_QUERY_REQUESTED_GTT:
158    case SI_QUERY_MAPPED_VRAM:
159    case SI_QUERY_MAPPED_GTT:
160    case SI_QUERY_SLAB_WASTED_VRAM:
161    case SI_QUERY_SLAB_WASTED_GTT:
162    case SI_QUERY_VRAM_USAGE:
163    case SI_QUERY_VRAM_VIS_USAGE:
164    case SI_QUERY_GTT_USAGE:
165    case SI_QUERY_GPU_TEMPERATURE:
166    case SI_QUERY_CURRENT_GPU_SCLK:
167    case SI_QUERY_CURRENT_GPU_MCLK:
168    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
169    case SI_QUERY_NUM_MAPPED_BUFFERS:
170       query->begin_result = 0;
171       break;
172    case SI_QUERY_BUFFER_WAIT_TIME:
173    case SI_QUERY_GFX_IB_SIZE:
174    case SI_QUERY_NUM_GFX_IBS:
175    case SI_QUERY_NUM_BYTES_MOVED:
176    case SI_QUERY_NUM_EVICTIONS:
177    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
178       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
179       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
180       break;
181    }
182    case SI_QUERY_GFX_BO_LIST_SIZE:
183       ws_id = winsys_id_from_type(query->b.type);
184       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
185       query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
186       break;
187    case SI_QUERY_CS_THREAD_BUSY:
188       ws_id = winsys_id_from_type(query->b.type);
189       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
190       query->begin_time = os_time_get_nano();
191       break;
192    case SI_QUERY_GALLIUM_THREAD_BUSY:
193       query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
194       query->begin_time = os_time_get_nano();
195       break;
196    case SI_QUERY_GPU_LOAD:
197    case SI_QUERY_GPU_SHADERS_BUSY:
198    case SI_QUERY_GPU_TA_BUSY:
199    case SI_QUERY_GPU_GDS_BUSY:
200    case SI_QUERY_GPU_VGT_BUSY:
201    case SI_QUERY_GPU_IA_BUSY:
202    case SI_QUERY_GPU_SX_BUSY:
203    case SI_QUERY_GPU_WD_BUSY:
204    case SI_QUERY_GPU_BCI_BUSY:
205    case SI_QUERY_GPU_SC_BUSY:
206    case SI_QUERY_GPU_PA_BUSY:
207    case SI_QUERY_GPU_DB_BUSY:
208    case SI_QUERY_GPU_CP_BUSY:
209    case SI_QUERY_GPU_CB_BUSY:
210    case SI_QUERY_GPU_SDMA_BUSY:
211    case SI_QUERY_GPU_PFP_BUSY:
212    case SI_QUERY_GPU_MEQ_BUSY:
213    case SI_QUERY_GPU_ME_BUSY:
214    case SI_QUERY_GPU_SURF_SYNC_BUSY:
215    case SI_QUERY_GPU_CP_DMA_BUSY:
216    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
217       query->begin_result = si_begin_counter(sctx->screen, query->b.type);
218       break;
219    case SI_QUERY_NUM_COMPILATIONS:
220       query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
221       break;
222    case SI_QUERY_NUM_SHADERS_CREATED:
223       query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
224       break;
225    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
226       query->begin_result = sctx->screen->live_shader_cache.hits;
227       break;
228    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
229       query->begin_result = sctx->screen->live_shader_cache.misses;
230       break;
231    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
232       query->begin_result = sctx->screen->num_memory_shader_cache_hits;
233       break;
234    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
235       query->begin_result = sctx->screen->num_memory_shader_cache_misses;
236       break;
237    case SI_QUERY_DISK_SHADER_CACHE_HITS:
238       query->begin_result = sctx->screen->num_disk_shader_cache_hits;
239       break;
240    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
241       query->begin_result = sctx->screen->num_disk_shader_cache_misses;
242       break;
243    case SI_QUERY_GPIN_ASIC_ID:
244    case SI_QUERY_GPIN_NUM_SIMD:
245    case SI_QUERY_GPIN_NUM_RB:
246    case SI_QUERY_GPIN_NUM_SPI:
247    case SI_QUERY_GPIN_NUM_SE:
248       break;
249    default:
250       unreachable("si_query_sw_begin: bad query type");
251    }
252 
253    return true;
254 }
255 
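/* End a software query: snapshot the counter into end_result. GPU_FINISHED
 * instead does a deferred flush and keeps the returned fence for get_result.
 */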
256 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
257 {
258    struct si_query_sw *query = (struct si_query_sw *)squery;
259    enum radeon_value_id ws_id;
260 
261    switch (query->b.type) {
262    case PIPE_QUERY_TIMESTAMP_DISJOINT:
263       break;
264    case PIPE_QUERY_GPU_FINISHED:
265       sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
266       break;
267    case SI_QUERY_DRAW_CALLS:
268       query->end_result = sctx->num_draw_calls;
269       break;
270    case SI_QUERY_DECOMPRESS_CALLS:
271       query->end_result = sctx->num_decompress_calls;
272       break;
273    case SI_QUERY_COMPUTE_CALLS:
274       query->end_result = sctx->num_compute_calls;
275       break;
276    case SI_QUERY_CP_DMA_CALLS:
277       query->end_result = sctx->num_cp_dma_calls;
278       break;
279    case SI_QUERY_NUM_VS_FLUSHES:
280       query->end_result = sctx->num_vs_flushes;
281       break;
282    case SI_QUERY_NUM_PS_FLUSHES:
283       query->end_result = sctx->num_ps_flushes;
284       break;
285    case SI_QUERY_NUM_CS_FLUSHES:
286       query->end_result = sctx->num_cs_flushes;
287       break;
288    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
289       query->end_result = sctx->num_cb_cache_flushes;
290       break;
291    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
292       query->end_result = sctx->num_db_cache_flushes;
293       break;
294    case SI_QUERY_NUM_L2_INVALIDATES:
295       query->end_result = sctx->num_L2_invalidates;
296       break;
297    case SI_QUERY_NUM_L2_WRITEBACKS:
298       query->end_result = sctx->num_L2_writebacks;
299       break;
300    case SI_QUERY_NUM_RESIDENT_HANDLES:
301       query->end_result = sctx->num_resident_handles;
302       break;
303    case SI_QUERY_TC_OFFLOADED_SLOTS:
304       query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
305       break;
306    case SI_QUERY_TC_DIRECT_SLOTS:
307       query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
308       break;
309    case SI_QUERY_TC_NUM_SYNCS:
310       query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
311       break;
312    case SI_QUERY_REQUESTED_VRAM:
313    case SI_QUERY_REQUESTED_GTT:
314    case SI_QUERY_MAPPED_VRAM:
315    case SI_QUERY_MAPPED_GTT:
316    case SI_QUERY_SLAB_WASTED_VRAM:
317    case SI_QUERY_SLAB_WASTED_GTT:
318    case SI_QUERY_VRAM_USAGE:
319    case SI_QUERY_VRAM_VIS_USAGE:
320    case SI_QUERY_GTT_USAGE:
321    case SI_QUERY_GPU_TEMPERATURE:
322    case SI_QUERY_CURRENT_GPU_SCLK:
323    case SI_QUERY_CURRENT_GPU_MCLK:
324    case SI_QUERY_BUFFER_WAIT_TIME:
325    case SI_QUERY_GFX_IB_SIZE:
326    case SI_QUERY_NUM_MAPPED_BUFFERS:
327    case SI_QUERY_NUM_GFX_IBS:
328    case SI_QUERY_NUM_BYTES_MOVED:
329    case SI_QUERY_NUM_EVICTIONS:
330    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
331       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
332       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
333       break;
334    }
335    case SI_QUERY_GFX_BO_LIST_SIZE:
336       ws_id = winsys_id_from_type(query->b.type);
337       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
338       query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
339       break;
340    case SI_QUERY_CS_THREAD_BUSY:
341       ws_id = winsys_id_from_type(query->b.type);
342       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
343       query->end_time = os_time_get_nano();
344       break;
345    case SI_QUERY_GALLIUM_THREAD_BUSY:
346       query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
347       query->end_time = os_time_get_nano();
348       break;
349    case SI_QUERY_GPU_LOAD:
350    case SI_QUERY_GPU_SHADERS_BUSY:
351    case SI_QUERY_GPU_TA_BUSY:
352    case SI_QUERY_GPU_GDS_BUSY:
353    case SI_QUERY_GPU_VGT_BUSY:
354    case SI_QUERY_GPU_IA_BUSY:
355    case SI_QUERY_GPU_SX_BUSY:
356    case SI_QUERY_GPU_WD_BUSY:
357    case SI_QUERY_GPU_BCI_BUSY:
358    case SI_QUERY_GPU_SC_BUSY:
359    case SI_QUERY_GPU_PA_BUSY:
360    case SI_QUERY_GPU_DB_BUSY:
361    case SI_QUERY_GPU_CP_BUSY:
362    case SI_QUERY_GPU_CB_BUSY:
363    case SI_QUERY_GPU_SDMA_BUSY:
364    case SI_QUERY_GPU_PFP_BUSY:
365    case SI_QUERY_GPU_MEQ_BUSY:
366    case SI_QUERY_GPU_ME_BUSY:
367    case SI_QUERY_GPU_SURF_SYNC_BUSY:
368    case SI_QUERY_GPU_CP_DMA_BUSY:
369    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
370       query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
371       query->begin_result = 0;
372       break;
373    case SI_QUERY_NUM_COMPILATIONS:
374       query->end_result = p_atomic_read(&sctx->screen->num_compilations);
375       break;
376    case SI_QUERY_NUM_SHADERS_CREATED:
377       query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
378       break;
379    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
380       query->end_result = sctx->last_tex_ps_draw_ratio;
381       break;
382    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
383       query->end_result = sctx->screen->live_shader_cache.hits;
384       break;
385    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
386       query->end_result = sctx->screen->live_shader_cache.misses;
387       break;
388    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
389       query->end_result = sctx->screen->num_memory_shader_cache_hits;
390       break;
391    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
392       query->end_result = sctx->screen->num_memory_shader_cache_misses;
393       break;
394    case SI_QUERY_DISK_SHADER_CACHE_HITS:
395       query->end_result = sctx->screen->num_disk_shader_cache_hits;
396       break;
397    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
398       query->end_result = sctx->screen->num_disk_shader_cache_misses;
399       break;
400    case SI_QUERY_GPIN_ASIC_ID:
401    case SI_QUERY_GPIN_NUM_SIMD:
402    case SI_QUERY_GPIN_NUM_RB:
403    case SI_QUERY_GPIN_NUM_SPI:
404    case SI_QUERY_GPIN_NUM_SE:
405       break;
406    default:
407       unreachable("si_query_sw_end: bad query type");
408    }
409 
410    return true;
411 }
412 
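/* Return the software query result, normally end_result - begin_result with a
 * per-type unit conversion; rate-style queries divide by the begin_time/end_time
 * delta captured at begin/end.
 */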
413 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
414                                    union pipe_query_result *result)
415 {
416    struct si_query_sw *query = (struct si_query_sw *)squery;
417 
418    switch (query->b.type) {
419    case PIPE_QUERY_TIMESTAMP_DISJOINT:
420       /* Convert from cycles per millisecond to cycles per second (Hz). */
421       result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
422       result->timestamp_disjoint.disjoint = false;
423       return true;
424    case PIPE_QUERY_GPU_FINISHED: {
425       struct pipe_screen *screen = sctx->b.screen;
426       struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
427 
428       result->b = screen->fence_finish(screen, ctx, query->fence, wait ? OS_TIMEOUT_INFINITE : 0);
429       return result->b;
430    }
431 
432    case SI_QUERY_GFX_BO_LIST_SIZE:
433       result->u64 =
434          (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
435       return true;
436    case SI_QUERY_CS_THREAD_BUSY:
437    case SI_QUERY_GALLIUM_THREAD_BUSY:
438       result->u64 =
439          (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
440       return true;
441    case SI_QUERY_GPIN_ASIC_ID:
442       result->u32 = 0;
443       return true;
444    case SI_QUERY_GPIN_NUM_SIMD:
445       result->u32 = sctx->screen->info.num_cu;
446       return true;
447    case SI_QUERY_GPIN_NUM_RB:
448       result->u32 = sctx->screen->info.max_render_backends;
449       return true;
450    case SI_QUERY_GPIN_NUM_SPI:
451       result->u32 = 1; /* all supported chips have one SPI per SE */
452       return true;
453    case SI_QUERY_GPIN_NUM_SE:
454       result->u32 = sctx->screen->info.max_se;
455       return true;
456    }
457 
458    result->u64 = query->end_result - query->begin_result;
459 
460    switch (query->b.type) {
461    case SI_QUERY_BUFFER_WAIT_TIME:
462    case SI_QUERY_GPU_TEMPERATURE:
463       result->u64 /= 1000;
464       break;
465    case SI_QUERY_CURRENT_GPU_SCLK:
466    case SI_QUERY_CURRENT_GPU_MCLK:
467       result->u64 *= 1000000;
468       break;
469    }
470 
471    return true;
472 }
473 
474 static struct pipe_query *si_query_sw_create(unsigned query_type)
475 {
476    struct si_query_sw *query;
477 
478    query = CALLOC_STRUCT(si_query_sw);
479    if (!query)
480       return NULL;
481 
482    query->b.type = query_type;
483    query->b.ops = &sw_query_ops;
484 
485    return (struct pipe_query *)query;
486 }
487 
488 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
489 {
490    struct si_query_buffer *prev = buffer->previous;
491 
492    /* Release all query buffers. */
493    while (prev) {
494       struct si_query_buffer *qbuf = prev;
495       prev = prev->previous;
496       si_resource_reference(&qbuf->buf, NULL);
497       FREE(qbuf);
498    }
499 
500    si_resource_reference(&buffer->buf, NULL);
501 }
502 
503 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
504 {
505    /* Discard all query buffers except for the oldest. */
506    while (buffer->previous) {
507       struct si_query_buffer *qbuf = buffer->previous;
508       buffer->previous = qbuf->previous;
509 
510       si_resource_reference(&buffer->buf, NULL);
511       buffer->buf = qbuf->buf; /* move ownership */
512       FREE(qbuf);
513    }
514    buffer->results_end = 0;
515 
516    if (!buffer->buf)
517       return;
518 
519    /* Discard even the oldest buffer if it can't be mapped without a stall. */
520    if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
521        !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
522       si_resource_reference(&buffer->buf, NULL);
523    }
524 }
525 
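/* Ensure the current query buffer has room for "size" more bytes of results;
 * otherwise push it onto the "previous" list and allocate a fresh buffer.
 */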
526 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
527                            bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
528                            unsigned size)
529 {
530    if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
531       if (buffer->buf) {
532          struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
533          memcpy(qbuf, buffer, sizeof(*qbuf));
534          buffer->previous = qbuf;
535       }
536       buffer->results_end = 0;
537 
538       /* Queries are normally read by the CPU after
539        * being written by the gpu, hence staging is probably a good
540        * usage pattern.
541        */
542       struct si_screen *screen = sctx->screen;
543       unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
544 
545       /* We need to bypass GL2 for queries if SET_PREDICATION accesses it uncached
546        * in a spinloop.
547        */
548       buffer->buf = si_aligned_buffer_create(&screen->b,
549                                               screen->info.cp_sdma_ge_use_system_memory_scope ?
550                                                  SI_RESOURCE_FLAG_GL2_BYPASS : 0,
551                                               PIPE_USAGE_STAGING, buf_size, 256);
552       if (unlikely(!buffer->buf))
553          return false;
554    }
555 
556    if (!buffer->results_end && prepare_buffer) {
557       if (unlikely(!prepare_buffer(sctx, buffer))) {
558          si_resource_reference(&buffer->buf, NULL);
559          return false;
560       }
561    }
562 
563    return true;
564 }
565 
566 static void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
567 {
568    struct si_query_hw *query = (struct si_query_hw *)squery;
569 
570    si_query_buffer_destroy(sctx->screen, &query->buffer);
571    si_resource_reference(&query->workaround_buf, NULL);
572    FREE(squery);
573 }
574 
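/* Zero-initialize a freshly allocated query buffer. For occlusion queries,
 * also pre-set the top (valid) bits of disabled render backends, which never
 * write results, so their samples still read back as complete.
 */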
575 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
576 {
577    struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
578    struct si_screen *screen = sctx->screen;
579 
580    /* The caller ensures that the buffer is currently unused by the GPU. */
581    uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
582                                               PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
583    if (!results)
584       return false;
585 
586    memset(results, 0, qbuf->buf->b.b.width0);
587 
588    if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
589        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
590        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
591       unsigned max_rbs = screen->info.max_render_backends;
592       uint64_t enabled_rb_mask = screen->info.enabled_rb_mask;
593       unsigned num_results;
594       unsigned i, j;
595 
596       /* Set top bits for unused backends. */
597       num_results = qbuf->buf->b.b.width0 / query->result_size;
598       for (j = 0; j < num_results; j++) {
599          for (i = 0; i < max_rbs; i++) {
600             if (!(enabled_rb_mask & (1ull << i))) {
601                results[(i * 4) + 1] = 0x80000000;
602                results[(i * 4) + 3] = 0x80000000;
603             }
604          }
605          results += 4 * max_rbs;
606       }
607    }
608 
609    return true;
610 }
611 
612 static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
613 {
614    return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
615 }
616 
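/* Dword offset of a pipeline-statistics counter inside one
 * SAMPLE_PIPELINESTAT sample (each counter is a 64-bit value).
 */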
617 static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
618 {
619    switch (index) {
620    case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
621    case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
622    case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
623    case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
624    case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
625    case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
626    case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
627    case PIPE_STAT_QUERY_IA_VERTICES: return 14;
628    case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
629    case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
630    case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
631    /* gfx11: MS_INVOCATIONS */
632    /* gfx11: MS_PRIMITIVES */
633    /* gfx11: TS_INVOCATIONS */
634    default:
635       assert(false);
636    }
637    return ~0;
638 }
639 
640 unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
641                                          enum pipe_statistics_query_index index)
642 {
643    return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
644 }
645 
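/* Create a hardware query: set the result buffer layout (result_size), the
 * command-stream space needed for suspend/resume, and type-specific flags.
 */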
646 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
647                                              unsigned index)
648 {
649    struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
650    if (!query)
651       return NULL;
652 
653    query->b.type = query_type;
654    query->b.ops = &hw_query_ops;
655 
656    switch (query_type) {
657    case PIPE_QUERY_OCCLUSION_COUNTER:
658    case PIPE_QUERY_OCCLUSION_PREDICATE:
659    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
660       query->result_size = 16 * sscreen->info.max_render_backends;
661       query->result_size += 16; /* for the fence + alignment */
662       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
663       break;
664    case PIPE_QUERY_TIME_ELAPSED:
665       query->result_size = 16;
666       query->result_size += 8; /* for fence */
667       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
668       break;
669    case PIPE_QUERY_TIMESTAMP:
670       query->result_size = 8;
671       query->result_size += 8; /* for fence */
672       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
673       query->flags = SI_QUERY_HW_FLAG_NO_START;
674       break;
675    case PIPE_QUERY_PRIMITIVES_EMITTED:
676    case PIPE_QUERY_PRIMITIVES_GENERATED:
677    case PIPE_QUERY_SO_STATISTICS:
678    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
679       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
680       /* the 64th bit in qw is used as fence. it is set by hardware in streamout stats event. */
681       query->result_size = 32;
682       query->b.num_cs_dw_suspend = 6;
683       query->stream = index;
684       break;
685    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
686       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
687       /* the 64th bit in qw is used as fence. it is set by hardware in streamout stats event. */
688       query->result_size = 32 * SI_MAX_STREAMS;
689       query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
690       break;
691    case PIPE_QUERY_PIPELINE_STATISTICS:
692       query->result_size = si_query_pipestats_num_results(sscreen) * 16;
693       query->result_size += 8; /* for the fence + alignment */
694       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
695       query->index = index;
696       if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
697           sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
698          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
699 
700       /* GFX11 only emulates PIPE_STAT_QUERY_GS_PRIMITIVES because the shader culls,
701        * which makes the statistic incorrect.
702        */
703       if (sscreen->info.gfx_level >= GFX11 && index == PIPE_STAT_QUERY_GS_PRIMITIVES)
704          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
705       break;
706    default:
707       assert(0);
708       FREE(query);
709       return NULL;
710    }
711 
712    return (struct pipe_query *)query;
713 }
714 
715 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
716 {
717    if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
718        type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
719       switch (type) {
720       case PIPE_QUERY_OCCLUSION_COUNTER:
721          sctx->num_integer_occlusion_queries += diff;
722          break;
723       case PIPE_QUERY_OCCLUSION_PREDICATE:
724          sctx->num_boolean_occlusion_queries += diff;
725          break;
726       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
727          sctx->num_conservative_occlusion_queries += diff;
728          break;
729       }
730 
731       assert(sctx->num_integer_occlusion_queries >= 0);
732       assert(sctx->num_boolean_occlusion_queries >= 0);
733       assert(sctx->num_conservative_occlusion_queries >= 0);
734 
735       enum si_occlusion_query_mode new_mode =
736          sctx->num_integer_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER :
737          sctx->num_boolean_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN :
738          sctx->num_conservative_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN :
739          SI_OCCLUSION_QUERY_MODE_DISABLE;
740 
741       /* Conservative queries are only available on gfx10+. On gfx11+, they perform worse
742        * with late Z, but not early Z. Instead of trying to detect late Z, never enable
743        * conservative queries to keep it simple. This is the recommended programming.
744        */
745       if (new_mode == SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN &&
746           (sctx->gfx_level < GFX10 || sctx->gfx_level >= GFX11))
747          new_mode = SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN;
748 
749       if (sctx->occlusion_query_mode != new_mode) {
750          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
751 
752          if (sctx->screen->info.has_out_of_order_rast &&
753              (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER) !=
754              (new_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER))
755             si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
756 
757          sctx->occlusion_query_mode = new_mode;
758       }
759    }
760 }
761 
762 static unsigned event_type_for_stream(unsigned stream)
763 {
764    switch (stream) {
765    default:
766    case 0:
767       return V_028A90_SAMPLE_STREAMOUTSTATS;
768    case 1:
769       return V_028A90_SAMPLE_STREAMOUTSTATS1;
770    case 2:
771       return V_028A90_SAMPLE_STREAMOUTSTATS2;
772    case 3:
773       return V_028A90_SAMPLE_STREAMOUTSTATS3;
774    }
775 }
776 
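/* Emit an EVENT_WRITE that makes the hardware dump the streamout statistics
 * of the given stream at "va".
 */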
777 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
778 {
779    radeon_begin(cs);
780    radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
781    radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
782    radeon_emit(va);
783    radeon_emit(va >> 32);
784    radeon_end();
785 }
786 
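/* Emit the packets that write the "begin" sample of a hardware query at "va". */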
787 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
788                                       struct si_resource *buffer, uint64_t va)
789 {
790    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
791 
792    switch (query->b.type) {
793    case PIPE_QUERY_OCCLUSION_COUNTER:
794    case PIPE_QUERY_OCCLUSION_PREDICATE:
795    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
796       radeon_begin(cs);
797       if (sctx->gfx_level >= GFX11 &&
798           sctx->screen->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
799          radeon_emit(PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
800       } else {
801          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
802          if (sctx->gfx_level >= GFX11)
803             radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
804          else
805             radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
806       }
807       radeon_emit(va);
808       radeon_emit(va >> 32);
809       radeon_end();
810       break;
811    }
812    case PIPE_QUERY_PRIMITIVES_EMITTED:
813    case PIPE_QUERY_PRIMITIVES_GENERATED:
814    case PIPE_QUERY_SO_STATISTICS:
815    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
816       emit_sample_streamout(cs, va, query->stream);
817       break;
818    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
819       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
820          emit_sample_streamout(cs, va + 32 * stream, stream);
821       break;
822    case PIPE_QUERY_TIME_ELAPSED:
823       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
824                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
825       break;
826    case PIPE_QUERY_PIPELINE_STATISTICS: {
827       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
828          /* The hw GS primitive counter doesn't work when ngg is active.
829           * So if use_ngg is true, we don't use the hw version but instead
830           * emulate it in the GS shader.
831           * The value is written at the same position, so we don't need to
832           * change anything else.
833           * If ngg is enabled for the draw, the primitive count is written in
834           * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
835           * vertices is stored in gs_emitted_vertices and the number of prim
836           * is computed based on the output prim type in emit_gs_epilogue.
837           */
838          struct pipe_shader_buffer sbuf;
839          sbuf.buffer = &buffer->b.b;
840          sbuf.buffer_offset = query->buffer.results_end;
841          sbuf.buffer_size = buffer->bo_size;
842          si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
843          SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);
844 
845          const uint32_t zero = 0;
846          radeon_begin(cs);
847          /* Clear the emulated counter end value. We don't clear start because it's unused. */
848          va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
849          radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
850          radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
851          radeon_emit(va);
852          radeon_emit(va >> 32);
853          radeon_emit(zero);
854          radeon_end();
855 
856          sctx->num_pipeline_stat_emulated_queries++;
857       } else {
858          radeon_begin(cs);
859          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
860          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
861          radeon_emit(va);
862          radeon_emit(va >> 32);
863          radeon_end();
864       }
865       break;
866    }
867    default:
868       assert(0);
869    }
870    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
871                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
872 }
873 
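/* Track the number of active pipeline-statistics and streamout queries and
 * toggle the PIPELINESTAT_START/STOP barrier events when that count
 * transitions between zero and non-zero.
 */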
874 static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type, int diff)
875 {
876    if (type == PIPE_QUERY_PIPELINE_STATISTICS ||
877        /* All streamout queries: */
878        type == PIPE_QUERY_PRIMITIVES_GENERATED ||
879        type == PIPE_QUERY_PRIMITIVES_EMITTED ||
880        type == PIPE_QUERY_SO_STATISTICS ||
881        type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
882        type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
883       if (type == PIPE_QUERY_PIPELINE_STATISTICS)
884          sctx->num_pipeline_stat_queries += diff;
885 
886       /* Increment for pipeline statistics and streamout queries. */
887       sctx->num_hw_pipestat_streamout_queries += diff;
888 
889       /* Enable/disable pipeline stats if we have any queries. */
890       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
891          sctx->barrier_flags &= ~SI_BARRIER_EVENT_PIPELINESTAT_STOP;
892          sctx->barrier_flags |= SI_BARRIER_EVENT_PIPELINESTAT_START;
893          si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
894       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
895          sctx->barrier_flags &= ~SI_BARRIER_EVENT_PIPELINESTAT_START;
896          sctx->barrier_flags |= SI_BARRIER_EVENT_PIPELINESTAT_STOP;
897          si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
898       }
899    }
900 }
901 
902 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
903 {
904    uint64_t va;
905 
906    if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
907       si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);
908 
909    /* Don't realloc pipeline_stats_query_buf */
910    if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
911        !si_query_buffer_alloc(sctx, &query->buffer, si_query_hw_prepare_buffer, query->result_size))
912       return;
913 
914    if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
915       si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);
916 
917    si_update_occlusion_query_state(sctx, query->b.type, 1);
918    si_update_prims_generated_query_state(sctx, query->b.type, 1);
919    si_update_hw_pipeline_stats(sctx, query->b.type, 1);
920 
921    si_need_gfx_cs_space(sctx, 0);
922 
923    va = query->buffer.buf->gpu_address + query->buffer.results_end;
924    si_query_hw_do_emit_start(sctx, query, query->buffer.buf, va);
925 }
926 
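/* Emit the packets that write the "end" sample of a hardware query, plus an
 * EOP fence for query types whose results don't get a valid bit written by
 * the hardware itself.
 */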
927 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
928                                      struct si_resource *buffer, uint64_t va)
929 {
930    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
931    uint64_t fence_va = 0;
932 
933    switch (query->b.type) {
934    case PIPE_QUERY_OCCLUSION_COUNTER:
935    case PIPE_QUERY_OCCLUSION_PREDICATE:
936    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
937       fence_va = va + sctx->screen->info.max_render_backends * 16;
938       va += 8;
939       radeon_begin(cs);
940       if (sctx->gfx_level >= GFX11 &&
941           sctx->screen->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
942          radeon_emit(PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
943       } else {
944          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
945          if (sctx->gfx_level >= GFX11)
946             radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
947          else
948             radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
949       }
950       radeon_emit(va);
951       radeon_emit(va >> 32);
952       radeon_end();
953       break;
954    }
955    case PIPE_QUERY_PRIMITIVES_EMITTED:
956    case PIPE_QUERY_PRIMITIVES_GENERATED:
957    case PIPE_QUERY_SO_STATISTICS:
958    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
959       va += 16;
960       emit_sample_streamout(cs, va, query->stream);
961       break;
962    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
963       va += 16;
964       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
965          emit_sample_streamout(cs, va + 32 * stream, stream);
966       break;
967    case PIPE_QUERY_TIME_ELAPSED:
968       va += 8;
969       FALLTHROUGH;
970    case PIPE_QUERY_TIMESTAMP:
971       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
972                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
973       fence_va = va + 8;
974       break;
975    case PIPE_QUERY_PIPELINE_STATISTICS: {
976       unsigned sample_size = (query->result_size - 8) / 2;
977 
978       va += sample_size;
979       fence_va = va + sample_size;
980 
981       radeon_begin(cs);
982       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
983          radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
984 
985          if (--sctx->num_pipeline_stat_emulated_queries == 0) {
986             si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
987             SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
988          }
989       } else {
990          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
991          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
992          radeon_emit(va);
993          radeon_emit(va >> 32);
994       }
995       radeon_end();
996       break;
997    }
998    default:
999       assert(0);
1000    }
1001    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
1002                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
1003 
1004    if (fence_va) {
1005       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
1006                         EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
1007                         query->b.type);
1008    }
1009 }
1010 
1011 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
1012 {
1013    uint64_t va;
1014 
1015    /* The queries which need begin already called this in begin_query. */
1016    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1017       si_need_gfx_cs_space(sctx, 0);
1018       if (!si_query_buffer_alloc(sctx, &query->buffer, si_query_hw_prepare_buffer,
1019                                  query->result_size))
1020          return;
1021    }
1022 
1023    if (!query->buffer.buf)
1024       return; // previous buffer allocation failure
1025 
1026    /* emit end query */
1027    va = query->buffer.buf->gpu_address + query->buffer.results_end;
1028 
1029    si_query_hw_do_emit_stop(sctx, query, query->buffer.buf, va);
1030 
1031    query->buffer.results_end += query->result_size;
1032 
1033    si_update_occlusion_query_state(sctx, query->b.type, -1);
1034    si_update_prims_generated_query_state(sctx, query->b.type, -1);
1035    si_update_hw_pipeline_stats(sctx, query->b.type, -1);
1036 }
1037 
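/* Emit a SET_PREDICATION packet that points the CP at the query result at "va". */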
1038 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
1039                                uint32_t op)
1040 {
1041    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
1042 
1043    radeon_begin(cs);
1044 
1045    if (ctx->gfx_level >= GFX9) {
1046       radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
1047       radeon_emit(op);
1048       radeon_emit(va);
1049       radeon_emit(va >> 32);
1050    } else {
1051       radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
1052       radeon_emit(va);
1053       radeon_emit(op | ((va >> 32) & 0xFF));
1054    }
1055    radeon_end();
1056 
1057    radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ | RADEON_PRIO_QUERY);
1058 }
1059 
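/* Emit the render-condition predication packets, chaining one SET_PREDICATION
 * per result sample (and per stream) using the CONTINUE bit.
 */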
1060 static void si_emit_query_predication(struct si_context *ctx, unsigned index)
1061 {
1062    uint32_t op;
1063    bool flag_wait, invert;
1064 
1065    struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
1066    if (!query)
1067       return;
1068 
1069    invert = ctx->render_cond_invert;
1070    flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
1071                ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
1072 
1073    if (ctx->gfx_level >= GFX11 && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1074                                    query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
1075       struct gfx11_sh_query *gfx10_query = (struct gfx11_sh_query *)query;
1076       struct gfx11_sh_query_buffer *qbuf, *first, *last;
1077 
1078       op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1079 
1080       /* if true then invert, see GL_ARB_conditional_render_inverted */
1081       if (!invert)
1082          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1083       else
1084          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1085 
1086       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1087 
1088       first = gfx10_query->first;
1089       last = gfx10_query->last;
1090 
1091       while (first) {
1092          qbuf = first;
1093          if (first != last)
1094             first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
1095          else
1096             first = NULL;
1097 
1098          unsigned results_base = gfx10_query->first_begin;
1099          uint64_t va_base = qbuf->buf->gpu_address;
1100          uint64_t va = va_base + results_base;
1101 
1102          unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
1103          unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
1104 
1105          unsigned count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
1106          do {
1107             if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1108                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1109                   emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
1110 
1111                   /* set CONTINUE bit for all packets except the first */
1112                   op |= PREDICATION_CONTINUE;
1113                }
1114             } else {
1115                emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
1116                op |= PREDICATION_CONTINUE;
1117             }
1118 
1119             results_base += sizeof(struct gfx11_sh_query_buffer_mem);
1120          } while (count--);
1121       }
1122    } else {
1123       struct si_query_buffer *qbuf;
1124 
1125       if (query->workaround_buf) {
1126          op = PRED_OP(PREDICATION_OP_BOOL64);
1127       } else {
1128          switch (query->b.type) {
1129          case PIPE_QUERY_OCCLUSION_COUNTER:
1130          case PIPE_QUERY_OCCLUSION_PREDICATE:
1131          case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1132             op = PRED_OP(PREDICATION_OP_ZPASS);
1133             break;
1134          case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1135          case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1136             op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1137             invert = !invert;
1138             break;
1139          default:
1140             assert(0);
1141             return;
1142          }
1143       }
1144 
1145       /* if true then invert, see GL_ARB_conditional_render_inverted */
1146       if (invert)
1147          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1148       else
1149          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1150 
1151       /* Use the value written by compute shader as a workaround. Note that
1152        * the wait flag does not apply in this predication mode.
1153        *
1154        * The shader outputs the result value to L2. Workarounds only affect GFX8
1155        * and later, where the CP reads data from L2, so we don't need an
1156        * additional flush.
1157        */
1158       if (query->workaround_buf) {
1159          uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1160          emit_set_predicate(ctx, query->workaround_buf, va, op);
1161          return;
1162       }
1163 
1164       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1165 
1166       /* emit predicate packets for all data blocks */
1167       for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1168          unsigned results_base = 0;
1169          uint64_t va_base = qbuf->buf->gpu_address;
1170 
1171          while (results_base < qbuf->results_end) {
1172             uint64_t va = va_base + results_base;
1173 
1174             if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1175                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1176                   emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1177 
1178                   /* set CONTINUE bit for all packets except the first */
1179                   op |= PREDICATION_CONTINUE;
1180                }
1181             } else {
1182                emit_set_predicate(ctx, qbuf->buf, va, op);
1183                op |= PREDICATION_CONTINUE;
1184             }
1185 
1186             results_base += query->result_size;
1187          }
1188       }
1189    }
1190 }
1191 
1192 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1193                                           unsigned index)
1194 {
1195    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1196 
1197    if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1198        (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
1199       return si_query_sw_create(query_type);
1200 
1201    if (sscreen->info.gfx_level >= GFX11 &&
1202        (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1203         query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1204         query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1205         query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1206       return gfx11_sh_query_create(sscreen, query_type, index);
1207 
1208    return si_query_hw_create(sscreen, query_type, index);
1209 }
1210 
1211 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1212 {
1213    struct si_context *sctx = (struct si_context *)ctx;
1214    struct si_query *squery = (struct si_query *)query;
1215 
1216    squery->ops->destroy(sctx, squery);
1217 }
1218 
1219 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1220 {
1221    struct si_context *sctx = (struct si_context *)ctx;
1222    struct si_query *squery = (struct si_query *)query;
1223 
1224    return squery->ops->begin(sctx, squery);
1225 }
1226 
1227 static bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1228 {
1229    struct si_query_hw *query = (struct si_query_hw *)squery;
1230 
1231    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1232       assert(0);
1233       return false;
1234    }
1235 
1236    if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1237       si_query_buffer_reset(sctx, &query->buffer);
1238 
1239    si_resource_reference(&query->workaround_buf, NULL);
1240 
1241    si_query_hw_emit_start(sctx, query);
1242    if (!query->buffer.buf)
1243       return false;
1244 
1245    list_addtail(&query->b.active_list, &sctx->active_queries);
1246    sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1247    return true;
1248 }
1249 
1250 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1251 {
1252    struct si_context *sctx = (struct si_context *)ctx;
1253    struct si_query *squery = (struct si_query *)query;
1254 
1255    return squery->ops->end(sctx, squery);
1256 }
1257 
1258 static bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1259 {
1260    struct si_query_hw *query = (struct si_query_hw *)squery;
1261 
1262    if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1263       si_query_buffer_reset(sctx, &query->buffer);
1264 
1265    si_query_hw_emit_stop(sctx, query);
1266 
1267    if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1268       list_delinit(&query->b.active_list);
1269       sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1270    }
1271 
1272    if (!query->buffer.buf)
1273       return false;
1274 
1275    return true;
1276 }
1277 
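/* Fill the offsets of the begin/end values and the fence within one result
 * sample for this query type, as consumed when results are resolved by a shader.
 */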
1278 static void si_get_hw_query_result_shader_params(struct si_context *sctx,
1279                                                  struct si_query_hw *squery, int index,
1280                                                  struct si_hw_query_params *params)
1281 {
1282    unsigned max_rbs = sctx->screen->info.max_render_backends;
1283 
1284    params->pair_stride = 0;
1285    params->pair_count = 1;
1286 
1287    switch (squery->b.type) {
1288    case PIPE_QUERY_OCCLUSION_COUNTER:
1289    case PIPE_QUERY_OCCLUSION_PREDICATE:
1290    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1291       params->start_offset = 0;
1292       params->end_offset = 8;
1293       params->fence_offset = max_rbs * 16;
1294       params->pair_stride = 16;
1295       params->pair_count = max_rbs;
1296       break;
1297    case PIPE_QUERY_TIME_ELAPSED:
1298       params->start_offset = 0;
1299       params->end_offset = 8;
1300       params->fence_offset = 16;
1301       break;
1302    case PIPE_QUERY_TIMESTAMP:
1303       params->start_offset = 0;
1304       params->end_offset = 0;
1305       params->fence_offset = 8;
1306       break;
1307    case PIPE_QUERY_PRIMITIVES_EMITTED:
1308       params->start_offset = 8;
1309       params->end_offset = 24;
1310       params->fence_offset = params->end_offset + 4;
1311       break;
1312    case PIPE_QUERY_PRIMITIVES_GENERATED:
1313       params->start_offset = 0;
1314       params->end_offset = 16;
1315       params->fence_offset = params->end_offset + 4;
1316       break;
1317    case PIPE_QUERY_SO_STATISTICS:
1318       params->start_offset = 8 - index * 8;
1319       params->end_offset = 24 - index * 8;
1320       params->fence_offset = params->end_offset + 4;
1321       break;
1322    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1323       params->pair_count = SI_MAX_STREAMS;
1324       params->pair_stride = 32;
1325       FALLTHROUGH;
1326    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1327       params->start_offset = 0;
1328       params->end_offset = 16;
1329 
1330       /* We can re-use the high dword of the last 64-bit value as a
1331        * fence: it is initialized as 0, and the high bit is set by
1332        * the write of the streamout stats event.
1333        */
1334       params->fence_offset = squery->result_size - 4;
1335       break;
1336    case PIPE_QUERY_PIPELINE_STATISTICS: {
1337       params->start_offset = si_query_pipestat_dw_offset(index) * 4;
1338       params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
1339       params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
1340       break;
1341    }
1342    default:
1343       unreachable("si_get_hw_query_params unsupported");
1344    }
1345 }
1346 
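/* Read a begin/end pair of 64-bit counters from the mapped buffer and return
 * end - start; with test_status_bit, return 0 unless both values have their
 * valid (top) bit set.
 */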
1347 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1348                                      bool test_status_bit)
1349 {
1350    uint32_t *current_result = (uint32_t *)map;
1351    uint64_t start, end;
1352 
1353    start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1354    end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1355 
1356    if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1357       return end - start;
1358    }
1359    return 0;
1360 }
1361 
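/* Accumulate one result sample from the mapped query buffer into "result",
 * according to the query type.
 */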
1362 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1363                                    void *buffer, union pipe_query_result *result)
1364 {
1365    unsigned max_rbs = sscreen->info.max_render_backends;
1366 
1367    switch (query->b.type) {
1368    case PIPE_QUERY_OCCLUSION_COUNTER: {
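           /* Each render backend writes its own begin/end pair in a 16-byte slot;
            * sum the per-RB deltas. */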
1369       for (unsigned i = 0; i < max_rbs; ++i) {
1370          unsigned results_base = i * 16;
1371          result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1372       }
1373       break;
1374    }
1375    case PIPE_QUERY_OCCLUSION_PREDICATE:
1376    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1377       for (unsigned i = 0; i < max_rbs; ++i) {
1378          unsigned results_base = i * 16;
1379          result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1380       }
1381       break;
1382    }
1383    case PIPE_QUERY_TIME_ELAPSED:
1384       result->u64 += si_query_read_result(buffer, 0, 2, false);
1385       break;
1386    case PIPE_QUERY_TIMESTAMP:
1387       result->u64 = *(uint64_t *)buffer;
1388       break;
1389    case PIPE_QUERY_PRIMITIVES_EMITTED:
1390       /* SAMPLE_STREAMOUTSTATS stores this structure:
1391        * {
1392        *    u64 NumPrimitivesWritten;
1393        *    u64 PrimitiveStorageNeeded;
1394        * }
1395        * We only need NumPrimitivesWritten here. */
1396       result->u64 += si_query_read_result(buffer, 2, 6, true);
1397       break;
1398    case PIPE_QUERY_PRIMITIVES_GENERATED:
1399       /* Here we read PrimitiveStorageNeeded. */
1400       result->u64 += si_query_read_result(buffer, 0, 4, true);
1401       break;
1402    case PIPE_QUERY_SO_STATISTICS:
1403       result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1404       result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1405       break;
1406    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1407       result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1408                                   si_query_read_result(buffer, 0, 4, true);
1409       break;
1410    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
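           /* Check every streamout stream; each stream's stats occupy 32 bytes. */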
1411       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1412          result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1413                                      si_query_read_result(buffer, 0, 4, true);
1414          buffer = (char *)buffer + 32;
1415       }
1416       break;
1417    case PIPE_QUERY_PIPELINE_STATISTICS:
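           /* Accumulate all 11 pipeline-statistics counters using the per-counter
            * dword offsets provided by the pipestat helpers. */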
1418       for (int i = 0; i < 11; i++) {
1419          result->pipeline_statistics.counters[i] +=
1420             si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
1421                                  si_query_pipestat_end_dw_offset(sscreen, i), false);
1422       }
1423 #if 0 /* for testing */
1424       printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1425              "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1426              "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1427              result->pipeline_statistics.ia_vertices,
1428              result->pipeline_statistics.ia_primitives,
1429              result->pipeline_statistics.vs_invocations,
1430              result->pipeline_statistics.hs_invocations,
1431              result->pipeline_statistics.ds_invocations,
1432              result->pipeline_statistics.gs_invocations,
1433              result->pipeline_statistics.gs_primitives,
1434              result->pipeline_statistics.c_invocations,
1435              result->pipeline_statistics.c_primitives,
1436              result->pipeline_statistics.ps_invocations,
1437              result->pipeline_statistics.cs_invocations);
1438 #endif
1439       break;
1440    default:
1441       assert(0);
1442    }
1443 }
1444 
1445 static void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1446 {
1447    si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1448 }
1449 
1450 static void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1451 {
1452    si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1453 }
1454 
1455 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1456                                 union pipe_query_result *result)
1457 {
1458    struct si_context *sctx = (struct si_context *)ctx;
1459    struct si_query *squery = (struct si_query *)query;
1460 
1461    return squery->ops->get_result(sctx, squery, wait, result);
1462 }
1463 
1464 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1465                                          enum pipe_query_flags flags, enum pipe_query_value_type result_type,
1466                                          int index, struct pipe_resource *resource, unsigned offset)
1467 {
1468    struct si_context *sctx = (struct si_context *)ctx;
1469    struct si_query *squery = (struct si_query *)query;
1470 
1471    squery->ops->get_result_resource(sctx, squery, flags, result_type, index, resource, offset);
1472 }
1473 
1474 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1475 {
1476    util_query_clear_result(result, query->b.type);
1477 }
1478 
1479 static bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1480                                    union pipe_query_result *result)
1481 {
1482    struct si_screen *sscreen = sctx->screen;
1483    struct si_query_hw *query = (struct si_query_hw *)squery;
1484    struct si_query_buffer *qbuf;
1485 
1486    si_query_hw_clear_result(query, result);
1487 
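        /* Walk the chain of result buffers (most recent first) and accumulate
         * every result slot that has been written. */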
1488    for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1489       unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1490       unsigned results_base = 0;
1491       void *map;
1492 
1493       if (squery->b.flushed)
1494          map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1495       else
1496          map = si_buffer_map(sctx, qbuf->buf, usage);
1497 
1498       if (!map)
1499          return false;
1500 
1501       while (results_base != qbuf->results_end) {
1502          si_query_hw_add_result(sscreen, query, map + results_base, result);
1503          results_base += query->result_size;
1504       }
1505    }
1506 
1507    /* Convert GPU ticks to nanoseconds (clock_crystal_freq is in kHz). */
1508    if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1509        squery->type == PIPE_QUERY_TIMESTAMP) {
1510       result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1511    }
1512    return true;
1513 }
1514 
1515 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1516                                             enum pipe_query_flags flags,
1517                                             enum pipe_query_value_type result_type,
1518                                             int index, struct pipe_resource *resource,
1519                                             unsigned offset)
1520 {
1521    struct si_query_hw *query = (struct si_query_hw *)squery;
1522    struct si_query_buffer *qbuf;
1523    struct si_query_buffer *qbuf_prev;
1524    struct pipe_resource *tmp_buffer = NULL;
1525    unsigned tmp_buffer_offset = 0;
1526    struct si_qbo_state saved_state = {};
1527    struct pipe_grid_info grid = {};
1528    struct pipe_constant_buffer constant_buffer = {};
1529    struct pipe_shader_buffer ssbo[3];
1530    struct si_hw_query_params params;
1531    struct {
1532       uint32_t end_offset;
1533       uint32_t result_stride;
1534       uint32_t result_count;
1535       uint32_t config;
1536       uint32_t fence_offset;
1537       uint32_t pair_stride;
1538       uint32_t pair_count;
1539    } consts;
1540 
1541    if (!sctx->query_result_shader) {
1542       sctx->query_result_shader = si_create_query_result_cs(sctx);
1543       if (!sctx->query_result_shader)
1544          return;
1545    }
1546 
1547    if (query->buffer.previous) {
1548       u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1549       if (!tmp_buffer)
1550          return;
1551    }
1552 
1553    si_save_qbo_state(sctx, &saved_state);
1554 
1555    si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
1556    consts.end_offset = params.end_offset - params.start_offset;
1557    consts.fence_offset = params.fence_offset - params.start_offset;
1558    consts.result_stride = query->result_size;
1559    consts.pair_stride = params.pair_stride;
1560    consts.pair_count = params.pair_count;
1561 
1562    constant_buffer.buffer_size = sizeof(consts);
1563    constant_buffer.user_buffer = &consts;
1564 
1565    ssbo[1].buffer = tmp_buffer;
1566    ssbo[1].buffer_offset = tmp_buffer_offset;
1567    ssbo[1].buffer_size = 16;
1568 
1569    ssbo[2] = ssbo[1];
1570 
1571    grid.block[0] = 1;
1572    grid.block[1] = 1;
1573    grid.block[2] = 1;
1574    grid.grid[0] = 1;
1575    grid.grid[1] = 1;
1576    grid.grid[2] = 1;
1577 
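        /* The config bits below are interpreted by the result compute shader
         * created in si_create_query_result_cs(); they select boolean/predicate
         * handling, timestamp handling, the output value width, and how chained
         * buffers are combined. */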
1578    consts.config = 0;
1579    if (index < 0)
1580       consts.config |= 4;
1581    if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1582        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1583       consts.config |= 8;
1584    else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1585             query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1586       consts.config |= 8 | 256;
1587    else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1588       consts.config |= 32;
1589 
1590    switch (result_type) {
1591    case PIPE_QUERY_TYPE_U64:
1592    case PIPE_QUERY_TYPE_I64:
1593       consts.config |= 64;
1594       break;
1595    case PIPE_QUERY_TYPE_I32:
1596       consts.config |= 128;
1597       break;
1598    case PIPE_QUERY_TYPE_U32:
1599       break;
1600    }
1601 
1602    sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM |
1603                           (sctx->gfx_level <= GFX8 ? SI_BARRIER_INV_L2 : 0);
1604    si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1605 
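        /* Dispatch the result shader once per buffer in the chain (newest first).
         * Bits 0-1 of config tell the shader where this pass sits in the chain;
         * for timestamps only the most recent result is read. */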
1606    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1607       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1608          qbuf_prev = qbuf->previous;
1609          consts.result_count = qbuf->results_end / query->result_size;
1610          consts.config &= ~3;
1611          if (qbuf != &query->buffer)
1612             consts.config |= 1;
1613          if (qbuf->previous)
1614             consts.config |= 2;
1615       } else {
1616          /* Only read the last timestamp. */
1617          qbuf_prev = NULL;
1618          consts.result_count = 0;
1619          consts.config |= 16;
1620          params.start_offset += qbuf->results_end - query->result_size;
1621       }
1622 
1623       sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1624 
1625       ssbo[0].buffer = &qbuf->buf->b.b;
1626       ssbo[0].buffer_offset = params.start_offset;
1627       ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1628 
1629       if (!qbuf->previous) {
1630          ssbo[2].buffer = resource;
1631          ssbo[2].buffer_offset = offset;
1632          ssbo[2].buffer_size = resource->width0 - offset;
1633       }
1634 
1635       if ((flags & PIPE_QUERY_WAIT) && qbuf == &query->buffer) {
1636          uint64_t va;
1637 
1638          /* Wait for result availability. Wait only for readiness
1639           * of the last entry, since the fence writes should be
1640           * serialized in the CP.
1641           */
1642          va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1643          va += params.fence_offset;
1644 
1645          si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1646       }
1647 
1648       unsigned writable_bitmask = 0x4;
1649 
1650       si_barrier_before_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
1651       si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1652                                     3, ssbo, writable_bitmask, false);
1653       si_barrier_after_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
1654    }
1655 
1656    si_restore_qbo_state(sctx, &saved_state);
1657    pipe_resource_reference(&tmp_buffer, NULL);
1658 }
1659 
1660 static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1661                                 enum pipe_render_cond_flag mode)
1662 {
1663    struct si_context *sctx = (struct si_context *)ctx;
1664    struct si_query_hw *squery = (struct si_query_hw *)query;
1665    struct si_atom *atom = &sctx->atoms.s.render_cond;
1666 
1667    if (query) {
1668       bool needs_workaround = false;
1669 
1670       /* There was a firmware regression in GFX8 which causes successive
1671        * SET_PREDICATION packets to give the wrong answer for
1672        * non-inverted stream overflow predication.
1673        */
1674       if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1675            (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1676           !condition &&
1677           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1678            (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1679             (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1680          needs_workaround = true;
1681       }
1682 
1683       if (needs_workaround && !squery->workaround_buf) {
1684          bool old_render_cond_enabled = sctx->render_cond_enabled;
1685          sctx->render_cond_enabled = false;
1686 
1687          u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1688                               (struct pipe_resource **)&squery->workaround_buf);
1689 
1690          /* Reset to NULL to avoid a redundant SET_PREDICATION
1691           * from launching the compute grid.
1692           */
1693          sctx->render_cond = NULL;
1694 
1695          ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1696                                         &squery->workaround_buf->b.b, squery->workaround_offset);
1697 
1698          /* Setting this in the render cond atom is too late,
1699           * so set it here. */
1700          if (sctx->gfx_level <= GFX8) {
1701             sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME;
1702             si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1703          }
1704 
1705          sctx->render_cond_enabled = old_render_cond_enabled;
1706       }
1707    }
1708 
1709    sctx->render_cond = query;
1710    sctx->render_cond_invert = condition;
1711    sctx->render_cond_mode = mode;
1712    sctx->render_cond_enabled = query;
1713 
1714    si_set_atom_dirty(sctx, atom, query != NULL);
1715 }
1716 
1717 void si_suspend_queries(struct si_context *sctx)
1718 {
1719    struct si_query *query;
1720 
1721    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1722       query->ops->suspend(sctx, query);
1723 }
1724 
1725 void si_resume_queries(struct si_context *sctx)
1726 {
1727    struct si_query *query;
1728 
1729    /* Check CS space here. Resuming must not be interrupted by flushes. */
1730    si_need_gfx_cs_space(sctx, 0);
1731 
1732    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1733       query->ops->resume(sctx, query);
1734 }
1735 
1736 #define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1737    {                                                                                               \
1738       .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1739       .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1740    }
1741 
1742 #define X(name_, query_type_, type_, result_type_)                                                 \
1743    XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1744 
1745 #define XG(group_, name_, query_type_, type_, result_type_)                                        \
1746    XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1747 
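     /* X() declares an ungrouped driver query (group_id = ~0); XG() assigns the
      * query to a software query group such as GPIN. */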
1748 static struct pipe_driver_query_info si_driver_query_list[] = {
1749    X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1750    X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1751    X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1752    X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1753    X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1754    X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1755    X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1756    X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1757    X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1758    X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1759    X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1760    X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1761    X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1762    X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1763    X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1764    X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1765    X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1766    X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1767    X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1768    X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1769    X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1770    X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1771    X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1772    X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1773    X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1774    X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1775    X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1776    X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1777    X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1778    X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1779    X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1780    X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1781    X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1782    X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1783    X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1784    X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1785    X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1786    X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1787    X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1788    X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1789    X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1790    X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1791    X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1792 
1793    /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1794     * which use them as a fallback path to detect the GPU type.
1795     *
1796     * Note: The names of these queries are significant for GPUPerfStudio
1797     * (and possibly their order as well). */
1798    XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1799    XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1800    XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1801    XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1802    XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1803 
1804    X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1805    X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1806    X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1807 
1808    /* The following queries must be at the end of the list because their
1809     * availability is adjusted dynamically based on the DRM version. */
1810    X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1811    X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1812    X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1813    X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1814    X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1815    X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1816    X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1817    X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1818    X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1819    X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1820    X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1821    X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1822    X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1823    X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1824 
1825    /* SRBM_STATUS2 */
1826    X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1827 
1828    /* CP_STAT */
1829    X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1830    X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1831    X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1832    X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1833    X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1834    X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1835 };
1836 
1837 #undef X
1838 #undef XG
1839 #undef XFULL
1840 
1841 static unsigned si_get_num_queries(struct si_screen *sscreen)
1842 {
1843    /* amdgpu */
1844    if (sscreen->info.is_amdgpu) {
1845       if (sscreen->info.gfx_level >= GFX8)
1846          return ARRAY_SIZE(si_driver_query_list);
1847       else
1848          return ARRAY_SIZE(si_driver_query_list) - 7;
1849    }
1850 
1851    /* radeon */
        if (sscreen->info.has_read_registers_query) {
1852       if (sscreen->info.gfx_level == GFX7)
1853          return ARRAY_SIZE(si_driver_query_list) - 6;
1854       else
1855          return ARRAY_SIZE(si_driver_query_list) - 7;
        }
1856 
1857    return ARRAY_SIZE(si_driver_query_list) - 21;
1858 }
1859 
1860 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1861                                     struct pipe_driver_query_info *info)
1862 {
1863    struct si_screen *sscreen = (struct si_screen *)screen;
1864    unsigned num_queries = si_get_num_queries(sscreen);
1865 
1866    if (!info) {
1867       unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1868 
1869       return num_queries + num_perfcounters;
1870    }
1871 
1872    if (index >= num_queries)
1873       return si_get_perfcounter_info(sscreen, index - num_queries, info);
1874 
1875    *info = si_driver_query_list[index];
1876 
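        /* Provide upper bounds for memory and temperature queries so that tools
         * graphing these counters can scale them. */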
1877    switch (info->query_type) {
1878    case SI_QUERY_REQUESTED_VRAM:
1879    case SI_QUERY_VRAM_USAGE:
1880    case SI_QUERY_MAPPED_VRAM:
1881    case SI_QUERY_SLAB_WASTED_VRAM:
1882       info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
1883       break;
1884    case SI_QUERY_REQUESTED_GTT:
1885    case SI_QUERY_GTT_USAGE:
1886    case SI_QUERY_MAPPED_GTT:
1887    case SI_QUERY_SLAB_WASTED_GTT:
1888       info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
1889       break;
1890    case SI_QUERY_GPU_TEMPERATURE:
1891       info->max_value.u64 = 125;
1892       break;
1893    case SI_QUERY_VRAM_VIS_USAGE:
1894       info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
1895       break;
1896    }
1897 
1898    if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1899       info->group_id += sscreen->perfcounters->base.num_groups;
1900 
1901    return 1;
1902 }
1903 
1904 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1905  * performance counter groups, so be careful when changing this and related
1906  * functions.
1907  */
1908 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1909                                           struct pipe_driver_query_group_info *info)
1910 {
1911    struct si_screen *sscreen = (struct si_screen *)screen;
1912    unsigned num_pc_groups = 0;
1913 
1914    if (sscreen->perfcounters)
1915       num_pc_groups = sscreen->perfcounters->base.num_groups;
1916 
1917    if (!info)
1918       return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1919 
1920    if (index < num_pc_groups)
1921       return si_get_perfcounter_group_info(sscreen, index, info);
1922 
1923    index -= num_pc_groups;
1924    if (index >= SI_NUM_SW_QUERY_GROUPS)
1925       return 0;
1926 
1927    info->name = "GPIN";
1928    info->max_active_queries = 5;
1929    info->num_queries = 5;
1930    return 1;
1931 }
1932 
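     /* Dispatch tables routing the pipe_context query entry points to the
      * hardware- and software-query implementations. */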
1933 static const struct si_query_ops hw_query_ops = {
1934    .destroy = si_query_hw_destroy,
1935    .begin = si_query_hw_begin,
1936    .end = si_query_hw_end,
1937    .get_result = si_query_hw_get_result,
1938    .get_result_resource = si_query_hw_get_result_resource,
1939 
1940    .suspend = si_query_hw_suspend,
1941    .resume = si_query_hw_resume,
1942 };
1943 
1944 static const struct si_query_ops sw_query_ops = {
1945    .destroy = si_query_sw_destroy,
1946    .begin = si_query_sw_begin,
1947    .end = si_query_sw_end,
1948    .get_result = si_query_sw_get_result,
1949    .get_result_resource = NULL
1950 };
1951 
1952 void si_init_query_functions(struct si_context *sctx)
1953 {
1954    sctx->b.create_query = si_create_query;
1955    sctx->b.create_batch_query = si_create_batch_query;
1956    sctx->b.destroy_query = si_destroy_query;
1957    sctx->b.begin_query = si_begin_query;
1958    sctx->b.end_query = si_end_query;
1959    sctx->b.get_query_result = si_get_query_result;
1960    sctx->b.get_query_result_resource = si_get_query_result_resource;
1961 
1962    if (sctx->has_graphics) {
1963       sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1964       sctx->b.render_condition = si_render_condition;
1965    }
1966 
1967    list_inithead(&sctx->active_queries);
1968 }
1969 
1970 void si_init_screen_query_functions(struct si_screen *sscreen)
1971 {
1972    sscreen->b.get_driver_query_info = si_get_driver_query_info;
1973    sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1974 }
1975