/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"

static bool is_ts_event(unsigned event_type)
{
   return event_type == V_028A90_CACHE_FLUSH_TS ||
          event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
          event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
          event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
          event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
}

/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will signal after the work
 * indicated by the event completes, optionally flushing the caches specified by "gcr_cntl" once
 * that work is done. *_TS events are always signaled at the end of the pipeline, while CS_DONE
 * and PS_DONE are signaled when those shaders finish. This call only inserts the event into
 * the pipeline. It doesn't wait for anything and it doesn't execute anything immediately.
 * The only way to wait for the event completion is to call si_cp_acquire_mem_pws with
 * the same "event_type".
 */
void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
                           unsigned event_type, unsigned gcr_cntl)
{
   assert(sctx->gfx_level >= GFX11 && sctx->has_graphics);
   bool ts = is_ts_event(event_type);
   /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */
   assert(G_586_GLI_INV(gcr_cntl) == 0);
   assert(G_586_GL1_RANGE(gcr_cntl) == 0);
   unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
   unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
   unsigned glk_wb = G_586_GLK_WB(gcr_cntl);
   unsigned glk_inv = G_586_GLK_INV(gcr_cntl);
   unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
   unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
   assert(G_586_GL2_US(gcr_cntl) == 0);
   assert(G_586_GL2_RANGE(gcr_cntl) == 0);
   assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
   unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
   unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
   unsigned gcr_seq = G_586_SEQ(gcr_cntl);

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
   radeon_emit(S_490_EVENT_TYPE(event_type) |
               S_490_EVENT_INDEX(ts ? 5 : 6) |
               S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
               S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
               S_490_SEQ(gcr_seq) | S_490_GLK_WB(glk_wb) | S_490_GLK_INV(glk_inv) |
               S_490_PWS_ENABLE(1));
   radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
   radeon_emit(0); /* ADDRESS_LO */
   radeon_emit(0); /* ADDRESS_HI */
   radeon_emit(0); /* DATA_LO */
   radeon_emit(0); /* DATA_HI */
   radeon_emit(0); /* INT_CTXID */
   radeon_end();
}
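
/* Illustrative sketch (not called by the driver): insert a bottom-of-pipe TS event that
 * also writes back and invalidates GL2 when it signals. The S_586_* setters are assumed
 * to exist alongside the G_586_* getters used above, and the gcr_cntl combination is just
 * an example, not a recommendation:
 *
 *    si_cp_release_mem_pws(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS,
 *                          S_586_GL2_WB(1) | S_586_GL2_INV(1));
 */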

/* This will wait or insert into the pipeline a wait for a previous RELEASE_MEM PWS event.
 *
 * "event_type" must be the same as the RELEASE_MEM PWS event.
 *
 * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME, PRE_SHADER,
 * PRE_DEPTH, or PRE_PIX_SHADER, allowing the wait to happen later in the pipeline instead
 * of completely idling the hw at the frontend.
 *
 * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the pipeline, any
 * cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
 *
 * "distance" determines how many RELEASE_MEM PWS events ago the event to wait for was
 * inserted, starting from 0 for the most recent one. There are 3 event types: PS_DONE,
 * CS_DONE, and TS events. The distance counter increments separately for each type, so 0
 * with PS_DONE means wait for the last PS_DONE event, while 0 with *_TS means wait for the
 * last TS event (even if it's a different TS event because all TS events share the same
 * counter).
 *
 * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while PRE_PIX_SHADER waits
 * before PS if it has IMAGE_OP=1 (IMAGE_OP should really be called SYNC_ENABLE). PRE_DEPTH
 * waits before depth/stencil tests.
 *
 * PRE_COLOR also exists but shouldn't be used because it can hang. It's recommended to use
 * PRE_PIX_SHADER instead, which means all PS that have color exports with enabled color
 * buffers, non-zero colormask, and non-zero sample mask must have IMAGE_OP=1 to enable the
 * sync before PS.
 *
 * Waiting for a PWS fence that was generated by a previous IB is valid, but if there is an
 * IB from another process in between and that IB also inserted a PWS fence, the hw will
 * wait for the newer fence instead because the PWS counter was incremented.
 */
void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
                           unsigned event_type, unsigned stage_sel, unsigned gcr_cntl,
                           unsigned distance, unsigned sqtt_flush_flags)
{
   assert(sctx->gfx_level >= GFX11 && sctx->has_graphics);
   bool ts = is_ts_event(event_type);
   bool cs_done = event_type == V_028A90_CS_DONE;
   bool ps = event_type == V_028A90_PS_DONE;

   assert((int)ts + (int)cs_done + (int)ps == 1);
   assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
   assert(stage_sel != V_580_PRE_COLOR);

   if (unlikely(sctx->sqtt_enabled))
      si_sqtt_describe_barrier_start(sctx, cs);

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(S_580_PWS_STAGE_SEL(stage_sel) |
               S_580_PWS_COUNTER_SEL(ts ? V_580_TS_SELECT : ps ? V_580_PS_SELECT : V_580_CS_SELECT) |
               S_580_PWS_ENA2(1) |
               S_580_PWS_COUNT(distance));
   radeon_emit(0xffffffff); /* GCR_SIZE */
   radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
   radeon_emit(0); /* GCR_BASE_LO */
   radeon_emit(0); /* GCR_BASE_HI */
   radeon_emit(S_585_PWS_ENA(1));
   radeon_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
   radeon_end();

   if (unlikely(sctx->sqtt_enabled))
      si_sqtt_describe_barrier_end(sctx, cs, sqtt_flush_flags);
}
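
/* Illustrative sketch (not called by the driver): stall depth/stencil testing until the
 * most recent *_TS event has signaled, without idling the CP. gcr_cntl must be 0 because
 * the wait doesn't happen in PFP/ME. V_580_PRE_DEPTH is assumed to be the PRE_DEPTH
 * stage_sel value described above; the sqtt flags are omitted (0) for brevity:
 *
 *    si_cp_acquire_mem_pws(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS,
 *                          V_580_PRE_DEPTH, 0, 0, 0);
 */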

void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
                                   unsigned event_type, unsigned gcr_cntl, unsigned stage_sel,
                                   unsigned sqtt_flush_flags)
{
   si_cp_release_mem_pws(sctx, cs, event_type, gcr_cntl);
   si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags);
}
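
/* Illustrative sketch (not called by the driver): a full flush-and-wait barrier built from
 * the two calls above: release a TS event that writes back and invalidates GL2, then stall
 * CP_ME until it signals. The S_586_* setters are assumed to exist alongside the G_586_*
 * getters; the sqtt flags are omitted (0):
 *
 *    si_cp_release_acquire_mem_pws(sctx, cs, V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT,
 *                                  S_586_GL2_WB(1) | S_586_GL2_INV(1), V_580_CP_ME, 0);
 */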

/* Execute a plain ACQUIRE_MEM that just flushes caches. This optionally waits for idle on
 * older chips. "engine" determines whether to sync in PFP or ME.
 */
void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl,
                       unsigned engine)
{
   assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
   assert(gcr_cntl);

   if (sctx->gfx_level >= GFX10) {
      /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
      unsigned engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0;

      /* Flush caches. This doesn't wait for idle. */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      radeon_emit(engine_flag);   /* which engine to use */
      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(0x01ffffff);    /* CP_COHER_SIZE_HI */
      radeon_emit(0);             /* CP_COHER_BASE */
      radeon_emit(0);             /* CP_COHER_BASE_HI */
      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
      radeon_emit(gcr_cntl);      /* GCR_CNTL */
      radeon_end();
   } else {
      bool compute_ib = !sctx->has_graphics;

      /* This seems problematic with GFX7 (see #4764) */
      if (sctx->gfx_level != GFX7)
         gcr_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */

      if (sctx->gfx_level == GFX9 || compute_ib) {
         /* Flush caches and wait for the caches to assert idle. */
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
         radeon_emit(gcr_cntl);      /* CP_COHER_CNTL */
         radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
         radeon_emit(0xffffff);      /* CP_COHER_SIZE_HI */
         radeon_emit(0);             /* CP_COHER_BASE */
         radeon_emit(0);             /* CP_COHER_BASE_HI */
         radeon_emit(0x0000000A);    /* POLL_INTERVAL */
         radeon_end();
      } else {
         /* ACQUIRE_MEM is only required on the compute ring. */
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
         radeon_emit(gcr_cntl);      /* CP_COHER_CNTL */
         radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
         radeon_emit(0);             /* CP_COHER_BASE */
         radeon_emit(0x0000000A);    /* POLL_INTERVAL */
         radeon_end();
      }

      /* ACQUIRE_MEM & SURFACE_SYNC roll the context if the current context is busy. */
      if (!compute_ib)
         sctx->context_roll = true;

      if (engine == V_580_CP_PFP)
         si_cp_pfp_sync_me(cs);
   }
}
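
/* Illustrative sketch (not called by the driver): invalidate the L0 vector/scalar caches
 * and GL2 from the ME, assuming a GFX10+ chip where "gcr_cntl" uses the S_586_* (GCR_CNTL)
 * encoding; on older chips the value is interpreted as CP_COHER_CNTL instead:
 *
 *    si_cp_acquire_mem(sctx, cs, S_586_GLV_INV(1) | S_586_GLK_INV(1) | S_586_GL2_INV(1),
 *                      V_580_CP_ME);
 */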

/* Make the PFP (the engine that fetches command and indirect buffers) wait until the ME
 * has finished processing all previous commands, e.g. so that the PFP doesn't read data
 * before a packet executed by the ME has written it.
 */
void si_cp_pfp_sync_me(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(0);
   radeon_end();
}