/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"

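/* Return the small scratch buffer that EOP RELEASE_MEM writes and WAIT_REG_MEM polls to
 * implement CB/DB flush fences on chips before GFX11. Secure (TMZ) command buffers get their
 * own encrypted copy, allocated and initialized on first use.
 */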
static struct si_resource *si_get_wait_mem_scratch_bo(struct si_context *ctx,
                                                      struct radeon_cmdbuf *cs, bool is_secure)
{
   struct si_screen *sscreen = ctx->screen;

   assert(ctx->gfx_level < GFX11);

   if (likely(!is_secure)) {
      return ctx->wait_mem_scratch;
   } else {
      assert(sscreen->info.has_tmz_support);
      if (!ctx->wait_mem_scratch_tmz) {
         ctx->wait_mem_scratch_tmz =
            si_aligned_buffer_create(&sscreen->b,
                                     PIPE_RESOURCE_FLAG_UNMAPPABLE |
                                     SI_RESOURCE_FLAG_DRIVER_INTERNAL |
                                     PIPE_RESOURCE_FLAG_ENCRYPTED,
                                     PIPE_USAGE_DEFAULT, 4,
                                     sscreen->info.tcc_cache_line_size);
         si_cp_write_data(ctx, ctx->wait_mem_scratch_tmz, 0, 4, V_370_MEM, V_370_ME,
                          &ctx->wait_mem_number);
      }

      return ctx->wait_mem_scratch_tmz;
   }
}

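/* Consume the pending barrier flags: drop flushes that are known to be redundant (compute-only
 * contexts, CB/DB flushes with no draws or decompressions since the last flush), update the
 * flush statistics, and clear ctx->barrier_flags.
 */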
static unsigned get_reduced_barrier_flags(struct si_context *ctx)
{
   unsigned flags = ctx->barrier_flags;

   if (!flags)
      return 0;

   if (!ctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM |
               SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2 | SI_BARRIER_INV_L2_METADATA |
               SI_BARRIER_SYNC_CS;
   }

   /* Don't flush CB and DB if there have been no draw calls. */
   if (ctx->num_draw_calls == ctx->last_cb_flush_num_draw_calls &&
       ctx->num_decompress_calls == ctx->last_cb_flush_num_decompress_calls)
      flags &= ~SI_BARRIER_SYNC_AND_INV_CB;

   if (ctx->num_draw_calls == ctx->last_db_flush_num_draw_calls &&
       ctx->num_decompress_calls == ctx->last_db_flush_num_decompress_calls)
      flags &= ~SI_BARRIER_SYNC_AND_INV_DB;

   /* Track the last flush. */
   if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
      ctx->num_cb_cache_flushes++;
      ctx->last_cb_flush_num_draw_calls = ctx->num_draw_calls;
      ctx->last_cb_flush_num_decompress_calls = ctx->num_decompress_calls;
   }
   if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
      ctx->num_db_cache_flushes++;
      ctx->last_db_flush_num_draw_calls = ctx->num_draw_calls;
      ctx->last_db_flush_num_decompress_calls = ctx->num_decompress_calls;
   }

   ctx->barrier_flags = 0;
   return flags;
}

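/* Emit the pending barrier on GFX10+. Cache operations are expressed as GCR_CNTL bits; CB/DB
 * flushes are implemented with an EOP timestamp event that is waited on with ACQUIRE_MEM PWS
 * on GFX11+ or with a RELEASE_MEM fence plus WAIT_REG_MEM on GFX10.
 */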
static void gfx10_emit_barrier(struct si_context *ctx, struct radeon_cmdbuf *cs)
{
   assert(ctx->gfx_level >= GFX10);
   uint32_t gcr_cntl = 0;
   unsigned flags = get_reduced_barrier_flags(ctx);

   if (!flags)
      return;

   /* We don't need these. */
   assert(!(flags & SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META));
   assert(ctx->gfx_level < GFX12 || !(flags & SI_BARRIER_INV_L2_METADATA));

   radeon_begin(cs);

   if (flags & SI_BARRIER_EVENT_VGT_FLUSH)
      radeon_event_write(V_028A90_VGT_FLUSH);

   if (flags & SI_BARRIER_INV_ICACHE)
      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
   if (flags & SI_BARRIER_INV_SMEM)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
   if (flags & SI_BARRIER_INV_VMEM)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);

   /* The L2 cache ops are:
    * - INV: - invalidate lines that reflect memory (were loaded from memory)
    *        - don't touch lines that were overwritten (were stored by gfx clients)
    * - WB: - don't touch lines that reflect memory
    *       - write back lines that were overwritten
    * - WB | INV: - invalidate lines that reflect memory
    *             - write back lines that were overwritten
    *
    * GLM doesn't support WB alone. If WB is set, INV must be set too.
    */
   if (flags & SI_BARRIER_INV_L2) {
      /* Writeback and invalidate everything in L2. */
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1);
      ctx->num_L2_invalidates++;
   } else if (flags & SI_BARRIER_WB_L2) {
      gcr_cntl |= S_586_GL2_WB(1);
   }

   /* Invalidate the metadata cache. */
   if (ctx->gfx_level < GFX12 &&
       flags & (SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2 | SI_BARRIER_INV_L2_METADATA))
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);

   /* Flush CB/DB. Note that this also idles all shaders, including compute shaders. */
   if (flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB)) {
      unsigned cb_db_event = 0;

      /* Determine the TS event that we'll use to flush CB/DB. */
      if ((flags & SI_BARRIER_SYNC_AND_INV_CB && flags & SI_BARRIER_SYNC_AND_INV_DB) ||
          /* Gfx11 can't use the DB_META event and must use a full flush to flush DB_META. */
          (ctx->gfx_level == GFX11 && flags & SI_BARRIER_SYNC_AND_INV_DB)) {
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      } else if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
      } else {
         assert(flags & SI_BARRIER_SYNC_AND_INV_DB);
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
      }

      /* We must flush CMASK/FMASK/DCC separately if the main event only flushes CB_DATA. */
      if (ctx->gfx_level < GFX12 && cb_db_event == V_028A90_FLUSH_AND_INV_CB_DATA_TS)
         radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);

      /* We must flush HTILE separately if the main event only flushes DB_DATA. */
      if (ctx->gfx_level < GFX12 && cb_db_event == V_028A90_FLUSH_AND_INV_DB_DATA_TS)
         radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);

      radeon_end();

      /* First flush CB/DB, then L1/L2. */
      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);

      if (ctx->gfx_level >= GFX11) {
         si_cp_release_mem_pws(ctx, cs, cb_db_event, gcr_cntl & C_586_GLI_INV);

         /* Wait for the event and invalidate remaining caches if needed. */
         si_cp_acquire_mem_pws(ctx, cs, cb_db_event,
                               flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME,
                               gcr_cntl & ~C_586_GLI_INV, /* keep only GLI_INV */
                               0, flags);

         gcr_cntl = 0; /* all done */
         /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
         flags &= ~SI_BARRIER_PFP_SYNC_ME;
      } else {
         /* GFX10 */
         struct si_resource *wait_mem_scratch =
           si_get_wait_mem_scratch_bo(ctx, cs, ctx->ws->cs_is_secure(cs));

         /* CB/DB flush and invalidate via RELEASE_MEM.
          * Combine this with other cache flushes when possible.
          */
         uint64_t va = wait_mem_scratch->gpu_address;
         ctx->wait_mem_number++;

         /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
         unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
         unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
         unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
         unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
         assert(G_586_GL2_US(gcr_cntl) == 0);
         assert(G_586_GL2_RANGE(gcr_cntl) == 0);
         assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
         unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
         unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
         unsigned gcr_seq = G_586_SEQ(gcr_cntl);

         gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
                     C_586_GL2_WB; /* keep SEQ */

         si_cp_release_mem(ctx, cs, cb_db_event,
                           S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
                           S_490_SEQ(gcr_seq),
                           EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                           EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                           SI_NOT_QUERY);

         if (unlikely(ctx->sqtt_enabled)) {
            si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
         }

         si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

         if (unlikely(ctx->sqtt_enabled)) {
            si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
         }
      }

      ctx->compute_is_busy = false;
   } else {
      /* The TS event above also makes sure that PS and CS are idle, so we have to do this only
       * if we are not flushing CB or DB.
       */
      if (flags & SI_BARRIER_SYNC_PS) {
         radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
         /* Only count explicit shader flushes, not implicit ones. */
         ctx->num_vs_flushes++;
         ctx->num_ps_flushes++;
      } else if (flags & SI_BARRIER_SYNC_VS) {
         radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
         ctx->num_vs_flushes++;
      }

      if (flags & SI_BARRIER_SYNC_CS && ctx->compute_is_busy) {
         radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);
         ctx->num_cs_flushes++;
         ctx->compute_is_busy = false;
      }
      radeon_end();
   }

   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      si_cp_acquire_mem(ctx, cs, gcr_cntl,
                        flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME);
   } else if (flags & SI_BARRIER_PFP_SYNC_ME) {
      si_cp_pfp_sync_me(cs);
   }

   radeon_begin_again(cs);
   if (flags & SI_BARRIER_EVENT_PIPELINESTAT_START && ctx->pipeline_stats_enabled != 1) {
      radeon_event_write(V_028A90_PIPELINESTAT_START);
      ctx->pipeline_stats_enabled = 1;
   } else if (flags & SI_BARRIER_EVENT_PIPELINESTAT_STOP && ctx->pipeline_stats_enabled != 0) {
      radeon_event_write(V_028A90_PIPELINESTAT_STOP);
      ctx->pipeline_stats_enabled = 0;
   }
   radeon_end();
}

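/* Emit the pending barrier on GFX6-9. Cache operations are expressed through CP_COHER_CNTL and
 * executed by ACQUIRE_MEM (SURFACE_SYNC); on GFX9, CB/DB flushes additionally require an EOP
 * timestamp event that is waited on with WAIT_REG_MEM.
 */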
static void gfx6_emit_barrier(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   assert(sctx->gfx_level <= GFX9);
   unsigned flags = get_reduced_barrier_flags(sctx);

   if (!flags)
      return;

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB);

   /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
    * bit is set. An alternative way is to write SQC_CACHES, but that
    * doesn't seem to work reliably. Since the bug doesn't affect
    * correctness (it only does more work than necessary) and
    * the performance impact is likely negligible, there is no plan
    * to add a workaround for it.
    */

   if (flags & SI_BARRIER_INV_ICACHE)
      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
   if (flags & SI_BARRIER_INV_SMEM)
      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

   if (sctx->gfx_level <= GFX8) {
      if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
                          S_0085F0_CB7_DEST_BASE_ENA(1);

         /* Necessary for DCC */
         if (sctx->gfx_level == GFX8)
            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
      }
      if (flags & SI_BARRIER_SYNC_AND_INV_DB)
         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
   }

   radeon_begin(cs);

   /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
   if (flags & SI_BARRIER_SYNC_AND_INV_CB)
      radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);

   /* Flush HTILE. SURFACE_SYNC will wait for idle. */
   if (flags & (SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META))
      radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);

   /* Wait for shader engines to go idle.
    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
    * for everything including CB/DB cache flushes.
    *
    * GFX6-8: SURFACE_SYNC with CB_ACTION_ENA doesn't do anything if there are no CB/DB bindings.
    * Reproducible with: piglit/arb_framebuffer_no_attachments-atomic
    *
    * GFX9: The TS event is always written after full pipeline completion regardless of CB/DB
    * bindings.
    */
   if (sctx->gfx_level <= GFX8 || !flush_cb_db) {
      if (flags & SI_BARRIER_SYNC_PS) {
         radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
         /* Only count explicit shader flushes, not implicit ones done by SURFACE_SYNC. */
         sctx->num_vs_flushes++;
         sctx->num_ps_flushes++;
      } else if (flags & SI_BARRIER_SYNC_VS) {
         radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
         sctx->num_vs_flushes++;
      }
   }

   if (flags & SI_BARRIER_SYNC_CS && sctx->compute_is_busy) {
      radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);
      sctx->num_cs_flushes++;
      sctx->compute_is_busy = false;
   }

   /* VGT state synchronization. */
   if (flags & SI_BARRIER_EVENT_VGT_FLUSH)
      radeon_event_write(V_028A90_VGT_FLUSH);

   radeon_end();

   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
    * wait for idle on GFX9. We have to use a TS event.
    */
   if (sctx->gfx_level == GFX9 && flush_cb_db) {
      uint64_t va;
      unsigned tc_flags, cb_db_event;

      /* Set the CB/DB flush event. */
      switch (flush_cb_db) {
      case SI_BARRIER_SYNC_AND_INV_CB:
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
         break;
      case SI_BARRIER_SYNC_AND_INV_DB:
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
         break;
      default:
         /* both CB & DB */
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      }

      /* These are the only allowed combinations. If you need to
       * do multiple operations at once, do them separately.
       * All operations that invalidate L2 also seem to invalidate
       * metadata. Volatile (VOL) and WC flushes are not listed here.
       *
       * TC    | TC_WB         = writeback & invalidate L2
       * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
       *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
       * TC            | TC_NC = invalidate L2 for MTYPE == NC
       * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
       * TCL1                  = invalidate L1
       */
      tc_flags = 0;

      if (flags & SI_BARRIER_INV_L2_METADATA) {
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
      }

      /* Ideally flush L2 together with CB/DB. */
      if (flags & SI_BARRIER_INV_L2) {
         /* Writeback and invalidate everything in L2 & L1. */
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;

         /* Clear the flags. */
         flags &= ~(SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2);
         sctx->num_L2_invalidates++;
      }

      /* Do the flush (enqueue the event and wait for it). */
      struct si_resource* wait_mem_scratch =
        si_get_wait_mem_scratch_bo(sctx, cs, sctx->ws->cs_is_secure(cs));

      va = wait_mem_scratch->gpu_address;
      sctx->wait_mem_number++;

      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                        wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);

      if (unlikely(sctx->sqtt_enabled)) {
         si_sqtt_describe_barrier_start(sctx, cs);
      }

      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

      if (unlikely(sctx->sqtt_enabled)) {
         si_sqtt_describe_barrier_end(sctx, cs, sctx->barrier_flags);
      }
   }

   /* GFX6-GFX8 only: When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC waits
    * for idle, so it should be last.
    *
    * cp_coher_cntl should contain everything except TC flags at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
   unsigned engine = flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME;

   if (flags & SI_BARRIER_INV_L2 || (sctx->gfx_level <= GFX7 && flags & SI_BARRIER_WB_L2)) {
      /* Invalidate L1 & L2. WB must be set on GFX8+ when TC_ACTION is set. */
      si_cp_acquire_mem(sctx, cs,
                        cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                        S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8), engine);
      sctx->num_L2_invalidates++;
   } else {
      /* L1 invalidation and L2 writeback must be done separately, because both operations can't
       * be done together.
       */
      if (flags & SI_BARRIER_WB_L2) {
         /* WB = write-back
          * NC = apply to non-coherent MTYPEs
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
          *
          * If we get here, the only flag that can't be executed together with WB_L2 is VMEM cache
          * invalidation.
          */
         bool last_acquire_mem = !(flags & SI_BARRIER_INV_VMEM);

         si_cp_acquire_mem(sctx, cs,
                           cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) |
                           S_0301F0_TC_NC_ACTION_ENA(1),
                           /* If this is not the last ACQUIRE_MEM, flush in ME.
                            * We only want to synchronize with PFP in the last ACQUIRE_MEM. */
                           last_acquire_mem ? engine : V_580_CP_ME);

         if (last_acquire_mem)
            flags &= ~SI_BARRIER_PFP_SYNC_ME;
         cp_coher_cntl = 0;
         sctx->num_L2_writebacks++;
      }

      if (flags & SI_BARRIER_INV_VMEM)
         cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);

      /* If there are still some cache flags left... */
      if (cp_coher_cntl) {
         si_cp_acquire_mem(sctx, cs, cp_coher_cntl, engine);
         flags &= ~SI_BARRIER_PFP_SYNC_ME;
      }

      /* This might be needed even without any cache flags, such as when doing buffer stores
       * to an index buffer.
       */
      if (flags & SI_BARRIER_PFP_SYNC_ME)
         si_cp_pfp_sync_me(cs);
   }

   if (flags & SI_BARRIER_EVENT_PIPELINESTAT_START && sctx->pipeline_stats_enabled != 1) {
      radeon_begin(cs);
      radeon_event_write(V_028A90_PIPELINESTAT_START);
      radeon_end();
      sctx->pipeline_stats_enabled = 1;
   } else if (flags & SI_BARRIER_EVENT_PIPELINESTAT_STOP && sctx->pipeline_stats_enabled != 0) {
      radeon_begin(cs);
      radeon_event_write(V_028A90_PIPELINESTAT_STOP);
      radeon_end();
      sctx->pipeline_stats_enabled = 0;
   }
}

static void si_emit_barrier_as_atom(struct si_context *sctx, unsigned index)
{
   sctx->emit_barrier(sctx, &sctx->gfx_cs);
}

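/* A buffer is idle if it isn't referenced by the current (unsubmitted) command stream and
 * the GPU isn't using it either.
 */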
static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf,
                              unsigned usage)
{
   return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) &&
          sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage);
}

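/* Sync shaders and decompress resources before an internal (driver-generated) compute or blit
 * op that accesses the given buffers and images, but only if previous work could still be
 * using them.
 */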
void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
                                   unsigned num_buffers,
                                   const struct pipe_shader_buffer *buffers,
                                   unsigned writable_buffers_mask,
                                   unsigned num_images,
                                   const struct pipe_image_view *images)
{
   for (unsigned i = 0; i < num_images; i++) {
      /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
      si_decompress_subresource(&sctx->b, images[i].resource, PIPE_MASK_RGBAZS,
                                images[i].u.tex.level, images[i].u.tex.first_layer,
                                images[i].u.tex.last_layer,
                                images[i].access & PIPE_IMAGE_ACCESS_WRITE);
   }

   /* Don't sync if buffers are idle. */
   const unsigned ps_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_FRAGMENT);
   const unsigned cs_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_COMPUTE);

   for (unsigned i = 0; i < num_buffers; i++) {
      struct si_resource *buf = si_resource(buffers[i].buffer);

      if (!buf)
         continue;

      /* We always wait for the last write. If the buffer is used for write, also wait
       * for the last read.
       */
      if (!si_is_buffer_idle(sctx, buf, RADEON_USAGE_WRITE |
                             (writable_buffers_mask & BITFIELD_BIT(i) ? RADEON_USAGE_READ : 0))) {
         if (buf->bind_history & ps_mask)
            sctx->barrier_flags |= SI_BARRIER_SYNC_PS;
         else
            sctx->barrier_flags |= SI_BARRIER_SYNC_VS;

         if (buf->bind_history & cs_mask)
            sctx->barrier_flags |= SI_BARRIER_SYNC_CS;
      }
   }

   /* Don't sync if images are idle. */
   for (unsigned i = 0; i < num_images; i++) {
      struct si_resource *img = si_resource(images[i].resource);
      bool writable = images[i].access & PIPE_IMAGE_ACCESS_WRITE;

      /* We always wait for the last write. If the buffer is used for write, also wait
       * for the last read.
       */
      if (!si_is_buffer_idle(sctx, img, RADEON_USAGE_WRITE | (writable ? RADEON_USAGE_READ : 0))) {
         si_make_CB_shader_coherent(sctx, images[i].resource->nr_samples, true,
               ((struct si_texture*)images[i].resource)->surface.u.gfx9.color.dcc.pipe_aligned);
         sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;
      }
   }

   /* Invalidate the VMEM cache only. The SMEM cache isn't used by shader buffers. */
   sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

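/* Make the stores of an internal compute or blit op visible to all later consumers
 * (shaders, CP, CB/DB, index/indirect fetch), invalidating or writing back caches as needed.
 */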
void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
                                  unsigned num_buffers,
                                  const struct pipe_shader_buffer *buffers,
                                  unsigned writable_buffers_mask,
                                  unsigned num_images,
                                  const struct pipe_image_view *images)
{
   sctx->barrier_flags |= SI_BARRIER_SYNC_CS;

   if (num_images) {
      /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
      sctx->barrier_flags |= sctx->gfx_level <= GFX8 ? SI_BARRIER_WB_L2 : 0;
      /* Make sure image stores are visible to all CUs. */
      sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
   }

   /* Make sure buffer stores are visible to all CUs and also as index/indirect buffers. */
   if (num_buffers)
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | SI_BARRIER_PFP_SYNC_ME;

   /* We must set L2_cache_dirty for buffers because:
    * - GFX6,12: CP DMA doesn't use L2.
    * - GFX6-7,12: Index buffer reads don't use L2.
    * - GFX6-8,12: CP doesn't use L2.
    * - GFX6-8: CB/DB don't use L2.
    *
    * L2_cache_dirty is checked explicitly when buffers are used in those cases to enforce coherency.
    */
   while (writable_buffers_mask)
      si_resource(buffers[u_bit_scan(&writable_buffers_mask)].buffer)->L2_cache_dirty = true;

   /* Make sure RBs see our DCC image stores if RBs and TCCs (L2 instances) are non-coherent. */
   if (sctx->gfx_level >= GFX10 && sctx->screen->info.tcc_rb_non_coherent) {
      for (unsigned i = 0; i < num_images; i++) {
         if (vi_dcc_enabled((struct si_texture*)images[i].resource, images[i].u.tex.level) &&
             images[i].access & PIPE_IMAGE_ACCESS_WRITE &&
             (sctx->screen->always_allow_dcc_stores ||
              images[i].access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)) {
            sctx->barrier_flags |= SI_BARRIER_INV_L2;
            break;
         }
      }
   }

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

static void si_set_dst_src_barrier_buffers(struct pipe_shader_buffer *buffers,
                                           struct pipe_resource *dst, struct pipe_resource *src)
{
   assert(dst);
   memset(buffers, 0, sizeof(buffers[0]) * 2);
   /* Only the "buffer" field is going to be used. */
   buffers[0].buffer = dst;
   buffers[1].buffer = src;
}

/* This is for simple buffer ops that have 1 dst and 0-1 src. */
void si_barrier_before_simple_buffer_op(struct si_context *sctx, unsigned flags,
                                        struct pipe_resource *dst, struct pipe_resource *src)
{
   struct pipe_shader_buffer barrier_buffers[2];
   si_set_dst_src_barrier_buffers(barrier_buffers, dst, src);
   si_barrier_before_internal_op(sctx, flags, src ? 2 : 1, barrier_buffers, 0x1, 0, NULL);
}

/* This is for simple buffer ops that have 1 dst and 0-1 src. */
void si_barrier_after_simple_buffer_op(struct si_context *sctx, unsigned flags,
                                       struct pipe_resource *dst, struct pipe_resource *src)
{
   struct pipe_shader_buffer barrier_buffers[2];
   si_set_dst_src_barrier_buffers(barrier_buffers, dst, src);
   si_barrier_after_internal_op(sctx, flags, src ? 2 : 1, barrier_buffers, 0x1, 0, NULL);
}

static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
   si_fb_barrier_after_rendering((struct si_context *)ctx, SI_FB_BARRIER_SYNC_CB);
}

/* This enforces coherency between shader stores and any past and future access. */
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct si_context *sctx = (struct si_context *)ctx;

   /* Ignore PIPE_BARRIER_UPDATE_BUFFER - it synchronizes against updates like buffer_subdata. */
   /* Ignore PIPE_BARRIER_UPDATE_TEXTURE - it synchronizes against updates like texture_subdata. */
   /* Ignore PIPE_BARRIER_MAPPED_BUFFER - it synchronizes against buffer_map/unmap. */
   /* Ignore PIPE_BARRIER_QUERY_BUFFER - the GL spec description is confusing, and the driver
    * always inserts barriers around get_query_result_resource.
    */
   flags &= ~PIPE_BARRIER_UPDATE_BUFFER & ~PIPE_BARRIER_UPDATE_TEXTURE &
            ~PIPE_BARRIER_MAPPED_BUFFER & ~PIPE_BARRIER_QUERY_BUFFER;

   if (!flags)
      return;

   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;

   if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM;

   /* VMEM cache contents are written back to L2 automatically at the end of waves, but
    * the contents of other VMEM caches might still be stale.
    *
    * TEXTURE and IMAGE mean sampler buffers and image buffers, respectively.
    */
   if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
                PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER))
      sctx->barrier_flags |= SI_BARRIER_INV_VMEM;

   if (flags & (PIPE_BARRIER_INDEX_BUFFER | PIPE_BARRIER_INDIRECT_BUFFER))
      sctx->barrier_flags |= SI_BARRIER_PFP_SYNC_ME;

   /* Index buffers use L2 since GFX8. */
   if (flags & PIPE_BARRIER_INDEX_BUFFER &&
       (sctx->gfx_level <= GFX7 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   /* Indirect buffers use L2 since GFX9. */
   if (flags & PIPE_BARRIER_INDIRECT_BUFFER &&
       (sctx->gfx_level <= GFX8 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   /* MSAA color images are flushed in si_decompress_textures when needed.
    * Shaders never write to depth/stencil images.
    */
   if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB;

      if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) {
         if (sctx->screen->info.tcc_rb_non_coherent)
            sctx->barrier_flags |= SI_BARRIER_INV_L2;
         else /* We don't know which shaders do image stores with DCC: */
            sctx->barrier_flags |= SI_BARRIER_INV_L2_METADATA;
      } else if (sctx->gfx_level == GFX9) {
         /* We have to invalidate L2 for MSAA and when DCC can have pipe_aligned=0. */
         sctx->barrier_flags |= SI_BARRIER_INV_L2;
      } else if (sctx->gfx_level <= GFX8) {
         /* CB doesn't use L2 on GFX6-8. */
         sctx->barrier_flags |= SI_BARRIER_WB_L2;
      }
   }

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

static void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_texture *tex)
{
   assert(sctx->gfx_level < GFX12);

   /* Check all sampler bindings in all shaders where depth textures are bound, and update
    * which samplers should be decompressed.
    */
   u_foreach_bit(sh, sctx->shader_has_depth_tex) {
      u_foreach_bit(i, sctx->samplers[sh].has_depth_tex_mask) {
         if (sctx->samplers[sh].views[i]->texture == &tex->buffer.b.b) {
            sctx->samplers[sh].needs_depth_decompress_mask |= 1 << i;
            sctx->shader_needs_decompress_mask |= 1 << sh;
         }
      }
   }
}

void si_fb_barrier_before_rendering(struct si_context *sctx)
{
   /* Wait for all shaders because all image loads must finish before CB/DB can write there. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_CS | SI_BARRIER_SYNC_PS;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

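/* Flush and invalidate CB/DB caches after rendering and mark the attached textures as needing
 * decompression, so that later shader reads of the render targets see the new contents.
 */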
void si_fb_barrier_after_rendering(struct si_context *sctx, unsigned flags)
{
   if (sctx->gfx_level < GFX12 && !sctx->decompression_enabled) {
      /* Setting dirty_level_mask should ignore SI_FB_BARRIER_SYNC_* because it triggers
       * decompression, which is not syncing.
       */
      if (sctx->framebuffer.state.zsbuf) {
         struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
         struct si_texture *tex = (struct si_texture *)surf->texture;

         tex->dirty_level_mask |= 1 << surf->u.tex.level;

         if (tex->surface.has_stencil)
            tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;

         si_set_sampler_depth_decompress_mask(sctx, tex);
      }

      unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
      while (compressed_cb_mask) {
         unsigned i = u_bit_scan(&compressed_cb_mask);
         struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
         struct si_texture *tex = (struct si_texture *)surf->texture;

         if (tex->surface.fmask_offset) {
            tex->dirty_level_mask |= 1 << surf->u.tex.level;
            tex->fmask_is_identity = false;
         }
      }
   }

   if (flags & SI_FB_BARRIER_SYNC_CB) {
      /* Compressed images (MSAA with FMASK) are flushed on demand in si_decompress_textures.
       *
       * Synchronize CB only if there is actually a bound color buffer.
       */
      if (sctx->framebuffer.uncompressed_cb_mask) {
         si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                    sctx->framebuffer.CB_has_shader_readable_metadata,
                                    sctx->framebuffer.all_DCC_pipe_aligned);
      }
   }

   if (flags & SI_FB_BARRIER_SYNC_DB && sctx->framebuffer.state.zsbuf) {
      /* DB caches are flushed on demand (using si_decompress_textures) except the cases below. */
      if (sctx->gfx_level >= GFX12) {
         si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, true, false);
      } else if (sctx->generate_mipmap_for_depth) {
         /* u_blitter doesn't invoke depth decompression when it does multiple blits in a row,
          * but the only case when it matters for DB is when doing generate_mipmap, which writes Z,
          * which is always uncompressed. So here we flush DB manually between individual
          * generate_mipmap blits.
          */
         si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
      } else if (sctx->screen->info.family == CHIP_NAVI33) {
         struct si_surface *old_zsurf = (struct si_surface *)sctx->framebuffer.state.zsbuf;
         struct si_texture *old_ztex = (struct si_texture *)old_zsurf->base.texture;

         if (old_ztex->upgraded_depth) {
            /* TODO: some failures related to hyperz appeared after 969ed851 on nv33:
             * - piglit tex-miplevel-selection
             * - KHR-GL46.direct_state_access.framebuffers_texture_attachment
             * - GTF-GL46.gtf30.GL3Tests.blend_minmax.blend_minmax_draw
             * - KHR-GL46.direct_state_access.framebuffers_texture_layer_attachment
             *
             * This seems to fix them:
             */
            sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_INV_L2;
            si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
         }
      } else if (sctx->gfx_level == GFX9) {
         /* It appears that DB metadata "leaks" in a sequence of:
          *  - depth clear
          *  - DCC decompress for shader image writes (with DB disabled)
          *  - render with DEPTH_BEFORE_SHADER=1
          * Flushing DB metadata works around the problem.
          */
         sctx->barrier_flags |= SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
      }
   }
}

void si_init_barrier_functions(struct si_context *sctx)
{
   if (sctx->gfx_level >= GFX10)
      sctx->emit_barrier = gfx10_emit_barrier;
   else
      sctx->emit_barrier = gfx6_emit_barrier;

   sctx->atoms.s.barrier.emit = si_emit_barrier_as_atom;

   sctx->b.memory_barrier = si_memory_barrier;
   sctx->b.texture_barrier = si_texture_barrier;
}