/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

static void si_set_streamout_enable(struct si_context *sctx, bool enable);

static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}

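/* Create a stream output target for the given buffer range and add the range
 * to the buffer's valid range.
 */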
static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                              struct pipe_resource *buffer,
                                                              unsigned buffer_offset,
                                                              unsigned buffer_size)
{
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
   return &t->b;
}

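/* Destroy a stream output target, dropping its buffer and filled-size references. */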
static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}

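/* Re-emit the streamout begin state and keep streamout enabled. No-op if no targets are bound. */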
void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}

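/* Bind a new set of streamout targets: stop any active streamout, flag the caches that may
 * hold stale streamout data, allocate the filled-size (dwords_written) tracking buffers, and
 * bind the targets as internal shader buffers.
 */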
static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;

   if (!old_num_targets && !num_targets)
      return;

   if (sctx->gfx_level >= GFX12)
      si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, NULL);

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (old_num_targets && sctx->streamout.begin_emitted) {
      /* Stop streamout. */
      si_emit_streamout_end(sctx);

      /* Since streamout uses vector writes which go through L2
       * and most other clients can use L2 as well, we don't need
       * to flush it.
       *
       * The only cases which require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < old_num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->L2_cache_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM |
                             SI_BARRIER_SYNC_VS | SI_BARRIER_PFP_SYNC_ME;

      /* Make the streamout state buffer available to the CP for resuming and DrawTF. */
      if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope)
         sctx->barrier_flags |= SI_BARRIER_WB_L2;

      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   }

   /* TODO: This is a hack that fixes these failures. It shouldn't be necessary.
    * spec@ext_transform_feedback@immediate-reuse
    * spec@ext_transform_feedback@immediate-reuse-index-buffer
    * spec@ext_transform_feedback@immediate-reuse-uniform-buffer
    */
   if (sctx->gfx_level >= GFX11 && sctx->gfx_level < GFX12 && old_num_targets)
      si_flush_gfx_cs(sctx, 0, NULL);

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */
   unsigned enabled_mask = 0, append_bitmask = 0;

   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);

      if (!targets[i]) {
         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
         continue;
      }

      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;

      /* Allocate space for the filled buffer size. */
      struct si_streamout_target *t = sctx->streamout.targets[i];

      if (sctx->gfx_level >= GFX12) {
         bool first_target = util_bitcount(enabled_mask) == 1;

         /* The first enabled streamout target will contain the ordered ID/offset buffer for all
          * targets.
          */
         if (first_target && !append_bitmask) {
            /* The layout is:
             *    struct {
             *       struct {
             *          uint32_t ordered_id; // equal for all buffers
             *          uint32_t dwords_written;
             *       } buffer[4];
             *    };
             *
             * The buffer must be initialized to 0 and the address must be aligned to 64
             * because it's faster when the atomic doesn't straddle a 64B block boundary.
             */
            unsigned alloc_size = 32;
            unsigned alignment = 64;

            si_resource_reference(&t->buf_filled_size, NULL);
            u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, alignment,
                                 &t->buf_filled_size_offset,
                                 (struct pipe_resource **)&t->buf_filled_size);

            /* Offset to dwords_written of the first enabled streamout buffer. */
            t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset + i * 8 + 4;
         }

         if (first_target) {
            struct pipe_shader_buffer sbuf;
            sbuf.buffer = &t->buf_filled_size->b.b;
            sbuf.buffer_offset = t->buf_filled_size_offset;
            sbuf.buffer_size = 32; /* unused, the shader only uses the low 32 bits of the address */

            si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, &sbuf);
         }
      } else {
         /* GFX6-11 */
         if (!t->buf_filled_size) {
            unsigned alloc_size = sctx->gfx_level >= GFX11 ? 8 : 4;

            u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, 4,
                                 &t->buf_filled_size_offset,
                                 (struct pipe_resource **)&t->buf_filled_size);
            t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset;
         }
      }

      /* Bind it to the shader. */
      struct pipe_shader_buffer sbuf;
      sbuf.buffer = targets[i]->buffer;

      if (sctx->gfx_level >= GFX11) {
         sbuf.buffer_offset = targets[i]->buffer_offset;
         sbuf.buffer_size = targets[i]->buffer_size;
      } else {
         sbuf.buffer_offset = 0;
         sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
      }

      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
      si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER;
   }
   for (; i < old_num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], NULL);
      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
   }

   /* Either streamout is being resumed for all targets or none. Required by how we implement it
    * for GFX12.
    */
   assert(!append_bitmask || enabled_mask == append_bitmask);

   if (!!sctx->streamout.enabled_mask != !!enabled_mask)
      sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */

   sctx->streamout.num_targets = num_targets;
   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);

      /* All readers of the streamout targets need to be finished before we can
       * start writing to them.
       */
      sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                             SI_BARRIER_PFP_SYNC_ME;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }
}

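/* Make the CP wait for VGT to finish updating the streamout offsets.
 * Only used on GFX6-10 (legacy VGT streamout).
 */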
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   radeon_begin(cs);

   /* The register is at different places on different ASICs. */
   if (sctx->gfx_level >= GFX9) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_emit(PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
      radeon_emit(R_0300FC_CP_STRMOUT_CNTL >> 2);
      radeon_emit(0);
      radeon_emit(0);
   } else if (sctx->gfx_level >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(reg_strmout_cntl, 0);
   }

   radeon_event_write(V_028A90_SO_VGTSTREAMOUT_FLUSH);

   radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(WAIT_REG_MEM_EQUAL);    /* wait until the register is equal to the reference value */
   radeon_emit(reg_strmout_cntl >> 2); /* register */
   radeon_emit(0);
   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(4);                              /* poll interval */
   radeon_end();
}

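/* Start or resume streamout. Depending on the generation, this restores
 * GE_GS_ORDERED_ID_BASE (GFX12), the GDS_STRMOUT_DWORDS_WRITTEN registers (GFX11), or the
 * VGT buffer offsets (GFX6-10) from the saved filled sizes, or resets them when starting fresh.
 */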
static void si_emit_streamout_begin(struct si_context *sctx, unsigned index)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   bool first_target = true;

   if (sctx->gfx_level < GFX11)
      si_flush_vgt_streamout(sctx);

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      if (sctx->gfx_level >= GFX12) {
         /* Only the first streamout target holds information. */
         if (first_target) {
            if (sctx->streamout.append_bitmask & (1 << i)) {
               si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL,
                               R_0309B0_GE_GS_ORDERED_ID_BASE >> 2, COPY_DATA_SRC_MEM,
                               t[i]->buf_filled_size, t[i]->buf_filled_size_offset);
            } else {
               radeon_begin(cs);
               radeon_set_uconfig_reg(R_0309B0_GE_GS_ORDERED_ID_BASE, 0);
               radeon_end();
            }

            first_target = false;
         }
      } else if (sctx->gfx_level >= GFX11) {
         if (sctx->streamout.append_bitmask & (1 << i)) {
            /* Restore the register value. */
            si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL,
                            (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 / 4) + i,
                            COPY_DATA_SRC_MEM, t[i]->buf_filled_size,
                            t[i]->buf_filled_size_offset);
         } else {
            /* Set to 0. */
            radeon_begin(cs);
            radeon_set_uconfig_reg(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
            radeon_end();
         }
      } else {
         /* Legacy streamout.
          *
          * The hw binds streamout buffers as shader resources. VGT only counts primitives
          * and tells the shader through SGPRs what to do.
          */
         radeon_begin(cs);
         radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
         radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
         radeon_emit(sctx->streamout.stride_in_dw[i]);                    /* VTX_STRIDE (in DW) */

         if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
            uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

            /* Append. */
            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
            radeon_emit(0);                                              /* unused */
            radeon_emit(0);                                              /* unused */
            radeon_emit(va);                                             /* src address lo */
            radeon_emit(va >> 32);                                       /* src address hi */

            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
         } else {
            /* Start from the beginning. */
            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
            radeon_emit(0);                                                 /* unused */
            radeon_emit(0);                                                 /* unused */
            radeon_emit(t[i]->b.buffer_offset >> 2);                        /* buffer offset in DW */
            radeon_emit(0);                                                 /* unused */
         }
         radeon_end_update_context_roll();
      }
   }

   sctx->streamout.begin_emitted = true;
}

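/* Stop streamout. On GFX6-11, save the current filled sizes to buf_filled_size so that
 * streamout can be resumed later and DrawTransformFeedback can read them. On GFX12, the
 * streamout state buffer already holds everything that needs to be restored.
 */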
void si_emit_streamout_end(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;

   if (sctx->gfx_level >= GFX12) {
      /* Nothing to do. The streamout state buffer already contains the next ordered ID, which
       * is the only thing we need to restore.
       */
      sctx->streamout.begin_emitted = false;
      return;
   }

   if (sctx->gfx_level >= GFX11) {
      /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
      sctx->barrier_flags |= SI_BARRIER_SYNC_VS;
      si_emit_barrier_direct(sctx);
   } else {
      si_flush_vgt_streamout(sctx);
   }

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      if (sctx->gfx_level >= GFX11) {
         si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM,
                         t[i]->buf_filled_size, t[i]->buf_filled_size_offset,
                         COPY_DATA_REG, NULL,
                         (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
         /* For DrawTF reading buf_filled_size: */
         sctx->barrier_flags |= SI_BARRIER_PFP_SYNC_ME;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
      } else {
         uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                     STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(va);                               /* dst address lo */
         radeon_emit(va >> 32);                         /* dst address hi */
         radeon_emit(0);                                /* unused */
         radeon_emit(0);                                /* unused */

         /* Zero the buffer size. The counters (primitives generated,
          * primitives emitted) may be enabled even if there is no
          * buffer bound. This ensures that the primitives-emitted query
          * won't increment.
          */
         radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
         radeon_end_update_context_roll();

         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
                                   RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
      }

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}

/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

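/* Emit VGT_STRMOUT_CONFIG and VGT_STRMOUT_BUFFER_CONFIG from the derived enable state
 * (GFX6-10 only).
 */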
static void si_emit_streamout_enable(struct si_context *sctx, unsigned index)
{
   assert(sctx->gfx_level < GFX11);

   radeon_begin(&sctx->gfx_cs);
   radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
               S_028B94_RAST_STREAM(0) |
               S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
   radeon_end();
}

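/* Update the derived streamout enable state and hw_enabled_mask, and mark the enable atom
 * dirty if either changed. No-op on GFX11+, where this state is not used.
 */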
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   if (sctx->gfx_level >= GFX11)
      return;

   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   sctx->streamout.hw_enabled_mask =
      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);

   if ((old_strmout_en != si_get_strmout_en(sctx)) ||
       (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}

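/* Track active PRIMITIVES_GENERATED queries. On GFX6-10, streamout must be enabled while such
 * a query is active (see the comment above), and NGG may need to be toggled.
 */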
void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (sctx->gfx_level < GFX11 && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}

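/* Set up the streamout driver entry points and atom emit callbacks. */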
void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;
   sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;

   if (sctx->gfx_level < GFX11)
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
}