1 /*
2 * Copyright 2023 Alyssa Rosenzweig
3 * Copyright 2023 Valve Corporation
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "compiler/shader_enums.h"
8 #include "libagx.h"
9
10 #ifndef __OPENCL_VERSION__
11 #include "util/bitscan.h"
/* Host (non-OpenCL) definitions of the helpers used by the inline functions
 * in this header.
 */

/* On the host, a pointer to constant memory is carried as a raw 64-bit value;
 * the pointee type argument is ignored.
 */
#define CONST(type_) uint64_t

/* Number of set bits in a 64-bit mask. */
#define libagx_popcount(x) util_bitcount64(x)

/* Saturating subtraction: yields x - y when x >= y and 0 otherwise.
 *
 * Every use of a parameter is parenthesized so that compound arguments such
 * as (a + b) expand with the intended precedence. Each argument may be
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define libagx_sub_sat(x, y) (((x) >= (y)) ? ((x) - (y)) : 0)
15 #else
/* OpenCL build: the helpers map straight onto the language itself. */

/* Typed pointer qualified with the constant address space. */
#define CONST(type_) constant type_ *

/* Population count, forwarded to popcount(). */
#define libagx_popcount(x) popcount(x)

/* Saturating subtraction, forwarded to sub_sat(). */
#define libagx_sub_sat(x, y) sub_sat(x, y)
19 #endif
20
21 #ifndef LIBAGX_GEOMETRY_H
22 #define LIBAGX_GEOMETRY_H
23
/* Length of the per-buffer transform feedback arrays in agx_geometry_params
 * (offset pointers, base addresses and sizes).
 */
#define MAX_SO_BUFFERS 4

/* Length of the per-stream arrays in agx_geometry_params (primitive counters,
 * overflow flags and emitted primitive counts).
 */
#define MAX_VERTEX_STREAMS 4
26
27 /* Packed geometry state buffer */
28 struct agx_geometry_state {
29 /* Heap to allocate from. */
30 GLOBAL(uchar) heap;
31 uint32_t heap_bottom, heap_size;
32 } PACKED;
33 AGX_STATIC_ASSERT(sizeof(struct agx_geometry_state) == 4 * 4);
34
35 struct agx_restart_unroll_params {
36 /* Heap to allocate from across draws */
37 GLOBAL(struct agx_geometry_state) heap;
38
39 /* Input: index buffer if present. */
40 uint64_t index_buffer;
41
42 /* Input: draw count */
43 CONST(uint) count;
44
45 /* Input: indirect draw descriptor. Raw pointer since it's strided. */
46 uint64_t draws;
47
48 /* Output draw descriptors */
49 GLOBAL(uint) out_draws;
50
51 /* Pointer to zero */
52 uint64_t zero_sink;
53
54 /* Input: maximum draw count, count is clamped to this */
55 uint32_t max_draws;
56
57 /* Primitive restart index */
58 uint32_t restart_index;
59
60 /* Input index buffer size in elements */
61 uint32_t index_buffer_size_el;
62
63 /* Stride for the draw descriptor array */
64 uint32_t draw_stride;
65
66 /* Use first vertex as the provoking vertex for flat shading. We could stick
67 * this in the key, but meh, you're already hosed for perf on the unroll
68 * path.
69 */
70 uint32_t flatshade_first;
71 } PACKED;
72 AGX_STATIC_ASSERT(sizeof(struct agx_restart_unroll_params) == 17 * 4);
73
74 struct agx_gs_setup_indirect_params {
75 /* Index buffer if present. */
76 uint64_t index_buffer;
77
78 /* Indirect draw descriptor. */
79 CONST(uint) draw;
80
81 /* Pointer to be written with allocated vertex buffer */
82 GLOBAL(uintptr_t) vertex_buffer;
83
84 /* Output input assembly state */
85 GLOBAL(struct agx_ia_state) ia;
86
87 /* Output geometry parameters */
88 GLOBAL(struct agx_geometry_params) geom;
89
90 /* Pointer to zero */
91 uint64_t zero_sink;
92
93 /* Vertex (TES) output mask for sizing the allocated buffer */
94 uint64_t vs_outputs;
95
96 /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
97 uint32_t index_size_B;
98
99 /* Size of the index buffer */
100 uint32_t index_buffer_range_el;
101 } PACKED;
102 AGX_STATIC_ASSERT(sizeof(struct agx_gs_setup_indirect_params) == 16 * 4);
103
104 struct agx_ia_state {
105 /* Index buffer if present. */
106 uint64_t index_buffer;
107
108 /* Size of the bound index buffer for bounds checking */
109 uint32_t index_buffer_range_el;
110
111 /* Number of vertices per instance. Written by CPU for direct draw, indirect
112 * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
113 */
114 uint32_t verts_per_instance;
115 } PACKED;
116 AGX_STATIC_ASSERT(sizeof(struct agx_ia_state) == 4 * 4);
117
/*
 * Compute the address of element offset_el within an index buffer.
 *
 * index_buffer: base address of the index buffer
 * size_el:      size of the index buffer in elements
 * offset_el:    element offset to resolve
 * elsize_B:     size of one element in bytes
 * zero_sink:    address returned for out-of-bounds offsets
 *
 * Returns index_buffer advanced by offset_el elements when offset_el is
 * within the buffer, and zero_sink when offset_el is at or past size_el.
 */
static inline uint64_t
libagx_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
                    uint elsize_B, uint64_t zero_sink)
{
   if (offset_el >= size_el)
      return zero_sink;

   /* Widen before multiplying. Both operands are 32-bit, so the byte offset
    * would otherwise wrap at 4 GiB before being added to the 64-bit base.
    */
   return index_buffer + ((uint64_t)offset_el * elsize_B);
}
127
/*
 * Number of elements left in an index buffer of size_el elements once
 * offset_el elements have been skipped. The subtraction saturates, so an
 * offset past the end of the buffer yields 0.
 */
static inline uint
libagx_index_buffer_range_el(uint size_el, uint offset_el)
{
   uint remaining_el = libagx_sub_sat(size_el, offset_el);

   return remaining_el;
}
133
134 struct agx_geometry_params {
135 /* Persistent (cross-draw) geometry state */
136 GLOBAL(struct agx_geometry_state) state;
137
138 /* Address of associated indirect draw buffer */
139 GLOBAL(uint) indirect_desc;
140
141 /* Address of count buffer. For an indirect draw, this will be written by the
142 * indirect setup kernel.
143 */
144 GLOBAL(uint) count_buffer;
145
146 /* Address of the primitives generated counters */
147 GLOBAL(uint) prims_generated_counter[MAX_VERTEX_STREAMS];
148 GLOBAL(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS];
149 GLOBAL(uint) xfb_overflow[MAX_VERTEX_STREAMS];
150 GLOBAL(uint) xfb_any_overflow;
151
152 /* Pointers to transform feedback buffer offsets in bytes */
153 GLOBAL(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];
154
155 /* Output index buffer, allocated by pre-GS. */
156 GLOBAL(uint) output_index_buffer;
157
158 /* Address of transform feedback buffer in general, supplied by the CPU. */
159 GLOBAL(uchar) xfb_base_original[MAX_SO_BUFFERS];
160
161 /* Address of transform feedback for the current primitive. Written by pre-GS
162 * program.
163 */
164 GLOBAL(uchar) xfb_base[MAX_SO_BUFFERS];
165
166 /* Address and present mask for the input to the geometry shader. These will
167 * reflect the vertex shader for VS->GS or instead the tessellation
168 * evaluation shader for TES->GS.
169 */
170 uint64_t input_buffer;
171 uint64_t input_mask;
172
173 /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
174 uint64_t flat_outputs;
175
176 uint32_t xfb_size[MAX_SO_BUFFERS];
177
178 /* Number of primitives emitted by transform feedback per stream. Written by
179 * the pre-GS program.
180 */
181 uint32_t xfb_prims[MAX_VERTEX_STREAMS];
182
183 /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
184 * out by the GS indirect setup kernel or the CPU for a direct draw.
185 */
186 uint32_t vs_grid[3];
187 uint32_t gs_grid[3];
188
189 /* Number of input primitives across all instances, calculated by the CPU for
190 * a direct draw or the GS indirect setup kernel for an indirect draw.
191 */
192 uint32_t input_primitives;
193
194 /* Number of input primitives per instance, rounded up to a power-of-two and
195 * with the base-2 log taken. This is used to partition the output vertex IDs
196 * efficiently.
197 */
198 uint32_t primitives_log2;
199
200 /* Number of bytes output by the GS count shader per input primitive (may be
201 * 0), written by CPU and consumed by indirect draw setup shader for
202 * allocating counts.
203 */
204 uint32_t count_buffer_stride;
205
206 /* Dynamic input topology. Must be compatible with the geometry shader's
207 * layout() declared input class.
208 */
209 uint32_t input_topology;
210 } PACKED;
211 AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 82 * 4);
212
213 /* TCS shared memory layout:
214 *
215 * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
216 *
217 * TODO: compact.
218 */
219 static inline uint
libagx_tcs_in_offs(uint vtx,gl_varying_slot location,uint64_t crosslane_vs_out_mask)220 libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
221 uint64_t crosslane_vs_out_mask)
222 {
223 uint base = vtx * libagx_popcount(crosslane_vs_out_mask);
224 uint offs = libagx_popcount(crosslane_vs_out_mask &
225 (((uint64_t)(1) << location) - 1));
226
227 return (base + offs) * 16;
228 }
229
/*
 * Size in bytes of the TCS input for one patch: one 16-byte slot per bit set
 * in crosslane_vs_out_mask, for each of the vertices_in_patch vertices.
 */
static inline uint
libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
   uint slots_per_vtx = libagx_popcount(crosslane_vs_out_mask);
   uint total_slots = vertices_in_patch * slots_per_vtx;

   return total_slots * 16;
}
235
236 /*
237 * TCS out buffer layout, per-patch:
238 *
239 * float tess_level_outer[4];
240 * float tess_level_inner[2];
241 * vec4 patch_out[MAX_PATCH_OUTPUTS];
242 * vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
243 *
244 * Vertex out are compacted based on the mask of written out. Patch
245 * out are used as-is.
246 *
247 * Bounding boxes are ignored.
248 */
249 static inline uint
libagx_tcs_out_offs(uint vtx_id,gl_varying_slot location,uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)250 libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
251 uint out_patch_size, uint64_t vtx_out_mask)
252 {
253 uint off = 0;
254 if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
255 return off;
256
257 off += 4 * sizeof(float);
258 if (location == VARYING_SLOT_TESS_LEVEL_INNER)
259 return off;
260
261 off += 2 * sizeof(float);
262 if (location >= VARYING_SLOT_PATCH0)
263 return off + (16 * (location - VARYING_SLOT_PATCH0));
264
265 /* Anything else is a per-vtx output */
266 off += 16 * nr_patch_out;
267 off += 16 * vtx_id * libagx_popcount(vtx_out_mask);
268
269 uint idx = libagx_popcount(vtx_out_mask & (((uint64_t)(1) << location) - 1));
270 return off + (16 * idx);
271 }
272
273 static inline uint
libagx_tcs_out_stride(uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)274 libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
275 uint64_t vtx_out_mask)
276 {
277 return libagx_tcs_out_offs(out_patch_size, VARYING_SLOT_VAR0, nr_patch_out,
278 out_patch_size, vtx_out_mask);
279 }
280
/* Stride used for the hardware vertex ID in a tessellation evaluation shader.
 * NOTE(review): the name suggests this is the step between consecutive patch
 * IDs; confirm against the users of this constant.
 */
#define LIBAGX_TES_PATCH_ID_STRIDE 8192
283
284 #endif
285