xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/shaders/geometry.cl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1/*
2 * Copyright 2023 Alyssa Rosenzweig
3 * Copyright 2023 Valve Corporation
4 * SPDX-License-Identifier: MIT
5 */
6
7#include "shaders/tessellator.h"
8#include "geometry.h"
9
/* Compatible with util/u_math.h */
static inline uint
util_logbase2_ceil(uint n)
{
   /* ceil(log2(n)), with 0 and 1 both mapping to 0 */
   return (n <= 1) ? 0 : (32 - clz(n - 1));
}
19
/* Swap the two non-provoking vertices third vert in odd triangles. This
 * generates a vertex ID list with a consistent winding order.
 *
 * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own
 * inverse. This lets us reuse it for both vertex fetch and transform feedback.
 */
uint
libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
{
   /* Index of the provoking vertex within the triangle */
   uint pv = flatshade_first ? 0 : 2;

   /* Even triangles and the provoking vertex itself are left alone */
   if ((prim & 1) == 0 || vert == pv)
      return vert;

   /* Swap the two non-provoking vertices of the odd triangle */
   return (3 - pv) - vert;
}
36
/* Address in transform feedback buffer `buffer` for output `output_offset` of
 * vertex (base_index + vert), given the buffer's per-vertex stride in bytes.
 */
uint64_t
libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index,
                          uint vert, uint buffer, uint stride,
                          uint output_offset)
{
   /* Byte offset of this vertex's output within the XFB buffer */
   uint xfb_offset = ((base_index + vert) * stride) + output_offset;

   return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
}
47
uint
libagx_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
{
   /* Lines are (0, 1), (1, 2), ..., (n - 1, 0): the final line wraps back
    * to the first vertex, closing the loop.
    */
   bool closing_edge = (prim == num_prims - 1) && (vert == 1);

   return closing_edge ? 0 : (prim + vert);
}
57
/* Vertex ID for the line class of topologies: line list, strip, or loop. */
uint
libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
                                uint num_prims)
{
   /* A line loop's final primitive wraps back to vertex 0 */
   if (mode == MESA_PRIM_LINE_LOOP && vert == 1 && prim == (num_prims - 1))
      return 0;

   /* Lists stride by 2 vertices per primitive; strips/loops stride by 1 */
   uint base = (mode == MESA_PRIM_LINES) ? (prim * 2) : prim;

   return base + vert;
}
71
uint
libagx_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
{
   /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
    * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
    * Piglit clipflat expects us to switch between these orders depending on
    * provoking vertex, to avoid trivializing the fan.
    *
    * Rotate accordingly.
    */
   if (flatshade_first) {
      if (vert == 2)
         vert = 0;
      else
         vert += 1;
   }

   /* After rotation, vertex 0 is always the shared fan centre */
   if (vert == 0)
      return 0;

   return prim + vert;
}
89
/* Vertex ID for the triangle class: triangle list, strip, or fan. */
uint
libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
                               bool flatshade_first)
{
   if (mode == MESA_PRIM_TRIANGLE_FAN) {
      /* Rotate for provoking-first fans (see libagx_vertex_id_for_tri_fan) */
      if (flatshade_first) {
         vert++;
         if (vert == 3)
            vert = 0;
      }

      /* Every fan triangle shares vertex 0 */
      if (vert == 0)
         return 0;
   }

   switch (mode) {
   case MESA_PRIM_TRIANGLES:
      /* Lists stride by 3 vertices per primitive */
      return (prim * 3) + vert;

   case MESA_PRIM_TRIANGLE_STRIP: {
      /* Odd strip triangles swap the non-provoking vertices to keep a
       * consistent winding order.
       */
      uint pv = flatshade_first ? 0 : 2;
      bool swap = ((prim & 1) != 0) && (vert != pv);

      return prim + (swap ? ((3 - pv) - vert) : vert);
   }

   default:
      /* Triangle fan (non-centre vertices) */
      return prim + vert;
   }
}
117
/* Vertex ID for line list adjacency or line strip adjacency. */
uint
libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
{
   /* Lists stride by 4 vertices per primitive; strips stride by 1 */
   uint stride = (mode == MESA_PRIM_LINES_ADJACENCY) ? 4 : 1;

   return (prim * stride) + vert;
}
127
/* Map (primitive, vertex) to a vertex ID for triangle strips with adjacency,
 * handling the per-primitive vertex orderings required by the APIs.
 */
uint
libagx_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
                                   bool flatshade_first)
{
   /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
    *
    * There are different cases for first/middle/last/only primitives and for
    * odd/even primitives.  Determine which case we're in.
    */
   bool last = prim == (num_prims - 1);
   bool first = prim == 0;
   bool even = (prim & 1) == 0;
   bool even_or_first = even || first;

   /* When the last vertex is provoking, we rotate the primitives
    * accordingly. This seems required for OpenGL.
    */
   if (!flatshade_first && !even_or_first) {
      vert = (vert + 4u) % 6u;
   }

   /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
    * there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
    *
    * Here we assume the first vertex is provoking, the Vulkan default.
    *
    * Note the -2 entries rely on unsigned wraparound: adding the wrapped
    * value to (prim * 2) below yields the intended subtraction.
    */
   uint offsets[6] = {
      0,
      first ? 1 : (even ? -2 : 3),
      even_or_first ? 2 : 4,
      last ? 5 : 6,
      even_or_first ? 4 : 2,
      even_or_first ? 3 : -2,
   };

   /* Ensure NIR can see thru the local array: select the offset with a
    * branch-free scan instead of a dynamic index.
    */
   uint offset = 0;
   for (uint i = 1; i < 6; ++i) {
      if (i == vert)
         offset = offsets[i];
   }

   /* Finally add to the base of the primitive (strips advance 2 verts/prim) */
   return (prim * 2) + offset;
}
173
/* Vertex ID for triangle list adjacency or triangle strip adjacency. */
uint
libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
                                   uint nr, bool flatshade_first)
{
   /* Lists stride by 6 vertices per primitive */
   if (mode != MESA_PRIM_TRIANGLE_STRIP_ADJACENCY)
      return (6 * prim) + vert;

   /* Strips need the full per-case mapping */
   return libagx_vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
}
186
/* Map (primitive, vertex) to a vertex ID for any supported topology. */
uint
libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
                              uint prim, uint vert, uint num_prims)
{
   switch (mode) {
   case MESA_PRIM_LINE_LOOP:
      return libagx_vertex_id_for_line_loop(prim, vert, num_prims);

   case MESA_PRIM_LINE_STRIP:
   case MESA_PRIM_LINE_STRIP_ADJACENCY:
      /* Strips advance one vertex per primitive: (i, i + 1) or (i, ..., i + 3) */
      return prim + vert;

   case MESA_PRIM_TRIANGLE_STRIP:
      /* Order depends on the provoking vert.
       *
       * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
       * Last:  (0, 1, 2), (2, 1, 3), (2, 3, 4).
       *
       * Pull the (maybe swapped) vert from the corresponding primitive.
       */
      return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);

   case MESA_PRIM_TRIANGLE_FAN:
      return libagx_vertex_id_for_tri_fan(prim, vert, flatshade_first);

   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
      return libagx_vertex_id_for_tri_strip_adj(prim, vert, num_prims,
                                                flatshade_first);

   case MESA_PRIM_POINTS:
   case MESA_PRIM_LINES:
   case MESA_PRIM_TRIANGLES:
   case MESA_PRIM_LINES_ADJACENCY:
   case MESA_PRIM_TRIANGLES_ADJACENCY:
      /* Regular list topology: every N vertices defines a primitive */
      return (prim * mesa_vertices_per_prim(mode)) + vert;

   default:
      return 0;
   }
}
230
/* Load one index of the given size (1/2/4 bytes) with robust bounds checks. */
uint
libagx_load_index_buffer_internal(uintptr_t index_buffer,
                                  uint32_t index_buffer_range_el, uint id,
                                  uint index_size)
{
   bool oob = id >= index_buffer_range_el;

   /* If the load would be out-of-bounds, load the first element which is
    * assumed valid. If the application index buffer is empty with robustness2,
    * index_buffer will point to a zero sink where only the first is valid.
    */
   uint clamped = oob ? 0 : id;

   uint el;
   switch (index_size) {
   case 1:
      el = ((constant uint8_t *)index_buffer)[clamped];
      break;
   case 2:
      el = ((constant uint16_t *)index_buffer)[clamped];
      break;
   default:
      el = ((constant uint32_t *)index_buffer)[clamped];
      break;
   }

   /* D3D robustness semantics: out-of-bounds reads return 0. TODO: Optimize? */
   return oob ? 0 : el;
}
262
/* Robustly load one index using the bounds recorded in the IA state. */
uint
libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
                         uint index_size)
{
   uintptr_t buffer = p->index_buffer;
   uint32_t range_el = p->index_buffer_range_el;

   return libagx_load_index_buffer_internal(buffer, range_el, id, index_size);
}
270
/*
 * Return the ID of the first thread in the workgroup where cond is true, or
 * a value >= 1024 if cond is false across the whole workgroup.
 */
static uint
first_true_thread_in_workgroup(bool cond, local uint *scratch)
{
   /* Publish each subgroup's ballot of cond to local memory. scratch needs
    * one slot per subgroup (32 for a 1024-thread workgroup of 32-wide
    * subgroups -- assumed throughout; TODO confirm).
    */
   barrier(CLK_LOCAL_MEM_FENCE);
   scratch[get_sub_group_id()] = ballot(cond);
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Lane i reads subgroup i's ballot; a second ballot of the nonzero-ness
    * then finds the first subgroup containing any true lane.
    */
   uint first_group = ctz(ballot(scratch[get_sub_group_local_id()]));

   /* First true lane within that subgroup (feeding 0 keeps the guarded read
    * in bounds when no subgroup had a true lane).
    */
   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
   return (first_group * 32) + off;
}
286
/*
 * Allocate memory from the heap (thread-safe). Returns the offset into the
 * heap. The allocation will be word-aligned.
 */
static inline uint
libagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B)
{
   /* Round the size up so consecutive allocations stay 8-byte aligned */
   uint aligned_B = align(size_B, 8);

   /* Bump the heap pointer atomically; the old value is our offset */
   volatile atomic_uint *bottom = (volatile atomic_uint *)&heap->heap_bottom;
   return atomic_fetch_add(bottom, aligned_B);
}
297
/*
 * When unrolling the index buffer for a draw, we translate the old indirect
 * draws to new indirect draws. This routine allocates the new index buffer and
 * sets up most of the new draw descriptor.
 */
static global void *
setup_unroll_for_draw(global struct agx_restart_unroll_params *p,
                      constant uint *in_draw, uint draw, enum mesa_prim mode,
                      uint index_size_B)
{
   /* Determine an upper bound on the memory required for the index buffer.
    * Restarts only decrease the unrolled index buffer size, so the maximum
    * size is the unrolled size when the input has no restarts.
    */
   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
   uint alloc_B = max_prims * mesa_vertices_per_prim(mode) * index_size_B;

   /* Allocate unrolled index buffer. Atomic since multiple threads may be
    * running to handle multidraw in parallel.
    */
   global struct agx_geometry_state *heap = p->heap;
   uint old_bottom_B = libagx_atomic_alloc(heap, alloc_B);

   /* Regardless of the input stride, we use tightly packed output draws */
   global uint *out = &p->out_draws[5 * draw];

   /* Setup most of the descriptor; out[0] (count) is written after unroll */
   out[1] = in_draw[1];              /* instance count */
   out[2] = old_bottom_B / index_size_B; /* index offset */
   out[3] = in_draw[3];              /* index bias */
   out[4] = in_draw[4];              /* base instance */

   /* Return the index buffer we allocated */
   return (global uchar *)heap->heap + old_bottom_B;
}
334
/*
 * Unroll a primitive-restart index buffer of INDEX-typed elements into a
 * restart-free index buffer, translating the indirect draw accordingly.
 * One 1024-thread workgroup handles one draw; `tid` is the thread's index
 * within that workgroup. Instantiated once per index size below.
 */
#define UNROLL(INDEX, suffix)                                                  \
   kernel void libagx_unroll_restart_##suffix(                                 \
      global struct agx_restart_unroll_params *p, enum mesa_prim mode,         \
      uint draw, uint tid)                                                     \
   {                                                                           \
      /* For an indirect multidraw, we are dispatched maxDraws times and       \
       * terminate trailing invocations.                                       \
       */                                                                      \
      if (p->count && draw >= *(p->count))                                     \
         return;                                                               \
                                                                               \
      constant uint *in_draw =                                                 \
         (constant uint *)(p->draws + (draw * p->draw_stride));                \
                                                                               \
      uint count = in_draw[0];                                                 \
                                                                               \
      local uintptr_t out_ptr, in_ptr;                                         \
      if (tid == 0) {                                                          \
         out_ptr = (uintptr_t)setup_unroll_for_draw(p, in_draw, draw, mode,    \
                                                    sizeof(INDEX));            \
                                                                               \
         /* Accessed thru local mem because NIR deref is too aggressive */     \
         in_ptr = (uintptr_t)(libagx_index_buffer(                             \
            p->index_buffer, p->index_buffer_size_el, in_draw[2],              \
            sizeof(INDEX), p->zero_sink));                                     \
      }                                                                        \
                                                                               \
      barrier(CLK_LOCAL_MEM_FENCE);                                            \
      global INDEX *out = (global INDEX *)out_ptr;                             \
                                                                               \
      local uint scratch[32];                                                  \
                                                                               \
      uint out_prims = 0;                                                      \
      INDEX restart_idx = p->restart_index;                                    \
      bool flatshade_first = p->flatshade_first;                               \
                                                                               \
      uint needle = 0;                                                         \
      uint per_prim = mesa_vertices_per_prim(mode);                            \
      while (needle < count) {                                                 \
         /* Search for next restart or the end. Lanes load in parallel. */     \
         uint next_restart = needle;                                           \
         for (;;) {                                                            \
            uint idx = next_restart + tid;                                     \
            bool restart =                                                     \
               idx >= count || libagx_load_index_buffer_internal(              \
                                  in_ptr, p->index_buffer_size_el, idx,        \
                                  sizeof(INDEX)) == restart_idx;               \
                                                                               \
            uint next_offs = first_true_thread_in_workgroup(restart, scratch); \
                                                                               \
            next_restart += next_offs;                                         \
            if (next_offs < 1024)                                              \
               break;                                                          \
         }                                                                     \
                                                                               \
         /* Emit up to the next restart. Lanes output in parallel */           \
         uint subcount = next_restart - needle;                                \
         uint subprims = u_decomposed_prims_for_vertices(mode, subcount);      \
         uint out_prims_base = out_prims;                                      \
         for (uint i = tid; i < subprims; i += 1024) {                         \
            for (uint vtx = 0; vtx < per_prim; ++vtx) {                        \
               uint id = libagx_vertex_id_for_topology(mode, flatshade_first,  \
                                                       i, vtx, subprims);      \
               uint offset = needle + id;                                      \
                                                                               \
               out[((out_prims_base + i) * per_prim) + vtx] =                  \
                  libagx_load_index_buffer_internal(                           \
                     in_ptr, p->index_buffer_size_el, offset, sizeof(INDEX));  \
            }                                                                  \
         }                                                                     \
                                                                               \
         out_prims += subprims;                                                \
         needle = next_restart + 1;                                            \
      }                                                                        \
                                                                               \
      if (tid == 0)                                                            \
         p->out_draws[(5 * draw) + 0] = out_prims * per_prim;                  \
   }

/* One unroll kernel per index size */
UNROLL(uchar, u8)
UNROLL(ushort, u16)
UNROLL(uint, u32)
417
/* Apply the dynamic offset to XFB buffer i, returning the offset applied. */
uint
libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i)
{
   /* A null offset pointer means there is nothing to apply for this buffer */
   global uint *off_ptr = p->xfb_offs_ptrs[i];
   if (off_ptr == NULL)
      return 0;

   uint offset = *off_ptr;
   p->xfb_base[i] = p->xfb_base_original[i] + offset;
   return offset;
}
429
/*
 * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into
 * writes into the 32-bit output index buffer. We write the sequence (b, b + 1,
 * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n
 * (count) is the number of verts in the prims, and -1 is the prim restart index
 * used to signal the end of the prim.
 *
 * For points, we write index buffers without restart, just as a sideband to
 * pass data into the vertex shader.
 */
void
libagx_end_primitive(global int *index_buffer, uint total_verts,
                     uint verts_in_prim, uint total_prims,
                     uint invocation_vertex_base, uint invocation_prim_base,
                     uint geometry_base, bool restart)
{
   /* Previous verts/prims are from previous invocations plus earlier prims in
    * this invocation. NIR hands us inclusive sums, so subtract this prim's own
    * contribution to get the intra-invocation counts before it.
    */
   uint prev_verts_in_invoc = total_verts - verts_in_prim;
   uint prev_verts = invocation_vertex_base + prev_verts_in_invoc;

   /* Each completed prim also wrote one restart index; points write none */
   uint prev_prims = 0;
   if (restart)
      prev_prims = invocation_prim_base + (total_prims - 1);

   /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */
   uint index_base = geometry_base + prev_verts_in_invoc;

   /* Index buffer contains 1 index for each vertex and 1 for each prim */
   global int *out = index_buffer + prev_verts + prev_prims;

   /* Sequential indices for the strip, then the restart sentinel */
   for (uint v = 0; v < verts_in_prim; ++v)
      out[v] = index_base + v;

   if (restart)
      out[verts_in_prim] = -1;
}
468
/* Build the indirect draw descriptor for the GS output, allocating the output
 * index buffer from the heap.
 */
void
libagx_build_gs_draw(global struct agx_geometry_params *p, uint vertices,
                     uint primitives)
{
   global uint *descriptor = p->indirect_desc;
   global struct agx_geometry_state *state = p->state;

   /* Setup the indirect draw descriptor */
   uint indices = vertices + primitives; /* includes restart indices */

   /* Allocate the index buffer with a plain (non-atomic) heap bump.
    * NOTE(review): presumably a single invocation runs this -- confirm.
    */
   uint index_buffer_offset_B = state->heap_bottom;
   p->output_index_buffer =
      (global uint *)(state->heap + index_buffer_offset_B);
   state->heap_bottom += (indices * 4); /* 4-byte indices */

   descriptor[0] = indices;                   /* count */
   descriptor[1] = 1;                         /* instance count */
   descriptor[2] = index_buffer_offset_B / 4; /* start */
   descriptor[3] = 0;                         /* index bias */
   descriptor[4] = 0;                         /* start instance */

   /* NOTE(review): store to a poison address, apparently a deliberate fault
    * to signal heap overflow during bring-up -- confirm intent.
    */
   if (state->heap_bottom > state->heap_size) {
      global uint *foo = (global uint *)(uintptr_t)0xdeadbeef;
      *foo = 0x1234;
   }
}
496
/* Set up geometry-shader state for an indirect draw: read the draw to size the
 * VS/GS grids, apply the index buffer offset, and allocate scratch buffers.
 */
void
libagx_gs_setup_indirect(global struct agx_gs_setup_indirect_params *gsi,
                         enum mesa_prim mode, uint local_id)
{
   global struct agx_geometry_params *p = gsi->geom;
   global struct agx_ia_state *ia = gsi->ia;

   /* Determine the (primitives, instances) grid size. */
   uint vertex_count = gsi->draw[0];
   uint instance_count = gsi->draw[1];

   ia->verts_per_instance = vertex_count;

   /* Calculate number of primitives input into the GS */
   uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count);
   p->input_primitives = prim_per_instance * instance_count;

   /* Invoke VS as (vertices, instances); GS as (primitives, instances) */
   p->vs_grid[0] = vertex_count;
   p->vs_grid[1] = instance_count;

   p->gs_grid[0] = prim_per_instance;
   p->gs_grid[1] = instance_count;

   /* Rounded-up log2 of primitives per instance (consumer not visible here) */
   p->primitives_log2 = util_logbase2_ceil(prim_per_instance);

   /* If indexing is enabled, the third word is the offset into the index buffer
    * in elements. Apply that offset now that we have it. For a hardware
    * indirect draw, the hardware would do this for us, but for software input
    * assembly we need to do it ourselves.
    */
   if (gsi->index_size_B) {
      ia->index_buffer =
         libagx_index_buffer(gsi->index_buffer, gsi->index_buffer_range_el,
                             gsi->draw[2], gsi->index_size_B, gsi->zero_sink);

      ia->index_buffer_range_el =
         libagx_index_buffer_range_el(gsi->index_buffer_range_el, gsi->draw[2]);
   }

   /* We need to allocate VS and GS count buffers, do so now. Plain heap bumps;
    * NOTE(review): presumably a single thread executes this -- confirm.
    */
   global struct agx_geometry_state *state = p->state;

   uint vertex_buffer_size =
      libagx_tcs_in_size(vertex_count * instance_count, gsi->vs_outputs);

   /* Count buffer: one stride per input primitive, 16-byte aligned */
   p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
   state->heap_bottom +=
      align(p->input_primitives * p->count_buffer_stride, 16);

   /* Input (vertex output) buffer, also published to the caller's pointer */
   p->input_buffer = (uintptr_t)(state->heap + state->heap_bottom);
   *(gsi->vertex_buffer) = p->input_buffer;
   state->heap_bottom += align(vertex_buffer_size, 4);

   p->input_mask = gsi->vs_outputs;

   /* NOTE(review): poison-address store, apparently a deliberate fault on
    * heap overflow -- confirm intent.
    */
   if (state->heap_bottom > state->heap_size) {
      global uint *foo = (global uint *)(uintptr_t)0x1deadbeef;
      *foo = 0x1234;
   }
}
558
/*
 * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
 * manually with subgroup ops and local memory since Mesa doesn't do those
 * lowerings yet.
 *
 * Broadcasting from lane 31 and indexing scratch by subgroup ID both assume
 * 32-wide subgroups and (up to) 32 subgroups, i.e. a 1024-thread workgroup
 * with 32 uints of scratch.
 */
static uint2
libagx_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
   uint sg_id = get_sub_group_id();

   /* Partial prefix sum of the subgroup */
   uint sg = sub_group_scan_inclusive_add(x);

   /* Reduction (sum) for the subgroup: lane 31 holds the inclusive total */
   uint sg_sum = sub_group_broadcast(sg, 31);

   /* Write out all the subgroups sums */
   barrier(CLK_LOCAL_MEM_FENCE);
   scratch[sg_id] = sg_sum;
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
    * threads in subgroup T.
    */
   uint other_sum = scratch[get_sub_group_local_id()];

   /* Exclusive sum the subgroup sums to get the total before the current group,
    * which can be added to the total for the current group.
    */
   uint other_sums = sub_group_scan_exclusive_add(other_sum);
   uint base = sub_group_broadcast(other_sums, sg_id);
   uint prefix = base + sg;

   /* Reduce the workgroup using the prefix sum we already did: lane 31's
    * exclusive sum plus its own value is the workgroup total.
    */
   uint reduction = sub_group_broadcast(other_sums + other_sum, 31);

   return (uint2)(prefix, reduction);
}
597
/* In-place inclusive prefix sum over field `word` of `len` records of `words`
 * uints each. Hardcodes a 1024-thread workgroup (see the 1024 strides below).
 */
kernel void
libagx_prefix_sum(global uint *buffer, uint len, uint words, uint word)
{
   /* One partial-sum slot per subgroup */
   local uint scratch[32];
   uint tid = get_local_id(0);

   /* Main loop: complete workgroups processing 1024 values at once */
   uint i, count = 0;
   uint len_remainder = len % 1024;
   uint len_rounded_down = len - len_remainder;

   for (i = tid; i < len_rounded_down; i += 1024) {
      global uint *ptr = &buffer[(i * words) + word];
      uint value = *ptr;
      uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch);

      /* Block-local scan plus the running total of all previous blocks */
      *ptr = count + sums[0];
      count += sums[1];
   }

   /* The last iteration is special since we won't have a full workgroup of
    * valid elements; inactive threads contribute 0 and we don't advance count.
    */
   global uint *ptr = &buffer[(i * words) + word];
   uint value = (tid < len_remainder) ? *ptr : 0;
   uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0];

   if (tid < len_remainder) {
      *ptr = count + scan;
   }
}
629
/* Prefix-sum the per-patch index counts, then allocate the tessellation index
 * buffer and emit the API indexed draw for the summed total.
 */
kernel void
libagx_prefix_sum_tess(global struct libagx_tess_args *p)
{
   /* In-place inclusive scan of the per-patch counts */
   libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */);

   /* After prefix summing, we know the total # of indices, so allocate the
    * index buffer now. Elect a thread for the allocation.
    */
   barrier(CLK_LOCAL_MEM_FENCE);
   if (get_local_id(0) != 0)
      return;

   /* The last element of an inclusive prefix sum is the total sum */
   uint total = p->counts[p->nr_patches - 1];

   /* Allocate 4-byte indices. Single elected thread, so a plain bump of
    * heap_bottom suffices; keep the heap 8-byte aligned afterwards.
    */
   uint32_t elsize_B = sizeof(uint32_t);
   uint32_t size_B = total * elsize_B;
   uint alloc_B = p->heap->heap_bottom;
   p->heap->heap_bottom += size_B;
   p->heap->heap_bottom = align(p->heap->heap_bottom, 8);

   p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B);

   /* ...and now we can generate the API indexed draw */
   global uint32_t *desc = p->out_draws;

   desc[0] = total;              /* count */
   desc[1] = 1;                  /* instance_count */
   desc[2] = alloc_B / elsize_B; /* start */
   desc[3] = 0;                  /* index_bias */
   desc[4] = 0;                  /* start_instance */
}
663
/* Address of vertex vtx's output for `location` within a packed VS output
 * buffer laid out according to `mask`.
 */
uintptr_t
libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
                             gl_varying_slot location)
{
   uintptr_t offset_B = libagx_tcs_in_offs(vtx, location, mask);

   return buffer + offset_B;
}
670
/* Address of a GS input: vertex vtx's `location` output in the input buffer. */
uintptr_t
libagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx,
                              gl_varying_slot location)
{
   uintptr_t base = p->input_buffer;
   uint64_t mask = p->input_mask;

   return libagx_vertex_output_address(base, mask, vtx, location);
}
678
/* Number of input vertices per instance recorded in the IA state. */
unsigned
libagx_input_vertices(constant struct agx_ia_state *ia)
{
   unsigned verts = ia->verts_per_instance;
   return verts;
}
684