xref: /aosp_15_r20/external/mesa3d/src/compiler/nir/nir_gather_xfb_info.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2018 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "nir_xfb_info.h"
25 
26 #include "util/u_dynarray.h"
27 #include <util/u_math.h>
28 
29 static void
add_var_xfb_varying(nir_xfb_info * xfb,nir_xfb_varyings_info * varyings,unsigned buffer,unsigned offset,const struct glsl_type * type)30 add_var_xfb_varying(nir_xfb_info *xfb,
31                     nir_xfb_varyings_info *varyings,
32                     unsigned buffer,
33                     unsigned offset,
34                     const struct glsl_type *type)
35 {
36    if (varyings == NULL)
37       return;
38 
39    nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++];
40 
41    varying->type = type;
42    varying->buffer = buffer;
43    varying->offset = offset;
44    xfb->buffers[buffer].varying_count++;
45 }
46 
47 static nir_xfb_info *
nir_xfb_info_create(void * mem_ctx,uint16_t output_count)48 nir_xfb_info_create(void *mem_ctx, uint16_t output_count)
49 {
50    return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count));
51 }
52 
53 static size_t
nir_xfb_varyings_info_size(uint16_t varying_count)54 nir_xfb_varyings_info_size(uint16_t varying_count)
55 {
56    return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count;
57 }
58 
59 static nir_xfb_varyings_info *
nir_xfb_varyings_info_create(void * mem_ctx,uint16_t varying_count)60 nir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count)
61 {
62    return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count));
63 }
64 
65 static void
add_var_xfb_outputs(nir_xfb_info * xfb,nir_xfb_varyings_info * varyings,nir_variable * var,unsigned buffer,unsigned * location,unsigned * offset,const struct glsl_type * type,bool varying_added)66 add_var_xfb_outputs(nir_xfb_info *xfb,
67                     nir_xfb_varyings_info *varyings,
68                     nir_variable *var,
69                     unsigned buffer,
70                     unsigned *location,
71                     unsigned *offset,
72                     const struct glsl_type *type,
73                     bool varying_added)
74 {
75    /* If this type contains a 64-bit value, align to 8 bytes */
76    if (glsl_type_contains_64bit(type))
77       *offset = ALIGN_POT(*offset, 8);
78 
79    if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
80       unsigned length = glsl_get_length(type);
81 
82       const struct glsl_type *child_type = glsl_get_array_element(type);
83       if (!glsl_type_is_array(child_type) &&
84           !glsl_type_is_struct(child_type)) {
85 
86          add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
87          varying_added = true;
88       }
89 
90       for (unsigned i = 0; i < length; i++)
91          add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
92                              child_type, varying_added);
93    } else if (glsl_type_is_struct_or_ifc(type)) {
94       unsigned length = glsl_get_length(type);
95       for (unsigned i = 0; i < length; i++) {
96          const struct glsl_type *child_type = glsl_get_struct_field(type, i);
97          add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
98                              child_type, varying_added);
99       }
100    } else {
101       assert(buffer < NIR_MAX_XFB_BUFFERS);
102       if (xfb->buffers_written & (1 << buffer)) {
103          assert(xfb->buffers[buffer].stride == var->data.xfb.stride);
104          assert(xfb->buffer_to_stream[buffer] == var->data.stream);
105       } else {
106          xfb->buffers_written |= (1 << buffer);
107          xfb->buffers[buffer].stride = var->data.xfb.stride;
108          xfb->buffer_to_stream[buffer] = var->data.stream;
109       }
110 
111       assert(var->data.stream < NIR_MAX_XFB_STREAMS);
112       xfb->streams_written |= (1 << var->data.stream);
113 
114       unsigned comp_slots;
115       if (var->data.compact) {
116          /* This only happens for clip/cull which are float arrays */
117          assert(glsl_without_array(type) == glsl_float_type());
118          assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
119                 var->data.location == VARYING_SLOT_CLIP_DIST1);
120          comp_slots = glsl_get_length(type);
121       } else {
122          comp_slots = glsl_get_component_slots(type);
123 
124          UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
125          assert(attrib_slots == glsl_count_attribute_slots(type, false));
126 
127          /* Ensure that we don't have, for instance, a dvec2 with a
128           * location_frac of 2 which would make it crass a location boundary
129           * even though it fits in a single slot.  However, you can have a
130           * dvec3 which crosses the slot boundary with a location_frac of 2.
131           */
132          assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) ==
133                 attrib_slots);
134       }
135 
136       assert(var->data.location_frac + comp_slots <= 8);
137       uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
138       unsigned comp_offset = var->data.location_frac;
139 
140       if (!varying_added) {
141          add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
142       }
143 
144       while (comp_mask) {
145          nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];
146 
147          output->buffer = buffer;
148          output->offset = *offset;
149          output->location = *location;
150          output->component_mask = comp_mask & 0xf;
151          output->component_offset = comp_offset;
152 
153          *offset += util_bitcount(output->component_mask) * 4;
154          (*location)++;
155          comp_mask >>= 4;
156          comp_offset = 0;
157       }
158    }
159 }
160 
161 static int
compare_xfb_varying_offsets(const void * _a,const void * _b)162 compare_xfb_varying_offsets(const void *_a, const void *_b)
163 {
164    const nir_xfb_varying_info *a = _a, *b = _b;
165 
166    if (a->buffer != b->buffer)
167       return a->buffer - b->buffer;
168 
169    return a->offset - b->offset;
170 }
171 
172 static int
compare_xfb_output_offsets(const void * _a,const void * _b)173 compare_xfb_output_offsets(const void *_a, const void *_b)
174 {
175    const nir_xfb_output_info *a = _a, *b = _b;
176 
177    return a->offset - b->offset;
178 }
179 
180 void
nir_shader_gather_xfb_info(nir_shader * shader)181 nir_shader_gather_xfb_info(nir_shader *shader)
182 {
183    nir_gather_xfb_info_with_varyings(shader, NULL, NULL);
184 }
185 
186 void
nir_gather_xfb_info_with_varyings(nir_shader * shader,void * mem_ctx,nir_xfb_varyings_info ** varyings_info_out)187 nir_gather_xfb_info_with_varyings(nir_shader *shader,
188                                   void *mem_ctx,
189                                   nir_xfb_varyings_info **varyings_info_out)
190 {
191    assert(shader->info.stage == MESA_SHADER_VERTEX ||
192           shader->info.stage == MESA_SHADER_TESS_EVAL ||
193           shader->info.stage == MESA_SHADER_GEOMETRY);
194 
195    /* Compute the number of outputs we have.  This is simply the number of
196     * cumulative locations consumed by all the variables.  If a location is
197     * represented by multiple variables, then they each count separately in
198     * number of outputs.  This is only an estimate as some variables may have
199     * an xfb_buffer but not an output so it may end up larger than we need but
200     * it should be good enough for allocation.
201     */
202    unsigned num_outputs = 0;
203    unsigned num_varyings = 0;
204    nir_xfb_varyings_info *varyings_info = NULL;
205    nir_foreach_shader_out_variable(var, shader) {
206       if (var->data.explicit_xfb_buffer) {
207          num_outputs += glsl_count_attribute_slots(var->type, false);
208          num_varyings += glsl_varying_count(var->type);
209       }
210    }
211    if (num_outputs == 0 || num_varyings == 0)
212       return;
213 
214    nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs);
215    if (varyings_info_out != NULL) {
216       *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings);
217       varyings_info = *varyings_info_out;
218    }
219 
220    /* Walk the list of outputs and add them to the array */
221    nir_foreach_shader_out_variable(var, shader) {
222       if (!var->data.explicit_xfb_buffer)
223          continue;
224 
225       unsigned location = var->data.location;
226 
227       /* In order to know if we have a array of blocks can't be done just by
228        * checking if we have an interface type and is an array, because due
229        * splitting we could end on a case were we received a split struct
230        * that contains an array.
231        */
232       bool is_array_block = var->interface_type != NULL &&
233                             glsl_type_is_array(var->type) &&
234                             glsl_without_array(var->type) == var->interface_type;
235 
236       if (var->data.explicit_offset && !is_array_block) {
237          unsigned offset = var->data.offset;
238          add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer,
239                              &location, &offset, var->type, false);
240       } else if (is_array_block) {
241          assert(glsl_type_is_struct_or_ifc(var->interface_type));
242 
243          unsigned aoa_size = glsl_get_aoa_size(var->type);
244          const struct glsl_type *itype = var->interface_type;
245          unsigned nfields = glsl_get_length(itype);
246          for (unsigned b = 0; b < aoa_size; b++) {
247             for (unsigned f = 0; f < nfields; f++) {
248                int foffset = glsl_get_struct_field_offset(itype, f);
249                const struct glsl_type *ftype = glsl_get_struct_field(itype, f);
250                if (foffset < 0) {
251                   location += glsl_count_attribute_slots(ftype, false);
252                   continue;
253                }
254 
255                unsigned offset = foffset;
256                add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b,
257                                    &location, &offset, ftype, false);
258             }
259          }
260       }
261    }
262 
263    /* Everything is easier in the state setup code if outputs and varyings are
264     * sorted in order of output offset (and buffer for varyings).
265     */
266    qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]),
267          compare_xfb_output_offsets);
268 
269    if (varyings_info != NULL) {
270       qsort(varyings_info->varyings, varyings_info->varying_count,
271             sizeof(varyings_info->varyings[0]),
272             compare_xfb_varying_offsets);
273    }
274 
275 #ifndef NDEBUG
276    /* Finally, do a sanity check */
277    unsigned max_offset[NIR_MAX_XFB_BUFFERS] = { 0 };
278    for (unsigned i = 0; i < xfb->output_count; i++) {
279       assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]);
280       assert(xfb->outputs[i].component_mask != 0);
281       unsigned slots = util_bitcount(xfb->outputs[i].component_mask);
282       max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4;
283    }
284 #endif
285 
286    ralloc_free(shader->xfb_info);
287    shader->xfb_info = xfb;
288 }
289 
290 static int
get_xfb_out_sort_index(const nir_xfb_output_info * a)291 get_xfb_out_sort_index(const nir_xfb_output_info *a)
292 {
293    /* Return the maximum number to put dummy components at the end. */
294    if (!a->component_mask)
295       return MAX_XFB_BUFFERS << 26;
296 
297    return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */
298           /* 10 bits for the component location (256 * 4) */
299           (((uint32_t)a->location * 4 + a->component_offset) << 16) |
300           /* 16 bits for the offset */
301           a->offset;
302 }
303 
304 static int
compare_xfb_out(const void * pa,const void * pb)305 compare_xfb_out(const void *pa, const void *pb)
306 {
307    const nir_xfb_output_info *a = (const nir_xfb_output_info *)pa;
308    const nir_xfb_output_info *b = (const nir_xfb_output_info *)pb;
309 
310    return get_xfb_out_sort_index(a) - get_xfb_out_sort_index(b);
311 }
312 
313 /**
314  * Gather transform feedback info from lowered IO intrinsics.
315  */
316 void
nir_gather_xfb_info_from_intrinsics(nir_shader * nir)317 nir_gather_xfb_info_from_intrinsics(nir_shader *nir)
318 {
319    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
320    uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = { 0 };
321    uint8_t buffer_mask = 0;
322    uint8_t stream_mask = 0;
323 
324    /* Gather xfb outputs. */
325    struct util_dynarray array = { 0 };
326 
327    nir_foreach_block(block, impl) {
328       nir_foreach_instr(instr, block) {
329          if (instr->type != nir_instr_type_intrinsic ||
330              !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr)))
331             continue;
332 
333          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
334 
335          unsigned wr_mask = nir_intrinsic_write_mask(intr);
336 
337          while (wr_mask) {
338             unsigned i = u_bit_scan(&wr_mask);
339             unsigned index = nir_intrinsic_component(intr) + i;
340             nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) : nir_intrinsic_io_xfb2(intr);
341 
342             if (xfb.out[index % 2].num_components) {
343                nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
344                nir_xfb_output_info out;
345 
346                out.component_offset = index;
347                out.component_mask =
348                   BITFIELD_RANGE(index, xfb.out[index % 2].num_components);
349                out.location = sem.location;
350                out.high_16bits = sem.high_16bits;
351                out.buffer = xfb.out[index % 2].buffer;
352                out.offset = (uint32_t)xfb.out[index % 2].offset * 4;
353                util_dynarray_append(&array, nir_xfb_output_info, out);
354 
355                uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3;
356                buffer_to_stream[out.buffer] = stream;
357                buffer_mask |= BITFIELD_BIT(out.buffer);
358                stream_mask |= BITFIELD_BIT(stream);
359 
360                /* No elements before component_offset are allowed to be set. */
361                assert(!(out.component_mask & BITFIELD_MASK(out.component_offset)));
362             }
363          }
364       }
365    }
366 
367    nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data;
368    int count = util_dynarray_num_elements(&array, nir_xfb_output_info);
369 
370    if (!count)
371       return;
372 
373    if (count > 1) {
374       /* Sort outputs by buffer, location, and component. */
375       qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
376 
377       /* Merge outputs referencing the same slot. */
378       for (int i = 0; i < count - 1; i++) {
379          nir_xfb_output_info *cur = &outputs[i];
380 
381          if (!cur->component_mask)
382             continue;
383 
384          /* Outputs referencing the same buffer and location are contiguous. */
385          for (int j = i + 1;
386               j < count &&
387               cur->buffer == outputs[j].buffer &&
388               cur->location == outputs[j].location &&
389               cur->high_16bits == outputs[j].high_16bits;
390               j++) {
391             if (outputs[j].component_mask &&
392                 outputs[j].offset - outputs[j].component_offset * 4 ==
393                    cur->offset - cur->component_offset * 4) {
394                unsigned merged_offset = MIN2(cur->component_offset,
395                                              outputs[j].component_offset);
396                /* component_mask is relative to 0, not component_offset */
397                unsigned merged_mask = cur->component_mask | outputs[j].component_mask;
398 
399                /* The component mask should have no holes after merging. */
400                if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) {
401                   /* Merge outputs. */
402                   cur->component_offset = merged_offset;
403                   cur->component_mask = merged_mask;
404                   cur->offset = (uint32_t)cur->offset -
405                                 (uint32_t)cur->component_offset * 4 +
406                                 (uint32_t)merged_offset * 4;
407                   /* Disable the other output. */
408                   outputs[j].component_mask = 0;
409                }
410             }
411          }
412       }
413 
414       /* Sort outputs again to put disabled outputs at the end. */
415       qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
416 
417       /* Remove disabled outputs. */
418       for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--)
419          count = i;
420    }
421 
422    for (unsigned i = 0; i < count; i++)
423       assert(outputs[i].component_mask);
424 
425    /* Create nir_xfb_info. */
426    nir_xfb_info *info = nir_xfb_info_create(nir, count);
427    if (!info) {
428       util_dynarray_fini(&array);
429       return;
430    }
431 
432    /* Fill nir_xfb_info. */
433    info->buffers_written = buffer_mask;
434    info->streams_written = stream_mask;
435    memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream));
436    info->output_count = count;
437    memcpy(info->outputs, outputs, count * sizeof(outputs[0]));
438 
439    /* Set strides. */
440    for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) {
441       if (buffer_mask & BITFIELD_BIT(i))
442          info->buffers[i].stride = nir->info.xfb_stride[i] * 4;
443    }
444 
445    /* Set varying_count. */
446    for (unsigned i = 0; i < count; i++)
447       info->buffers[outputs[i].buffer].varying_count++;
448 
449    /* Replace original xfb info. */
450    ralloc_free(nir->xfb_info);
451    nir->xfb_info = info;
452 
453    util_dynarray_fini(&array);
454 }
455 
456 void
nir_print_xfb_info(nir_xfb_info * info,FILE * fp)457 nir_print_xfb_info(nir_xfb_info *info, FILE *fp)
458 {
459    fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written);
460    fprintf(fp, "streams_written: 0x%x\n", info->streams_written);
461 
462    for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {
463       if (BITFIELD_BIT(i) & info->buffers_written) {
464          fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i,
465                  info->buffers[i].stride,
466                  info->buffers[i].varying_count,
467                  info->buffer_to_stream[i]);
468       }
469    }
470 
471    fprintf(fp, "output_count: %u\n", info->output_count);
472 
473    for (unsigned i = 0; i < info->output_count; i++) {
474       fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, high_16bits=%u, "
475                   "component_offset=%u, component_mask=0x%x\n",
476               i, info->outputs[i].buffer,
477               info->outputs[i].offset,
478               info->outputs[i].location,
479               info->outputs[i].high_16bits,
480               info->outputs[i].component_offset,
481               info->outputs[i].component_mask);
482    }
483 }
484