xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_compile_gs.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2013 Intel Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "brw_eu.h"
7 #include "brw_fs.h"
8 #include "brw_fs_builder.h"
9 #include "brw_prim.h"
10 #include "brw_nir.h"
11 #include "brw_private.h"
12 #include "dev/intel_debug.h"
13 
14 using namespace brw;
15 
16 static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
17    [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
18    [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
19    [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
20    [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
21    [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
22    [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
23    [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
24    [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
25    [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
26    [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
27    [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
28    [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
29    [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
30    [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
31 };
32 
33 static void
brw_emit_gs_thread_end(fs_visitor & s)34 brw_emit_gs_thread_end(fs_visitor &s)
35 {
36    assert(s.stage == MESA_SHADER_GEOMETRY);
37 
38    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
39 
40    if (s.gs_compile->control_data_header_size_bits > 0) {
41       s.emit_gs_control_data_bits(s.final_gs_vertex_count);
42    }
43 
44    const fs_builder abld = fs_builder(&s).at_end().annotate("thread end");
45    fs_inst *inst;
46 
47    if (gs_prog_data->static_vertex_count != -1) {
48       /* Try and tag the last URB write with EOT instead of emitting a whole
49        * separate write just to finish the thread.
50        */
51       if (s.mark_last_urb_write_with_eot())
52          return;
53 
54       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
55       srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
56       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
57       inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
58                        srcs, ARRAY_SIZE(srcs));
59    } else {
60       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
61       srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
62       srcs[URB_LOGICAL_SRC_DATA] = s.final_gs_vertex_count;
63       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
64       inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
65                        srcs, ARRAY_SIZE(srcs));
66    }
67    inst->eot = true;
68    inst->offset = 0;
69 }
70 
71 static void
brw_assign_gs_urb_setup(fs_visitor & s)72 brw_assign_gs_urb_setup(fs_visitor &s)
73 {
74    assert(s.stage == MESA_SHADER_GEOMETRY);
75 
76    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);
77 
78    s.first_non_payload_grf +=
79       8 * vue_prog_data->urb_read_length * s.nir->info.gs.vertices_in;
80 
81    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
82       /* Rewrite all ATTR file references to GRFs. */
83       s.convert_attr_sources_to_hw_regs(inst);
84    }
85 }
86 
87 static bool
run_gs(fs_visitor & s)88 run_gs(fs_visitor &s)
89 {
90    assert(s.stage == MESA_SHADER_GEOMETRY);
91 
92    s.payload_ = new gs_thread_payload(s);
93 
94    const fs_builder bld = fs_builder(&s).at_end();
95 
96    s.final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);
97 
98    if (s.gs_compile->control_data_header_size_bits > 0) {
99       /* Create a VGRF to store accumulated control data bits. */
100       s.control_data_bits = bld.vgrf(BRW_TYPE_UD);
101 
102       /* If we're outputting more than 32 control data bits, then EmitVertex()
103        * will set control_data_bits to 0 after emitting the first vertex.
104        * Otherwise, we need to initialize it to 0 here.
105        */
106       if (s.gs_compile->control_data_header_size_bits <= 32) {
107          const fs_builder abld = bld.annotate("initialize control data bits");
108          abld.MOV(s.control_data_bits, brw_imm_ud(0u));
109       }
110    }
111 
112    nir_to_brw(&s);
113 
114    brw_emit_gs_thread_end(s);
115 
116    if (s.failed)
117       return false;
118 
119    brw_calculate_cfg(s);
120 
121    brw_fs_optimize(s);
122 
123    s.assign_curb_setup();
124    brw_assign_gs_urb_setup(s);
125 
126    brw_fs_lower_3src_null_dest(s);
127    brw_fs_workaround_memory_fence_before_eot(s);
128    brw_fs_workaround_emit_dummy_mov_instruction(s);
129 
130    brw_allocate_registers(s, true /* allow_spilling */);
131 
132    return !s.failed;
133 }
134 
135 extern "C" const unsigned *
brw_compile_gs(const struct brw_compiler * compiler,struct brw_compile_gs_params * params)136 brw_compile_gs(const struct brw_compiler *compiler,
137                struct brw_compile_gs_params *params)
138 {
139    nir_shader *nir = params->base.nir;
140    const struct brw_gs_prog_key *key = params->key;
141    struct brw_gs_prog_data *prog_data = params->prog_data;
142 
143    struct brw_gs_compile c;
144    memset(&c, 0, sizeof(c));
145    c.key = *key;
146 
147    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
148 
149    prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
150    prog_data->base.base.ray_queries = nir->info.ray_queries;
151    prog_data->base.base.total_scratch = 0;
152 
153    /* The GLSL linker will have already matched up GS inputs and the outputs
154     * of prior stages.  The driver does extend VS outputs in some cases, but
155     * only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
156     * geometry shader support.  So we can safely ignore that.
157     *
158     * For SSO pipelines, we use a fixed VUE map layout based on variable
159     * locations, so we can rely on rendezvous-by-location making this work.
160     */
161    GLbitfield64 inputs_read = nir->info.inputs_read;
162    brw_compute_vue_map(compiler->devinfo,
163                        &c.input_vue_map, inputs_read,
164                        nir->info.separate_shader, 1);
165 
166    brw_nir_apply_key(nir, compiler, &key->base,
167                      brw_geometry_stage_dispatch_width(compiler->devinfo));
168    brw_nir_lower_vue_inputs(nir, &c.input_vue_map);
169    brw_nir_lower_vue_outputs(nir);
170    brw_postprocess_nir(nir, compiler, debug_enabled,
171                        key->base.robust_flags);
172 
173    prog_data->base.clip_distance_mask =
174       ((1 << nir->info.clip_distance_array_size) - 1);
175    prog_data->base.cull_distance_mask =
176       ((1 << nir->info.cull_distance_array_size) - 1) <<
177       nir->info.clip_distance_array_size;
178 
179    prog_data->include_primitive_id =
180       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
181 
182    prog_data->invocations = nir->info.gs.invocations;
183 
184    nir_gs_count_vertices_and_primitives(
185       nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);
186 
187    if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
188       /* When the output type is points, the geometry shader may output data
189        * to multiple streams, and EndPrimitive() has no effect.  So we
190        * configure the hardware to interpret the control data as stream ID.
191        */
192       prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
193 
194       /* We only have to emit control bits if we are using non-zero streams */
195       if (nir->info.gs.active_stream_mask != (1 << 0))
196          c.control_data_bits_per_vertex = 2;
197       else
198          c.control_data_bits_per_vertex = 0;
199    } else {
200       /* When the output type is triangle_strip or line_strip, EndPrimitive()
201        * may be used to terminate the current strip and start a new one
202        * (similar to primitive restart), and outputting data to multiple
203        * streams is not supported.  So we configure the hardware to interpret
204        * the control data as EndPrimitive information (a.k.a. "cut bits").
205        */
206       prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
207 
208       /* We only need to output control data if the shader actually calls
209        * EndPrimitive().
210        */
211       c.control_data_bits_per_vertex =
212          nir->info.gs.uses_end_primitive ? 1 : 0;
213    }
214 
215    c.control_data_header_size_bits =
216       nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
217 
218    /* 1 HWORD = 32 bytes = 256 bits */
219    prog_data->control_data_header_size_hwords =
220       ALIGN(c.control_data_header_size_bits, 256) / 256;
221 
222    /* Compute the output vertex size.
223     *
224     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
225     * Size (p168):
226     *
227     *     [0,62] indicating [1,63] 16B units
228     *
229     *     Specifies the size of each vertex stored in the GS output entry
230     *     (following any Control Header data) as a number of 128-bit units
231     *     (minus one).
232     *
233     *     Programming Restrictions: The vertex size must be programmed as a
234     *     multiple of 32B units with the following exception: Rendering is
235     *     disabled (as per SOL stage state) and the vertex size output by the
236     *     GS thread is 16B.
237     *
238     *     If rendering is enabled (as per SOL state) the vertex size must be
239     *     programmed as a multiple of 32B units. In other words, the only time
240     *     software can program a vertex size with an odd number of 16B units
241     *     is when rendering is disabled.
242     *
243     * Note: B=bytes in the above text.
244     *
245     * It doesn't seem worth the extra trouble to optimize the case where the
246     * vertex size is 16B (especially since this would require special-casing
247     * the GEN assembly that writes to the URB).  So we just set the vertex
248     * size to a multiple of 32B (2 vec4's) in all cases.
249     *
250     * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
251     * budget that as follows:
252     *
253     *   512 bytes for varyings (a varying component is 4 bytes and
254     *             gl_MaxGeometryOutputComponents = 128)
255     *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
256     *             bytes)
257     *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
258     *             even if it's not used)
259     *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
260     *             whenever clip planes are enabled, even if the shader doesn't
261     *             write to gl_ClipDistance)
262     *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
263     *             (see below)--this causes up to 1 VUE slot to be wasted
264     *   400 bytes available for varying packing overhead
265     *
266     * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
267     * per interpolation type, so this is plenty.
268     *
269     */
270    unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
271    assert(output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
272    prog_data->output_vertex_size_hwords =
273       ALIGN(output_vertex_size_bytes, 32) / 32;
274 
275    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
276     * That divides up as follows:
277     *
278     *     64 bytes for the control data header (cut indices or StreamID bits)
279     *   4096 bytes for varyings (a varying component is 4 bytes and
280     *              gl_MaxGeometryTotalOutputComponents = 1024)
281     *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
282     *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
283     *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
284     *              even if it's not used)
285     *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
286     *              whenever clip planes are enabled, even if the shader doesn't
287     *              write to gl_ClipDistance)
288     *   4096 bytes overhead since the VUE size must be a multiple of 32
289     *              bytes (see above)--this causes up to 1 VUE slot to be wasted
290     *   8128 bytes available for varying packing overhead
291     *
292     * Worst-case varying packing overhead is 3/4 of a varying slot per
293     * interpolation type, which works out to 3072 bytes, so this would allow
294     * us to accommodate 2 interpolation types without any danger of running
295     * out of URB space.
296     *
297     * In practice, the risk of running out of URB space is very small, since
298     * the above figures are all worst-case, and most of them scale with the
299     * number of output vertices.  So we'll just calculate the amount of space
300     * we need, and if it's too large, fail to compile.
301     *
302     * The above is for gfx7+ where we have a single URB entry that will hold
303     * all the output.
304     */
305    unsigned output_size_bytes =
306       prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
307    output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
308 
309    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
310     * which comes before the control header.
311     */
312    output_size_bytes += 32;
313 
314    /* Shaders can technically set max_vertices = 0, at which point we
315     * may have a URB size of 0 bytes.  Nothing good can come from that,
316     * so enforce a minimum size.
317     */
318    if (output_size_bytes == 0)
319       output_size_bytes = 1;
320 
321    unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
322    if (output_size_bytes > max_output_size_bytes)
323       return NULL;
324 
325 
326    /* URB entry sizes are stored as a multiple of 64 bytes in gfx7+. */
327    prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
328 
329    assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
330    prog_data->output_topology =
331       gl_prim_to_hw_prim[nir->info.gs.output_primitive];
332 
333    prog_data->vertices_in = nir->info.gs.vertices_in;
334 
335    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
336     * need to program a URB read length of ceiling(num_slots / 2).
337     */
338    prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
339 
340    /* Now that prog_data setup is done, we are ready to actually compile the
341     * program.
342     */
343    if (unlikely(debug_enabled)) {
344       fprintf(stderr, "GS Input ");
345       brw_print_vue_map(stderr, &c.input_vue_map, MESA_SHADER_GEOMETRY);
346       fprintf(stderr, "GS Output ");
347       brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
348    }
349 
350    fs_visitor v(compiler, &params->base, &c, prog_data, nir,
351                 params->base.stats != NULL, debug_enabled);
352    if (run_gs(v)) {
353       prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
354 
355       assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
356       prog_data->base.base.dispatch_grf_start_reg =
357          v.payload().num_regs / reg_unit(compiler->devinfo);
358 
359       fs_generator g(compiler, &params->base,
360                      &prog_data->base.base, MESA_SHADER_GEOMETRY);
361       if (unlikely(debug_enabled)) {
362          const char *label =
363             nir->info.label ? nir->info.label : "unnamed";
364          char *name = ralloc_asprintf(params->base.mem_ctx,
365                                       "%s geometry shader %s",
366                                       label, nir->info.name);
367          g.enable_debug(name);
368       }
369       g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
370                       v.performance_analysis.require(), params->base.stats);
371       g.add_const_data(nir->constant_data, nir->constant_data_size);
372       return g.get_assembly();
373    }
374 
375    params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
376 
377    return NULL;
378 }
379 
380