/*
 * Copyright © 2006-2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_fs.h"
#include "elk_fs_builder.h"

using namespace elk;

elk_vs_thread_payload::elk_vs_thread_payload(const elk_fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: URB handles. */
   urb_handles = elk_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

elk_tcs_thread_payload::elk_tcs_thread_payload(const elk_fs_visitor &v)
{
   struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(v.prog_data);
   struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(v.prog_data);
   struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) v.key;

   if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
      patch_urb_output = elk_ud1_grf(0, 0);
      primitive_id = elk_vec1_grf(0, 1);

      /* r1-r4 contain the ICP handles. */
      icp_handle_start = elk_ud8_grf(1, 0);

      num_regs = 5;
   } else {
      assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
      assert(tcs_key->input_vertices <= ELK_MAX_TCS_INPUT_VERTICES);

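      /* Worked example (illustrative): with 3 input vertices, primitive ID
       * included and reg_unit() == 1, the layout is r0 thread header, r1
       * patch URB output, r2 primitive ID and r3-r5 ICP handles, giving
       * num_regs == 6.
       */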
      unsigned r = 0;

      r += reg_unit(v.devinfo);

      patch_urb_output = elk_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);

      if (tcs_prog_data->include_primitive_id) {
         primitive_id = elk_vec8_grf(r, 0);
         r += reg_unit(v.devinfo);
      }

      /* ICP handles occupy the next 1-32 registers. */
      icp_handle_start = elk_ud8_grf(r, 0);
      r += elk_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);

      num_regs = r;
   }
}

elk_tes_thread_payload::elk_tes_thread_payload(const elk_fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread Header. */
   patch_urb_input = retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD);
   primitive_id = elk_vec1_grf(0, 1);
   r += reg_unit(v.devinfo);

   /* R1-3: gl_TessCoord.xyz. */
   for (unsigned i = 0; i < 3; i++) {
      coords[i] = elk_vec8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* R4: URB output handles. */
   urb_output = elk_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

elk_gs_thread_payload::elk_gs_thread_payload(elk_fs_visitor &v)
{
   struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(v.prog_data);
   struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles. */
   urb_handles = bld.vgrf(ELK_REGISTER_TYPE_UD);
   bld.AND(urb_handles, elk_ud8_grf(r, 0), elk_imm_ud(0xFFFF));

   /* R1: Instance ID stored in bits 31:27. */
   instance_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
   bld.SHR(instance_id, elk_ud8_grf(r, 0), elk_imm_ud(27u));

   r += reg_unit(v.devinfo);

   if (gs_prog_data->include_primitive_id) {
      primitive_id = elk_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = elk_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 DWords), and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
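   /* For example (illustrative numbers): with vertices_in == 3 and
    * urb_read_length == 2, the push cost is 8 * 2 * 3 == 48 components,
    * which exceeds 24, so the read length is clamped to
    * ROUND_DOWN_TO(24 / 3, 8) / 8 == 1 HWord per vertex.
    */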
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}

static inline void
setup_fs_payload_gfx6(elk_fs_thread_payload &payload,
                      const elk_fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 6);

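   /* Worked example (illustrative): a SIMD16 shader with a single barycentric
    * mode, source depth and source W enabled lays out r0 header, r1 masks and
    * pixel X/Y, r2-r5 barycentrics, r6-r7 depth, r8-r9 W, for num_regs == 10.
    */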
   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates.  These appear in the
       * same order that they appear in the elk_barycentric_mode enum.  Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16.  Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
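      /* Each enabled mode carries two delta components of payload_width
       * channels apiece, and one GRF holds 8 channels, hence the
       * payload_width / 4 registers per mode below.
       */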
      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(v.devinfo->ver >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

#undef P                        /* promoted depth */
#undef C                        /* computed */
#undef N                        /* non-promoted? */

#define P 0
#define C 1
#define N 2

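/* Indexed by elk_wm_prog_key::iz_lookup, a bitmask of the ELK_WM_IZ_* flags,
 * so there is one row for each of the ELK_WM_IZ_BIT_MAX (64) combinations of
 * depth/stencil state and shader side effects.
 */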
static const struct {
   GLuint mode:2;
   GLuint sd_present:1;
   GLuint sd_to_rt:1;
   GLuint dd_present:1;
   GLuint ds_present:1;
} wm_iz_table[ELK_WM_IZ_BIT_MAX] =
{
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 1 },
 { N, 0, 1, 0, 1 },
 { N, 0, 1, 0, 1 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 1 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { P, 0, 0, 0, 0 },
 { C, 0, 0, 0, 1 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 0, 1 },
 { P, 0, 0, 0, 0 },
 { C, 1, 1, 0, 1 },
 { C, 0, 1, 0, 1 },
 { C, 0, 1, 0, 1 },
 { P, 0, 0, 0, 0 },
 { C, 1, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { P, 0, 0, 0, 0 },
 { C, 1, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 }
};

/**
 * Gfx4-5 PS thread payload setup, driven by two fields of the WM program
 * key: key->line_aa (ELK_NEVER, ELK_ALWAYS or ELK_SOMETIMES) and
 * key->iz_lookup (a bitmask of ELK_WM_IZ_* flags).
 */
static inline void
setup_fs_payload_gfx4(elk_fs_thread_payload &payload,
                      const elk_fs_visitor &v,
                      bool &source_depth_to_render_target,
                      bool &runtime_check_aads_emit)
{
   assert(v.dispatch_width <= 16);

   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);
   elk_wm_prog_key *key = (elk_wm_prog_key *) v.key;

   GLuint reg = 1;
   bool kill_stats_promoted_workaround = false;
   int lookup = key->iz_lookup;

   assert(lookup < ELK_WM_IZ_BIT_MAX);

   /* Crazy workaround in the windowizer, which we need to track in
    * our register allocation and render target writes.  See the "If
    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
    */
   if (key->stats_wm &&
       (lookup & ELK_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
       wm_iz_table[lookup].mode == P) {
      kill_stats_promoted_workaround = true;
   }

   payload.subspan_coord_reg[0] = reg++;

   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
       kill_stats_promoted_workaround) {
      payload.source_depth_reg[0] = reg;
      reg += 2;
   }

   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
      source_depth_to_render_target = true;

   if (wm_iz_table[lookup].ds_present || key->line_aa != ELK_NEVER) {
      payload.aa_dest_stencil_reg[0] = reg;
      runtime_check_aads_emit =
         !wm_iz_table[lookup].ds_present && key->line_aa == ELK_SOMETIMES;
      reg++;
   }

   if (wm_iz_table[lookup].dd_present) {
      payload.dest_depth_reg[0] = reg;
      reg += 2;
   }

   payload.num_regs = reg;
}

#undef P                        /* promoted depth */
#undef C                        /* computed */
#undef N                        /* non-promoted? */

elk_fs_thread_payload::elk_fs_thread_payload(const elk_fs_visitor &v,
                                             bool &source_depth_to_render_target,
                                             bool &runtime_check_aads_emit)
  : subspan_coord_reg(),
    source_depth_reg(),
    source_w_reg(),
    aa_dest_stencil_reg(),
    dest_depth_reg(),
    sample_pos_reg(),
    sample_mask_in_reg(),
    depth_w_coef_reg(),
    barycentric_coord_reg()
{
   if (v.devinfo->ver >= 6)
      setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
   else
      setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
                            runtime_check_aads_emit);
}

elk_cs_thread_payload::elk_cs_thread_payload(const elk_fs_visitor &v)
{
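   /* R0: Thread header. */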
   num_regs = reg_unit(v.devinfo);
}

void
elk_cs_thread_payload::load_subgroup_id(const fs_builder &bld,
                                        elk_fs_reg &dest) const
{
   auto devinfo = bld.shader->devinfo;
   dest = retype(dest, ELK_REGISTER_TYPE_UD);

   assert(gl_shader_stage_is_compute(bld.shader->stage));
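   /* The subgroup ID reaches the shader as a push constant;
    * elk_get_subgroup_id_param_index() returns its slot in the UNIFORM file.
    */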
   int index = elk_get_subgroup_id_param_index(devinfo,
                                               bld.shader->stage_prog_data);
   bld.MOV(dest, elk_fs_reg(UNIFORM, index, ELK_REGISTER_TYPE_UD));
}
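
/* Usage sketch (hypothetical caller, not part of this file): the visitor
 * would construct the payload variant for its stage and reserve num_regs
 * GRFs ahead of register allocation, along the lines of:
 *
 *    case MESA_SHADER_VERTEX:
 *       payload_ = new elk_vs_thread_payload(*this);
 *       break;
 */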