/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file elk_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from NIR.  The LIR makes it
 * easier to do backend-specific optimizations than doing so in NIR or in
 * the native code.
 */
#include "elk_eu.h"
#include "elk_fs.h"
#include "elk_fs_builder.h"
#include "elk_nir.h"
#include "compiler/glsl_types.h"

using namespace elk;

/* Input data is organized with the per-primitive values first, followed by
 * the per-vertex values.  The per-vertex values have interpolation
 * information associated with them, so each uses 4 components.
 */
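
/* A rough sketch of the resulting ATTR-file layout, assuming N
 * per-primitive inputs (as implied by the two helpers below):
 *
 *    ATTR 0 .. N-1              per-primitive values, packed four
 *                               components per register
 *    ATTR N + 4*i .. N + 4*i+3  the four components of per-vertex value i
 */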

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
elk_fs_reg
elk_fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
                           unsigned channel, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);

   const struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);

   assert(prog_data->urb_setup[location] >= 0);
   unsigned nr = prog_data->urb_setup[location];
   channel += prog_data->urb_setup_channel[location];

   /* Adjust so we start counting from the first per_vertex input. */
   assert(nr >= prog_data->num_per_primitive_inputs);
   nr -= prog_data->num_per_primitive_inputs;

   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
   const unsigned regnr = per_vertex_start + (nr * 4) + channel;

   return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
}
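
/* Worked example (hypothetical numbers): with num_per_primitive_inputs == 2,
 * a per-vertex varying at urb_setup slot 3 with urb_setup_channel 0 and
 * channel 1 gives nr = 3 - 2 = 1 and regnr = 2 + 1 * 4 + 1 = 7.
 */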

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
elk_fs_reg
elk_fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);

   const struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);

   comp += prog_data->urb_setup_channel[location];

   assert(prog_data->urb_setup[location] >= 0);

   const unsigned regnr = prog_data->urb_setup[location] + comp / 4;

   assert(regnr < prog_data->num_per_primitive_inputs);

   return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
}
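
/* Unlike per-vertex values, per-primitive values carry no interpolation
 * data, so they are packed four components to a register: comp / 4 selects
 * the ATTR register and comp % 4 the component within it.
 */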

/** Emits the interpolation for the varying inputs. */
void
elk_fs_visitor::emit_interpolation_setup_gfx4()
{
   struct elk_reg g1_uw = retype(elk_vec1_grf(1, 0), ELK_REGISTER_TYPE_UW);

   fs_builder abld = fs_builder(this).at_end().annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_uint_type());
   this->pixel_y = vgrf(glsl_uint_type());
   this->pixel_x.type = ELK_REGISTER_TYPE_UW;
   this->pixel_y.type = ELK_REGISTER_TYPE_UW;
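   /* elk_imm_v() is a packed vector of eight signed 4-bit immediates, with
    * element 0 in the low nibble.  0x10101010 therefore adds {0,1,0,1,...}
    * to the replicated subspan X origins and 0x11001100 adds {0,0,1,1,...}
    * to the Y origins, producing the four pixel coordinates of each 2x2
    * subspan.
    */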
   abld.ADD(this->pixel_x,
            elk_fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            elk_fs_reg(elk_imm_v(0x10101010)));
   abld.ADD(this->pixel_y,
            elk_fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            elk_fs_reg(elk_imm_v(0x11001100)));

   const fs_builder bld = fs_builder(this).at_end();
   abld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL] =
      vgrf(glsl_vec2_type());
   const elk_fs_reg &delta_xy = this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL];
   const elk_fs_reg xstart(negate(elk_vec1_grf(1, 0)));
   const elk_fs_reg ystart(negate(elk_vec1_grf(1, 1)));

   if (devinfo->has_pln) {
      for (unsigned i = 0; i < dispatch_width / 8; i++) {
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
                             quarter(this->pixel_x, i), xstart);
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
                             quarter(this->pixel_y, i), ystart);
      }
   } else {
      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
   }

   this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);

   /* The SF program automatically handles doing the perspective correction or
    * not based on wm_prog_data::interp_mode[] so we can use the same pixel
    * offsets for both perspective and non-perspective.
    */
   this->delta_xy[ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
      this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL];

   abld = bld.annotate("compute pos.w and 1/pos.w");
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = vgrf(glsl_float_type());
   abld.emit(ELK_FS_OPCODE_LINTERP, wpos_w, delta_xy,
             interp_reg(abld, VARYING_SLOT_POS, 3, 0));
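   /* The LINTERP above evaluates the attribute's plane equation at the given
    * pixel deltas; on hardware with PLN it should become a single pln
    * instruction, otherwise a line+mac pair.
    */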
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = vgrf(glsl_float_type());
   abld.emit(ELK_SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

/** Emits the interpolation for the varying inputs. */
void
elk_fs_visitor::emit_interpolation_setup_gfx6()
{
   const fs_builder bld = fs_builder(this).at_end();
   fs_builder abld = bld.annotate("compute pixel centers");

   this->pixel_x = vgrf(glsl_float_type());
   this->pixel_y = vgrf(glsl_float_type());

   const struct elk_wm_prog_key *wm_key = (elk_wm_prog_key*) this->key;
   struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(prog_data);

   elk_fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
   elk_fs_reg int_sample_offset_xy; /* Used on Gen8+ */
   elk_fs_reg half_int_sample_offset_x, half_int_sample_offset_y;

   /* The thread payload only delivers subspan locations (ss0, ss1,
    * ss2, ...). Since a subspan covers a 2x2 pixel block, we need to
    * generate 4 pixel coordinates out of each subspan location. We do this
    * by replicating a subspan coordinate 4 times and adding an offset of 1
    * in each direction from the initial top left (tl) location to generate
    * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
    * (br = +1 in x, +1 in y).
    *
    * The locations we build look like this in SIMD8:
    *
    *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
    *
    * The value 0x11001010 is a vector of 8 half-bytes.  It adds the
    * following offsets to generate the 4 pixel coordinates out of subspan0:
    *
    *  0x
    *    1 : ss0.y + 1 -> ss0.br.y
    *    1 : ss0.y + 1 -> ss0.bl.y
    *    0 : ss0.y + 0 -> ss0.tr.y
    *    0 : ss0.y + 0 -> ss0.tl.y
    *    1 : ss0.x + 1 -> ss0.br.x
    *    0 : ss0.x + 0 -> ss0.bl.x
    *    1 : ss0.x + 1 -> ss0.tr.x
    *    0 : ss0.x + 0 -> ss0.tl.x
    *
    * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
    * coordinates out of 2 subspan coordinates in a single ADD instruction
    * (twice the operation above).
    */
   int_sample_offset_xy = elk_fs_reg(elk_imm_v(0x11001010));
   half_int_sample_offset_x = elk_fs_reg(elk_imm_uw(0));
   half_int_sample_offset_y = elk_fs_reg(elk_imm_uw(0));
   /* On Gfx12.5, because of regioning restrictions, the interpolation code
    * is slightly different and works off X and Y inputs only. The ordering
    * of the half-bytes here is a bit odd, with each subspan replicated
    * twice and every other element discarded:
    *
    *             ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
    *  X offset:    0      0      1      0      0      0      1      0
    *  Y offset:    0      0      0      0      1      0      1      0
    */
   int_sample_offset_x = elk_fs_reg(elk_imm_v(0x01000100));
   int_sample_offset_y = elk_fs_reg(elk_imm_v(0x01010000));

   elk_fs_reg int_pixel_offset_xy = int_sample_offset_xy; /* Used on Gen8+ */
   elk_fs_reg half_int_pixel_offset_x = half_int_sample_offset_x;
   elk_fs_reg half_int_pixel_offset_y = half_int_sample_offset_y;

   for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
      const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
      /* According to the "PS Thread Payload for Normal Dispatch"
       * pages on the BSpec, subspan X/Y coordinates are stored in
       * R1.2-R1.5/R2.2-R2.5 on gfx6+, and in R0.10-R0.13/R1.10-R1.13
       * on gfx20+.  gi_reg is the 32B section of the GRF that
       * contains the subspan coordinates.
       */
      const struct elk_reg gi_reg = elk_vec1_grf(i + 1, 0);
      const struct elk_reg gi_uw = retype(gi_reg, ELK_REGISTER_TYPE_UW);

      if (devinfo->ver >= 8 || dispatch_width == 8) {
         /* The "Register Region Restrictions" page says for BDW (and newer,
          * presumably):
          *
          *     "When destination spans two registers, the source may be one or
          *      two registers. The destination elements must be evenly split
          *      between the two registers."
          *
          * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
          * to compute our pixel centers.
          */
         const fs_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         elk_fs_reg int_pixel_xy = dbld.vgrf(ELK_REGISTER_TYPE_UW);

         dbld.ADD(int_pixel_xy,
                  elk_fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
                  int_pixel_offset_xy);
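         /* After this add, each run of 8 consecutive UW elements holds one
          * subspan's four X coordinates followed by its four Y coordinates.
          * ELK_FS_OPCODE_PIXEL_X/Y below are presumably lowered to strided
          * regions that pick out the X or Y half, convert it to float, and
          * add the (here zero) half-pixel sample offset.
          */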

         hbld.emit(ELK_FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
                                          horiz_stride(half_int_pixel_offset_x, 0));
         hbld.emit(ELK_FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
                                          horiz_stride(half_int_pixel_offset_y, 0));
      } else {
         /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
          *
          *     "When destination spans two registers, the source MUST span
          *      two registers."
          *
          * Since the GRF source of the ADD will only read a single register,
          * we must do two separate ADDs in SIMD16.
          */
         const elk_fs_reg int_pixel_x = hbld.vgrf(ELK_REGISTER_TYPE_UW);
         const elk_fs_reg int_pixel_y = hbld.vgrf(ELK_REGISTER_TYPE_UW);

         hbld.ADD(int_pixel_x,
                  elk_fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
                  elk_fs_reg(elk_imm_v(0x10101010)));
         hbld.ADD(int_pixel_y,
                  elk_fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
                  elk_fs_reg(elk_imm_v(0x11001100)));

         /* As of gfx6, we can no longer mix float and int sources.  We have
          * to turn the integer pixel centers into floats for their actual
          * use.
          */
         hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
         hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
      }
   }

   abld = bld.annotate("compute pos.z");
   if (wm_prog_data->uses_src_depth)
      this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);

   if (wm_prog_data->uses_src_w) {
      abld = bld.annotate("compute pos.w");
      this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg);
      this->wpos_w = vgrf(glsl_float_type());
      abld.emit(ELK_SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
   }

   if (wm_key->persample_interp == ELK_SOMETIMES) {
      assert(!devinfo->needs_unlit_centroid_workaround);

      const fs_builder ubld = bld.exec_all().group(16, 0);
      bool loaded_flag = false;

      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
            continue;

         /* The sample mode will always be the top bit set in the perspective
          * or non-perspective section.  In the case where no SAMPLE mode was
          * requested, elk_wm_prog_data_barycentric_modes() will swap out the top
          * mode for SAMPLE so this works regardless of whether SAMPLE was
          * requested or not.
          */
         int sample_mode;
         if (BITFIELD_BIT(i) & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) {
            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
                                        ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
         } else {
            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
                                        ELK_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
         }
         assert(wm_prog_data->barycentric_interp_modes &
                BITFIELD_BIT(sample_mode));

         if (i == sample_mode)
            continue;

         uint8_t *barys = fs_payload().barycentric_coord_reg[i];
         uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode];
         assert(barys[0] && sample_barys[0]);

         if (!loaded_flag) {
            check_dynamic_msaa_flag(ubld, wm_prog_data,
                                    INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
            loaded_flag = true;
         }

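         /* A sketch of the payload layout assumed here: each barycentric
          * slot takes two GRFs (X then Y deltas) per SIMD8 quarter, and
          * barycentric_coord_reg[] records one starting register per 16
          * channels, hence barys[j / 2] + (j % 2) * 2 below.
          */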
         for (unsigned j = 0; j < dispatch_width / 8; j++) {
            set_predicate(
               ELK_PREDICATE_NORMAL,
               ubld.MOV(elk_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
                        elk_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
         }
      }
   }

   for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
      this->delta_xy[i] = fetch_barycentric_reg(
         bld, fs_payload().barycentric_coord_reg[i]);
   }

   uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
      (1 << ELK_BARYCENTRIC_PERSPECTIVE_CENTROID |
       1 << ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);

   if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
      /* Get the pixel/sample mask into f0 so that we know which
       * pixels are lit.  Then, for each channel that is unlit,
       * replace the centroid data with non-centroid data.
       */
      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         bld.exec_all().group(1, 0)
            .MOV(retype(elk_flag_reg(0, i), ELK_REGISTER_TYPE_UW),
                 retype(elk_vec1_grf(1 + i, 7), ELK_REGISTER_TYPE_UW));
      }

      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(centroid_modes & (1 << i)))
            continue;

         const elk_fs_reg centroid_delta_xy = delta_xy[i];
         const elk_fs_reg &pixel_delta_xy = delta_xy[i - 1];

         delta_xy[i] = bld.vgrf(ELK_REGISTER_TYPE_F, 2);

         for (unsigned c = 0; c < 2; c++) {
            for (unsigned q = 0; q < dispatch_width / 8; q++) {
               set_predicate(ELK_PREDICATE_NORMAL,
                  bld.quarter(q).SEL(
                     quarter(offset(delta_xy[i], bld, c), q),
                     quarter(offset(centroid_delta_xy, bld, c), q),
                     quarter(offset(pixel_delta_xy, bld, c), q)));
            }
         }
      }
   }
}

static enum elk_conditional_mod
cond_for_alpha_func(enum compare_func func)
{
   switch (func) {
   case COMPARE_FUNC_GREATER:
      return ELK_CONDITIONAL_G;
   case COMPARE_FUNC_GEQUAL:
      return ELK_CONDITIONAL_GE;
   case COMPARE_FUNC_LESS:
      return ELK_CONDITIONAL_L;
   case COMPARE_FUNC_LEQUAL:
      return ELK_CONDITIONAL_LE;
   case COMPARE_FUNC_EQUAL:
      return ELK_CONDITIONAL_EQ;
   case COMPARE_FUNC_NOTEQUAL:
      return ELK_CONDITIONAL_NEQ;
   default:
      unreachable("Not reached");
   }
}

/**
 * Alpha test support for when we compile it into the shader instead
 * of using the normal fixed-function alpha test.
 */
void
elk_fs_visitor::emit_alpha_test()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
   const fs_builder bld = fs_builder(this).at_end();
   const fs_builder abld = bld.annotate("Alpha test");

   elk_fs_inst *cmp;
   if (key->alpha_test_func == COMPARE_FUNC_ALWAYS)
      return;

   if (key->alpha_test_func == COMPARE_FUNC_NEVER) {
      /* f0.1 = 0 */
      elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
                                       ELK_REGISTER_TYPE_UW));
      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
                     ELK_CONDITIONAL_NEQ);
   } else {
      /* RT0 alpha */
      elk_fs_reg color = offset(outputs[0], bld, 3);

      /* f0.1 &= func(color, ref) */
      cmp = abld.CMP(bld.null_reg_f(), color, elk_imm_f(key->alpha_test_ref),
                     cond_for_alpha_func(key->alpha_test_func));
   }
   cmp->predicate = ELK_PREDICATE_NORMAL;
   cmp->flag_subreg = 1;
}

elk_fs_inst *
elk_fs_visitor::emit_single_fb_write(const fs_builder &bld,
                                     elk_fs_reg color0, elk_fs_reg color1,
                                     elk_fs_reg src0_alpha, unsigned components)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);

   /* Hand over gl_FragDepth or the payload depth. */
   const elk_fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg);
   elk_fs_reg src_depth;

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      src_depth = frag_depth;
   } else if (source_depth_to_render_target) {
      /* If we got here, we're in one of those strange Gen4-5 cases where
       * we're forced to pass the source depth, unmodified, to the FB write.
       * In this case, we don't want to use pixel_z because we may not have
       * set up interpolation.  It's also perfectly safe because it only
       * happens on old hardware (no coarse interpolation) and this is
       * explicitly the pass-through case.
       */
      assert(devinfo->ver <= 5);
      src_depth = fetch_payload_reg(bld, fs_payload().source_depth_reg);
   }

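   /* The ordering here must match the FB_WRITE_LOGICAL_SRC_* enum (the
    * assert below checks the count); the final entry is the component count
    * as an immediate rather than a register source.
    */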
   const elk_fs_reg sources[] = {
      color0, color1, src0_alpha, src_depth, dst_depth,
      (prog_data->uses_omask ? sample_mask : elk_fs_reg()),
      elk_imm_ud(components)
   };
   assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
   elk_fs_inst *write = bld.emit(ELK_FS_OPCODE_FB_WRITE_LOGICAL, elk_fs_reg(),
                                 sources, ARRAY_SIZE(sources));

   if (prog_data->uses_kill) {
      write->predicate = ELK_PREDICATE_NORMAL;
      write->flag_subreg = sample_mask_flag_subreg(*this);
   }

   return write;
}

void
elk_fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha)
{
   const fs_builder bld = fs_builder(this).at_end();
   elk_fs_inst *inst = NULL;

   for (int target = 0; target < nr_color_regions; target++) {
      /* Skip over outputs that weren't written. */
      if (this->outputs[target].file == BAD_FILE)
         continue;

      const fs_builder abld = bld.annotate(
         ralloc_asprintf(this->mem_ctx, "FB write target %d", target));

      elk_fs_reg src0_alpha;
      if (devinfo->ver >= 6 && replicate_alpha && target != 0)
         src0_alpha = offset(outputs[0], bld, 3);

      inst = emit_single_fb_write(abld, this->outputs[target],
                                  this->dual_src_output, src0_alpha, 4);
      inst->target = target;
   }

   if (inst == NULL) {
      /* Even if there are no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      /* FINISHME: Factor out this frequently recurring pattern into a
       * helper function.
       */
      const elk_fs_reg srcs[] = { reg_undef, reg_undef,
                                  reg_undef, offset(this->outputs[0], bld, 3) };
      const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD, 4);
      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);

      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
      inst->target = 0;
   }

   inst->last_rt = true;
   inst->eot = true;
}

void
elk_fs_visitor::emit_fb_writes()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
   elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;

   if (source_depth_to_render_target && devinfo->ver == 6) {
      /* For outputting oDepth on gfx6, SIMD8 writes have to be used.  This
       * would require SIMD8 moves of each half to message regs, e.g. by using
       * the SIMD lowering pass.  Unfortunately this is more difficult than it
       * sounds because the SIMD8 single-source message lacks channel selects
       * for the second and third subspans.
       */
      limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
   }

   /* ANV doesn't know about the sample mask output during wm key creation,
    * so we compute here whether we need to replicate alpha and emit the
    * alpha-to-coverage workaround.
    */
   const bool replicate_alpha = key->alpha_test_replicate_alpha ||
      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
       (sample_mask.file == BAD_FILE || devinfo->ver == 6));

   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
                                this->outputs[0].file != BAD_FILE);
   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);

   do_emit_fb_writes(key->nr_color_regions, replicate_alpha);
}

void
elk_fs_visitor::emit_urb_writes(const elk_fs_reg &gs_vertex_count)
{
   int slot, urb_offset, length;
   int starting_urb_offset = 0;
   const struct elk_vue_prog_data *vue_prog_data =
      elk_vue_prog_data(this->prog_data);
   const struct elk_vs_prog_key *vs_key =
      (const struct elk_vs_prog_key *) this->key;
   const GLbitfield64 psiz_mask =
      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ | VARYING_BIT_PRIMITIVE_SHADING_RATE;
   const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
   bool flush;
   elk_fs_reg sources[8];
   elk_fs_reg urb_handle;

   switch (stage) {
   case MESA_SHADER_VERTEX:
      urb_handle = vs_payload().urb_handles;
      break;
   case MESA_SHADER_TESS_EVAL:
      urb_handle = tes_payload().urb_output;
      break;
   case MESA_SHADER_GEOMETRY:
      urb_handle = gs_payload().urb_handles;
      break;
   default:
      unreachable("invalid stage");
   }

   const fs_builder bld = fs_builder(this).at_end();

   elk_fs_reg per_slot_offsets;

   if (stage == MESA_SHADER_GEOMETRY) {
      const struct elk_gs_prog_data *gs_prog_data =
         elk_gs_prog_data(this->prog_data);

      /* We need to increment the Global Offset to skip over the control data
       * header and the extra "Vertex Count" field (1 HWord) at the beginning
       * of the VUE.  We're counting in OWords, so the units are doubled.
       */
      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
      if (gs_prog_data->static_vertex_count == -1)
         starting_urb_offset += 2;

      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
      const int output_vertex_size_owords =
         gs_prog_data->output_vertex_size_hwords * 2;

      if (gs_vertex_count.file == IMM) {
         per_slot_offsets = elk_imm_ud(output_vertex_size_owords *
                                       gs_vertex_count.ud);
      } else {
         per_slot_offsets = vgrf(glsl_uint_type());
         bld.MUL(per_slot_offsets, gs_vertex_count,
                 elk_imm_ud(output_vertex_size_owords));
      }
   }
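
   /* Worked example (hypothetical numbers): a GS with a 1-HWord control data
    * header and a dynamic vertex count starts writing at OWord offset
    * 2 * 1 + 2 = 4, and each emitted vertex advances the per-slot offset by
    * output_vertex_size_hwords * 2 OWords.
    */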

   length = 0;
   urb_offset = starting_urb_offset;
   flush = false;

   /* SSO shaders can have VUE slots allocated which are never actually
    * written to, so ignore them when looking for the last (written) slot.
    */
   int last_slot = vue_map->num_slots - 1;
   while (last_slot > 0 &&
          (vue_map->slot_to_varying[last_slot] == ELK_VARYING_SLOT_PAD ||
           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
      last_slot--;
   }

   bool urb_written = false;
   for (slot = 0; slot < vue_map->num_slots; slot++) {
      int varying = vue_map->slot_to_varying[slot];
      switch (varying) {
      case VARYING_SLOT_PSIZ: {
         /* The point size varying slot is the VUE header and is always in
          * the VUE map.  But often none of the special varyings that live
          * there are written and in that case we can skip writing to the
          * VUE header, provided the corresponding state properly clamps the
          * values further down the pipeline. */
         if ((vue_map->slots_valid & psiz_mask) == 0) {
            assert(length == 0);
            urb_offset++;
            break;
         }

         elk_fs_reg zero(VGRF, alloc.allocate(dispatch_width / 8),
                         ELK_REGISTER_TYPE_UD);
         bld.MOV(zero, elk_imm_ud(0u));

         if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
             this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
            sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
         } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
            uint32_t one_fp16 = 0x3C00; /* 1.0 in half-float */
            elk_fs_reg one_by_one_fp16(VGRF, alloc.allocate(dispatch_width / 8),
                                       ELK_REGISTER_TYPE_UD);
            bld.MOV(one_by_one_fp16, elk_imm_ud((one_fp16 << 16) | one_fp16));
            sources[length++] = one_by_one_fp16;
         } else {
            sources[length++] = zero;
         }

         if (vue_map->slots_valid & VARYING_BIT_LAYER)
            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
         else
            sources[length++] = zero;
         break;
      }
      case ELK_VARYING_SLOT_NDC:
      case VARYING_SLOT_EDGE:
         unreachable("unexpected scalar vs output");
         break;

      default:
         /* gl_Position is always in the vue map, but isn't always written by
          * the shader.  Other varyings (clip distances) get added to the vue
          * map but don't always get written.  In those cases, the
          * corresponding this->outputs[] slot will be invalid and we can
          * skip the urb write for the varying.  If we've already queued up
          * a vue slot for writing we flush a mlen 5 urb write, otherwise we
          * just advance the urb_offset.
          */
         if (varying == ELK_VARYING_SLOT_PAD ||
             this->outputs[varying].file == BAD_FILE) {
            if (length > 0)
               flush = true;
            else
               urb_offset++;
            break;
         }

         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
             (varying == VARYING_SLOT_COL0 ||
              varying == VARYING_SLOT_COL1 ||
              varying == VARYING_SLOT_BFC0 ||
              varying == VARYING_SLOT_BFC1)) {
            /* We need to clamp these guys, so do a saturating MOV into a
             * temp register and use that for the payload.
             */
            for (int i = 0; i < 4; i++) {
               elk_fs_reg reg = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                           outputs[varying].type);
               elk_fs_reg src = offset(this->outputs[varying], bld, i);
               set_saturate(true, bld.MOV(reg, src));
               sources[length++] = reg;
            }
         } else {
            int slot_offset = 0;

            /* When using Primitive Replication, there may be multiple slots
             * assigned to POS.
             */
            if (varying == VARYING_SLOT_POS)
               slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];

            for (unsigned i = 0; i < 4; i++) {
               sources[length++] = offset(this->outputs[varying], bld,
                                          i + (slot_offset * 4));
            }
         }
         break;
      }

      const fs_builder abld = bld.annotate("URB write");

      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
       * the last slot, or if we need to flush (see the BAD_FILE varying case
       * above), emit a URB write send now to flush out the data.
       */
      if (length == 8 || (length > 0 && slot == last_slot))
         flush = true;
      if (flush) {
         elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];

         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
         srcs[URB_LOGICAL_SRC_DATA] = elk_fs_reg(VGRF,
                                                 alloc.allocate((dispatch_width / 8) * length),
                                                 ELK_REGISTER_TYPE_F);
         srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);

         elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                                       srcs, ARRAY_SIZE(srcs));

         /* For GS the thread is ended in emit_gs_thread_end(), not here. */
         inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;

         inst->offset = urb_offset;
         urb_offset = starting_urb_offset + slot + 1;
         length = 0;
         flush = false;
         urb_written = true;
      }
   }

   /* If we don't have any valid slots to write, just do a minimal urb write
    * send to terminate the shader.  This includes 1 slot of undefined data,
    * because it's invalid to write 0 data:
    *
    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
    * Write Data Payload:
    *
    *    "The write data payload can be between 1 and 8 message phases long."
    */
   if (!urb_written) {
      /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
       * end the thread, and emit_gs_thread_end() already emits a SEND with
       * EOT at the end of the program for us.
       */
      if (stage == MESA_SHADER_GEOMETRY)
         return;

      elk_fs_reg uniform_urb_handle = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                                 ELK_REGISTER_TYPE_UD);
      elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                      ELK_REGISTER_TYPE_UD);

      bld.exec_all().MOV(uniform_urb_handle, urb_handle);

      elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
      srcs[URB_LOGICAL_SRC_DATA] = payload;
      srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);

      elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                                   srcs, ARRAY_SIZE(srcs));
      inst->eot = true;
      inst->offset = 1;
      return;
   }
}

void
elk_fs_visitor::emit_urb_fence()
{
   const fs_builder bld = fs_builder(this).at_end();
   elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
   elk_fs_inst *fence = bld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE, dst,
                                 elk_vec8_grf(0, 0),
                                 elk_imm_ud(true),
                                 elk_imm_ud(0));
   fence->sfid = ELK_SFID_URB;
   fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_LOCAL,
                                    LSC_FLUSH_TYPE_NONE, true);

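   /* The scheduling fence consumes the memory fence's response register,
    * which should keep later instructions from being reordered ahead of the
    * fence completion.
    */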
   bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE,
                                   bld.null_reg_ud(),
                                   &dst,
                                   1);
}

void
elk_fs_visitor::emit_cs_terminate()
{
   assert(devinfo->ver >= 7);
   const fs_builder bld = fs_builder(this).at_end();

   /* We can't directly send from g0, since sends with EOT have to use
    * g112-127.  So, copy it to a virtual register; the register allocator
    * will make sure it uses the appropriate register range.
    */
   struct elk_reg g0 = retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD);
   elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(1), ELK_REGISTER_TYPE_UD);
   bld.group(8, 0).exec_all().MOV(payload, g0);

   /* Send a message to the thread spawner to terminate the thread. */
   elk_fs_inst *inst = bld.exec_all()
                          .emit(ELK_CS_OPCODE_CS_TERMINATE, reg_undef, payload);
   inst->eot = true;
}

elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
                               const struct elk_compile_params *params,
                               const elk_base_prog_key *key,
                               struct elk_stage_prog_data *prog_data,
                               const nir_shader *shader,
                               unsigned dispatch_width,
                               bool needs_register_pressure,
                               bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, prog_data, debug_enabled),
     key(key), gs_compile(NULL), prog_data(prog_data),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(dispatch_width),
     api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
}

elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
                               const struct elk_compile_params *params,
                               const elk_wm_prog_key *key,
                               struct elk_wm_prog_data *prog_data,
                               const nir_shader *shader,
                               unsigned dispatch_width,
                               bool needs_register_pressure,
                               bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, &prog_data->base,
                        debug_enabled),
     key(&key->base), gs_compile(NULL), prog_data(&prog_data->base),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(dispatch_width),
     api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
   assert(api_subgroup_size == 0 ||
          api_subgroup_size == 8 ||
          api_subgroup_size == 16 ||
          api_subgroup_size == 32);
}

elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
                               const struct elk_compile_params *params,
                               struct elk_gs_compile *c,
                               struct elk_gs_prog_data *prog_data,
                               const nir_shader *shader,
                               bool needs_register_pressure,
                               bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, &prog_data->base.base,
                        debug_enabled),
     key(&c->key.base), gs_compile(c),
     prog_data(&prog_data->base.base),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(8),
     api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
   assert(api_subgroup_size == 0 ||
          api_subgroup_size == 8 ||
          api_subgroup_size == 16 ||
          api_subgroup_size == 32);
}

void
elk_fs_visitor::init()
{
   if (key)
      this->key_tex = &key->tex;
   else
      this->key_tex = NULL;

   this->max_dispatch_width = 32;
   this->prog_data = this->stage_prog_data;

   this->failed = false;
   this->fail_msg = NULL;

   this->payload_ = NULL;
   this->source_depth_to_render_target = false;
   this->runtime_check_aads_emit = false;
   this->first_non_payload_grf = 0;
   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF;

   this->uniforms = 0;
   this->last_scratch = 0;
   this->push_constant_loc = NULL;

   memset(&this->shader_stats, 0, sizeof(this->shader_stats));

   this->grf_used = 0;
   this->spilled_any_registers = false;
}

elk_fs_visitor::~elk_fs_visitor()
{
   delete this->payload_;
}
944