/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file elk_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR. The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
#include "elk_eu.h"
#include "elk_fs.h"
#include "elk_fs_builder.h"
#include "elk_nir.h"
#include "compiler/glsl_types.h"

using namespace elk;

/* Input data is organized with the per-primitive values first, followed
 * by the per-vertex values. The per-vertex values have interpolation
 * information associated with them, so use 4 components for each value.
 */

/* The register location here is relative to the start of the URB
 * data. It will get adjusted to be a real location before
 * generate_code() time.
 */
elk_fs_reg
elk_fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
                           unsigned channel, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);

   const struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);

   assert(prog_data->urb_setup[location] >= 0);
   unsigned nr = prog_data->urb_setup[location];
   channel += prog_data->urb_setup_channel[location];

   /* Adjust so we start counting from the first per_vertex input. */
   assert(nr >= prog_data->num_per_primitive_inputs);
   nr -= prog_data->num_per_primitive_inputs;

   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
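   /* Each per-vertex input occupies four consecutive ATTR registers, one per
    * channel, so scale the attribute index by 4 before adding the channel.
    */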
   const unsigned regnr = per_vertex_start + (nr * 4) + channel;

   return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
}

/* The register location here is relative to the start of the URB
 * data. It will get adjusted to be a real location before
 * generate_code() time.
 */
elk_fs_reg
elk_fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);

   const struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);

   comp += prog_data->urb_setup_channel[location];

   assert(prog_data->urb_setup[location] >= 0);

   const unsigned regnr = prog_data->urb_setup[location] + comp / 4;

   assert(regnr < prog_data->num_per_primitive_inputs);

   return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
}

/** Emits the interpolation for the varying inputs. */
void
elk_fs_visitor::emit_interpolation_setup_gfx4()
{
   struct elk_reg g1_uw = retype(elk_vec1_grf(1, 0), ELK_REGISTER_TYPE_UW);

   fs_builder abld = fs_builder(this).at_end().annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_uint_type());
   this->pixel_y = vgrf(glsl_uint_type());
   this->pixel_x.type = ELK_REGISTER_TYPE_UW;
   this->pixel_y.type = ELK_REGISTER_TYPE_UW;
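   /* Each nibble of the vector immediates below (least significant nibble
    * first) is added to the replicated subspan origin: 0x10101010 yields the
    * per-channel X offsets 0,1,0,1,... and 0x11001100 the Y offsets
    * 0,0,1,1,..., turning every subspan origin into the four pixel
    * coordinates of its 2x2 block.
    */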
   abld.ADD(this->pixel_x,
            elk_fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            elk_fs_reg(elk_imm_v(0x10101010)));
   abld.ADD(this->pixel_y,
            elk_fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            elk_fs_reg(elk_imm_v(0x11001100)));

   const fs_builder bld = fs_builder(this).at_end();
   abld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL] =
      vgrf(glsl_vec2_type());
   const elk_fs_reg &delta_xy = this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL];
   const elk_fs_reg xstart(negate(elk_vec1_grf(1, 0)));
   const elk_fs_reg ystart(negate(elk_vec1_grf(1, 1)));

   if (devinfo->has_pln) {
      for (unsigned i = 0; i < dispatch_width / 8; i++) {
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
                             quarter(this->pixel_x, i), xstart);
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
                             quarter(this->pixel_y, i), ystart);
      }
   } else {
      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
   }

   this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);

   /* The SF program automatically handles doing the perspective correction or
    * not based on wm_prog_data::interp_mode[] so we can use the same pixel
    * offsets for both perspective and non-perspective.
    */
   this->delta_xy[ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
      this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL];

   abld = bld.annotate("compute pos.w and 1/pos.w");
   /* Compute wpos.w. It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = vgrf(glsl_float_type());
   abld.emit(ELK_FS_OPCODE_LINTERP, wpos_w, delta_xy,
             interp_reg(abld, VARYING_SLOT_POS, 3, 0));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = vgrf(glsl_float_type());
   abld.emit(ELK_SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

/** Emits the interpolation for the varying inputs. */
void
elk_fs_visitor::emit_interpolation_setup_gfx6()
{
   const fs_builder bld = fs_builder(this).at_end();
   fs_builder abld = bld.annotate("compute pixel centers");

   this->pixel_x = vgrf(glsl_float_type());
   this->pixel_y = vgrf(glsl_float_type());

   const struct elk_wm_prog_key *wm_key = (elk_wm_prog_key*) this->key;
   struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(prog_data);

   elk_fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
   elk_fs_reg int_sample_offset_xy; /* Used on Gen8+ */
   elk_fs_reg half_int_sample_offset_x, half_int_sample_offset_y;

   /* The thread payload only delivers subspan locations (ss0, ss1,
    * ss2, ...). Since a subspan covers a 2x2 pixel block, we need to
    * generate 4 pixel coordinates out of each subspan location. We do this
    * by replicating a subspan coordinate 4 times and adding an offset of 1
    * in each direction from the initial top left (tl) location to generate
    * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
    * (br = +1 in x, +1 in y).
    *
    * The locations we build look like this in SIMD8 :
    *
    *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
    *
    * The value 0x11001010 is a vector of 8 half-byte offsets. It adds the
    * following to generate the 4 pixel coordinates out of subspan 0:
    *
    *   0x
    *    1 : ss0.y + 1 -> ss0.br.y
    *    1 : ss0.y + 1 -> ss0.bl.y
    *    0 : ss0.y + 0 -> ss0.tr.y
    *    0 : ss0.y + 0 -> ss0.tl.y
    *    1 : ss0.x + 1 -> ss0.br.x
    *    0 : ss0.x + 0 -> ss0.bl.x
    *    1 : ss0.x + 1 -> ss0.tr.x
    *    0 : ss0.x + 0 -> ss0.tl.x
    *
    * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
    * coordinates out of 2 subspan coordinates in a single ADD instruction
    * (twice the operation above).
    */
   int_sample_offset_xy = elk_fs_reg(elk_imm_v(0x11001010));
   half_int_sample_offset_x = elk_fs_reg(elk_imm_uw(0));
   half_int_sample_offset_y = elk_fs_reg(elk_imm_uw(0));
   /* On Gfx12.5, because of regioning restrictions, the interpolation code
    * is slightly different and works off X & Y only inputs. The ordering
    * of the half bytes here is a bit odd, with each subspan replicated
    * twice and every other element discarded:
    *
    *             ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
    *   X offset:    0      0      1      0      0      0      1      0
    *   Y offset:    0      0      0      0      1      0      1      0
    */
   int_sample_offset_x = elk_fs_reg(elk_imm_v(0x01000100));
   int_sample_offset_y = elk_fs_reg(elk_imm_v(0x01010000));

   elk_fs_reg int_pixel_offset_xy = int_sample_offset_xy; /* Used on Gen8+ */
   elk_fs_reg half_int_pixel_offset_x = half_int_sample_offset_x;
   elk_fs_reg half_int_pixel_offset_y = half_int_sample_offset_y;

   for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
      const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
      /* According to the "PS Thread Payload for Normal Dispatch"
       * pages on the BSpec, subspan X/Y coordinates are stored in
       * R1.2-R1.5/R2.2-R2.5 on gfx6+, and in R0.10-R0.13/R1.10-R1.13
       * on gfx20+. gi_reg is the 32B section of the GRF that
       * contains the subspan coordinates.
       */
      const struct elk_reg gi_reg = elk_vec1_grf(i + 1, 0);
      const struct elk_reg gi_uw = retype(gi_reg, ELK_REGISTER_TYPE_UW);

      if (devinfo->ver >= 8 || dispatch_width == 8) {
         /* The "Register Region Restrictions" page says for BDW (and newer,
          * presumably):
          *
          *    "When destination spans two registers, the source may be one or
          *    two registers. The destination elements must be evenly split
          *    between the two registers."
          *
          * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
          * to compute our pixel centers.
          */
         const fs_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         elk_fs_reg int_pixel_xy = dbld.vgrf(ELK_REGISTER_TYPE_UW);

         dbld.ADD(int_pixel_xy,
                  elk_fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
                  int_pixel_offset_xy);

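         /* The PIXEL_X/PIXEL_Y opcodes split the packed xy vector back into
          * separate X and Y values (converted to float); the second source
          * carries the half-pixel offset, which is zero here.
          */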
         hbld.emit(ELK_FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
                   horiz_stride(half_int_pixel_offset_x, 0));
         hbld.emit(ELK_FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
                   horiz_stride(half_int_pixel_offset_y, 0));
      } else {
         /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
          *
          *    "When destination spans two registers, the source MUST span
          *    two registers."
          *
          * Since the GRF source of the ADD will only read a single register,
          * we must do two separate ADDs in SIMD16.
          */
         const elk_fs_reg int_pixel_x = hbld.vgrf(ELK_REGISTER_TYPE_UW);
         const elk_fs_reg int_pixel_y = hbld.vgrf(ELK_REGISTER_TYPE_UW);

         hbld.ADD(int_pixel_x,
                  elk_fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
                  elk_fs_reg(elk_imm_v(0x10101010)));
         hbld.ADD(int_pixel_y,
                  elk_fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
                  elk_fs_reg(elk_imm_v(0x11001100)));

         /* As of gfx6, we can no longer mix float and int sources. We have
          * to turn the integer pixel centers into floats for their actual
          * use.
          */
         hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
         hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
      }
   }

   abld = bld.annotate("compute pos.z");
   if (wm_prog_data->uses_src_depth)
      this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);

   if (wm_prog_data->uses_src_w) {
      abld = bld.annotate("compute pos.w");
      this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg);
      this->wpos_w = vgrf(glsl_float_type());
      abld.emit(ELK_SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
   }

   if (wm_key->persample_interp == ELK_SOMETIMES) {
      assert(!devinfo->needs_unlit_centroid_workaround);

      const fs_builder ubld = bld.exec_all().group(16, 0);
      bool loaded_flag = false;
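      /* The dynamic MSAA flag only needs to be loaded into the flag register
       * once; the predicated MOVs below then reuse it for every barycentric
       * mode that has to be overridden with the per-sample data.
       */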

      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
            continue;

         /* The sample mode will always be the top bit set in the perspective
          * or non-perspective section. In the case where no SAMPLE mode was
          * requested, elk_wm_prog_data_barycentric_modes() will swap out the top
          * mode for SAMPLE so this works regardless of whether SAMPLE was
          * requested or not.
          */
         int sample_mode;
         if (BITFIELD_BIT(i) & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) {
            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
                                        ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
         } else {
            sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
                                        ELK_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
         }
         assert(wm_prog_data->barycentric_interp_modes &
                BITFIELD_BIT(sample_mode));

         if (i == sample_mode)
            continue;

         uint8_t *barys = fs_payload().barycentric_coord_reg[i];

         uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode];
         assert(barys[0] && sample_barys[0]);

         if (!loaded_flag) {
            check_dynamic_msaa_flag(ubld, wm_prog_data,
                                    INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
            loaded_flag = true;
         }

         for (unsigned j = 0; j < dispatch_width / 8; j++) {
            set_predicate(
               ELK_PREDICATE_NORMAL,
               ubld.MOV(elk_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
                        elk_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
         }
      }
   }

   for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
      this->delta_xy[i] = fetch_barycentric_reg(
         bld, fs_payload().barycentric_coord_reg[i]);
   }

   uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
      (1 << ELK_BARYCENTRIC_PERSPECTIVE_CENTROID |
       1 << ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);

   if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
      /* Get the pixel/sample mask into f0 so that we know which
       * pixels are lit. Then, for each channel that is unlit,
       * replace the centroid data with non-centroid data.
       */
      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         bld.exec_all().group(1, 0)
            .MOV(retype(elk_flag_reg(0, i), ELK_REGISTER_TYPE_UW),
                 retype(elk_vec1_grf(1 + i, 7), ELK_REGISTER_TYPE_UW));
      }

      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(centroid_modes & (1 << i)))
            continue;

         const elk_fs_reg centroid_delta_xy = delta_xy[i];
         const elk_fs_reg &pixel_delta_xy = delta_xy[i - 1];

         delta_xy[i] = bld.vgrf(ELK_REGISTER_TYPE_F, 2);

         for (unsigned c = 0; c < 2; c++) {
            for (unsigned q = 0; q < dispatch_width / 8; q++) {
               set_predicate(ELK_PREDICATE_NORMAL,
                  bld.quarter(q).SEL(
                     quarter(offset(delta_xy[i], bld, c), q),
                     quarter(offset(centroid_delta_xy, bld, c), q),
                     quarter(offset(pixel_delta_xy, bld, c), q)));
            }
         }
      }
   }
}

static enum elk_conditional_mod
cond_for_alpha_func(enum compare_func func)
{
   switch(func) {
   case COMPARE_FUNC_GREATER:
      return ELK_CONDITIONAL_G;
   case COMPARE_FUNC_GEQUAL:
      return ELK_CONDITIONAL_GE;
   case COMPARE_FUNC_LESS:
      return ELK_CONDITIONAL_L;
   case COMPARE_FUNC_LEQUAL:
      return ELK_CONDITIONAL_LE;
   case COMPARE_FUNC_EQUAL:
      return ELK_CONDITIONAL_EQ;
   case COMPARE_FUNC_NOTEQUAL:
      return ELK_CONDITIONAL_NEQ;
   default:
      unreachable("Not reached");
   }
}

/**
 * Alpha test support for when we compile it into the shader instead
 * of using the normal fixed-function alpha test.
 */
void
elk_fs_visitor::emit_alpha_test()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
   const fs_builder bld = fs_builder(this).at_end();
   const fs_builder abld = bld.annotate("Alpha test");

   elk_fs_inst *cmp;
   if (key->alpha_test_func == COMPARE_FUNC_ALWAYS)
      return;

   if (key->alpha_test_func == COMPARE_FUNC_NEVER) {
      /* f0.1 = 0 */
      elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
                                              ELK_REGISTER_TYPE_UW));
      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
                     ELK_CONDITIONAL_NEQ);
   } else {
      /* RT0 alpha */
      elk_fs_reg color = offset(outputs[0], bld, 3);

      /* f0.1 &= func(color, ref) */
      cmp = abld.CMP(bld.null_reg_f(), color, elk_imm_f(key->alpha_test_ref),
                     cond_for_alpha_func(key->alpha_test_func));
   }
   cmp->predicate = ELK_PREDICATE_NORMAL;
   cmp->flag_subreg = 1;
}

elk_fs_inst *
elk_fs_visitor::emit_single_fb_write(const fs_builder &bld,
                                     elk_fs_reg color0, elk_fs_reg color1,
                                     elk_fs_reg src0_alpha, unsigned components)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);

   /* Hand over gl_FragDepth or the payload depth. */
   const elk_fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg);
   elk_fs_reg src_depth;

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      src_depth = frag_depth;
   } else if (source_depth_to_render_target) {
      /* If we got here, we're in one of those strange Gen4-5 cases where
       * we're forced to pass the source depth, unmodified, to the FB write.
       * In this case, we don't want to use pixel_z because we may not have
       * set up interpolation. It's also perfectly safe because it only
       * happens on old hardware (no coarse interpolation) and this is
       * explicitly the pass-through case.
       */
      assert(devinfo->ver <= 5);
      src_depth = fetch_payload_reg(bld, fs_payload().source_depth_reg);
   }

   const elk_fs_reg sources[] = {
      color0, color1, src0_alpha, src_depth, dst_depth,
      (prog_data->uses_omask ? sample_mask : elk_fs_reg()),
      elk_imm_ud(components)
   };
   assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
   elk_fs_inst *write = bld.emit(ELK_FS_OPCODE_FB_WRITE_LOGICAL, elk_fs_reg(),
                                 sources, ARRAY_SIZE(sources));

   if (prog_data->uses_kill) {
      write->predicate = ELK_PREDICATE_NORMAL;
      write->flag_subreg = sample_mask_flag_subreg(*this);
   }

   return write;
}

void
elk_fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha)
{
   const fs_builder bld = fs_builder(this).at_end();
   elk_fs_inst *inst = NULL;

   for (int target = 0; target < nr_color_regions; target++) {
      /* Skip over outputs that weren't written. */
      if (this->outputs[target].file == BAD_FILE)
         continue;

      const fs_builder abld = bld.annotate(
         ralloc_asprintf(this->mem_ctx, "FB write target %d", target));

      elk_fs_reg src0_alpha;
      if (devinfo->ver >= 6 && replicate_alpha && target != 0)
         src0_alpha = offset(outputs[0], bld, 3);

      inst = emit_single_fb_write(abld, this->outputs[target],
                                  this->dual_src_output, src0_alpha, 4);
      inst->target = target;
   }

   if (inst == NULL) {
      /* Even if there are no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      /* FINISHME: Factor out this frequently recurring pattern into a
       * helper function.
       */
      const elk_fs_reg srcs[] = { reg_undef, reg_undef,
                                  reg_undef, offset(this->outputs[0], bld, 3) };
      const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD, 4);
      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);

      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
      inst->target = 0;
   }

   inst->last_rt = true;
   inst->eot = true;
}

void
elk_fs_visitor::emit_fb_writes()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
   elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;

   if (source_depth_to_render_target && devinfo->ver == 6) {
      /* For outputting oDepth on gfx6, SIMD8 writes have to be used. This
       * would require SIMD8 moves of each half to message regs, e.g. by using
       * the SIMD lowering pass. Unfortunately this is more difficult than it
       * sounds because the SIMD8 single-source message lacks channel selects
       * for the second and third subspans.
       */
      limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
   }

   /* ANV doesn't know about the sample mask output during wm key creation,
    * so we compute whether we need to replicate alpha and emit the
    * alpha-to-coverage workaround here.
    */
   const bool replicate_alpha = key->alpha_test_replicate_alpha ||
      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
       (sample_mask.file == BAD_FILE || devinfo->ver == 6));

   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
                                this->outputs[0].file != BAD_FILE);
   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);

   do_emit_fb_writes(key->nr_color_regions, replicate_alpha);
}

void
elk_fs_visitor::emit_urb_writes(const elk_fs_reg &gs_vertex_count)
{
   int slot, urb_offset, length;
   int starting_urb_offset = 0;
   const struct elk_vue_prog_data *vue_prog_data =
      elk_vue_prog_data(this->prog_data);
   const struct elk_vs_prog_key *vs_key =
      (const struct elk_vs_prog_key *) this->key;
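   /* All of these varyings share the VUE header slot that also holds the
    * point size, which is why they are handled together in the
    * VARYING_SLOT_PSIZ case below.
    */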
   const GLbitfield64 psiz_mask =
      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ | VARYING_BIT_PRIMITIVE_SHADING_RATE;
   const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
   bool flush;
   elk_fs_reg sources[8];
   elk_fs_reg urb_handle;

   switch (stage) {
   case MESA_SHADER_VERTEX:
      urb_handle = vs_payload().urb_handles;
      break;
   case MESA_SHADER_TESS_EVAL:
      urb_handle = tes_payload().urb_output;
      break;
   case MESA_SHADER_GEOMETRY:
      urb_handle = gs_payload().urb_handles;
      break;
   default:
      unreachable("invalid stage");
   }

   const fs_builder bld = fs_builder(this).at_end();

   elk_fs_reg per_slot_offsets;

   if (stage == MESA_SHADER_GEOMETRY) {
      const struct elk_gs_prog_data *gs_prog_data =
         elk_gs_prog_data(this->prog_data);

      /* We need to increment the Global Offset to skip over the control data
       * header and the extra "Vertex Count" field (1 HWord) at the beginning
       * of the VUE. We're counting in OWords, so the units are doubled.
       */
      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
      if (gs_prog_data->static_vertex_count == -1)
         starting_urb_offset += 2;

      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
      const int output_vertex_size_owords =
         gs_prog_data->output_vertex_size_hwords * 2;

      if (gs_vertex_count.file == IMM) {
         per_slot_offsets = elk_imm_ud(output_vertex_size_owords *
                                       gs_vertex_count.ud);
      } else {
         per_slot_offsets = vgrf(glsl_uint_type());
         bld.MUL(per_slot_offsets, gs_vertex_count,
                 elk_imm_ud(output_vertex_size_owords));
      }
   }

   length = 0;
   urb_offset = starting_urb_offset;
   flush = false;

   /* SSO shaders can have VUE slots allocated which are never actually
    * written to, so ignore them when looking for the last (written) slot.
    */
   int last_slot = vue_map->num_slots - 1;
   while (last_slot > 0 &&
          (vue_map->slot_to_varying[last_slot] == ELK_VARYING_SLOT_PAD ||
           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
      last_slot--;
   }

   bool urb_written = false;
   for (slot = 0; slot < vue_map->num_slots; slot++) {
      int varying = vue_map->slot_to_varying[slot];
      switch (varying) {
      case VARYING_SLOT_PSIZ: {
         /* The point size varying slot is the vue header and is always in the
          * vue map. But often none of the special varyings that live there
          * are written and in that case we can skip writing to the vue
          * header, provided the corresponding state properly clamps the
          * values further down the pipeline. */
         if ((vue_map->slots_valid & psiz_mask) == 0) {
            assert(length == 0);
            urb_offset++;
            break;
         }

         elk_fs_reg zero(VGRF, alloc.allocate(dispatch_width / 8),
                         ELK_REGISTER_TYPE_UD);
         bld.MOV(zero, elk_imm_ud(0u));

         if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
             this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
            sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
         } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
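            /* No shading rate was written, so default to a 1x1 coarse pixel
             * size, encoded as two half-float 1.0 values packed into one
             * dword.
             */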
            uint32_t one_fp16 = 0x3C00;
            elk_fs_reg one_by_one_fp16(VGRF, alloc.allocate(dispatch_width / 8),
                                       ELK_REGISTER_TYPE_UD);
            bld.MOV(one_by_one_fp16, elk_imm_ud((one_fp16 << 16) | one_fp16));
            sources[length++] = one_by_one_fp16;
         } else {
            sources[length++] = zero;
         }

         if (vue_map->slots_valid & VARYING_BIT_LAYER)
            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
         else
            sources[length++] = zero;
         break;
      }
      case ELK_VARYING_SLOT_NDC:
      case VARYING_SLOT_EDGE:
         unreachable("unexpected scalar vs output");
         break;

      default:
         /* gl_Position is always in the vue map, but isn't always written by
          * the shader. Other varyings (clip distances) get added to the vue
          * map but don't always get written. In those cases, the
          * corresponding this->output[] slot will be invalid and we can skip
          * the urb write for the varying. If we've already queued up a vue
          * slot for writing we flush a mlen 5 urb write, otherwise we just
          * advance the urb_offset.
          */
         if (varying == ELK_VARYING_SLOT_PAD ||
             this->outputs[varying].file == BAD_FILE) {
            if (length > 0)
               flush = true;
            else
               urb_offset++;
            break;
         }

         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
             (varying == VARYING_SLOT_COL0 ||
              varying == VARYING_SLOT_COL1 ||
              varying == VARYING_SLOT_BFC0 ||
              varying == VARYING_SLOT_BFC1)) {
            /* We need to clamp these guys, so do a saturating MOV into a
             * temp register and use that for the payload.
             */
            for (int i = 0; i < 4; i++) {
               elk_fs_reg reg = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                           outputs[varying].type);
               elk_fs_reg src = offset(this->outputs[varying], bld, i);
               set_saturate(true, bld.MOV(reg, src));
               sources[length++] = reg;
            }
         } else {
            int slot_offset = 0;

            /* When using Primitive Replication, there may be multiple slots
             * assigned to POS.
             */
            if (varying == VARYING_SLOT_POS)
               slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];

            for (unsigned i = 0; i < 4; i++) {
               sources[length++] = offset(this->outputs[varying], bld,
                                          i + (slot_offset * 4));
            }
         }
         break;
      }

      const fs_builder abld = bld.annotate("URB write");

      /* If we've queued up 8 registers of payload (2 VUE slots), or if this
       * is the last slot, or if we need to flush (see the BAD_FILE varying
       * case above), emit a URB write send now to flush out the data.
       */
      if (length == 8 || (length > 0 && slot == last_slot))
         flush = true;
      if (flush) {
         elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];

         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
         srcs[URB_LOGICAL_SRC_DATA] = elk_fs_reg(VGRF,
            alloc.allocate((dispatch_width / 8) * length),
            ELK_REGISTER_TYPE_F);
         srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
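         /* Gather the queued sources into one contiguous VGRF so the send
          * can read them as a single payload.
          */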
         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);

         elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                                       srcs, ARRAY_SIZE(srcs));

         inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;

         inst->offset = urb_offset;
         urb_offset = starting_urb_offset + slot + 1;
         length = 0;
         flush = false;
         urb_written = true;
      }
   }

   /* If we don't have any valid slots to write, just do a minimal urb write
    * send to terminate the shader. This includes 1 slot of undefined data,
    * because it's invalid to write 0 data:
    *
    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
    * Write Data Payload:
    *
    *    "The write data payload can be between 1 and 8 message phases long."
    */
   if (!urb_written) {
      /* For GS, just turn EmitVertex() into a no-op. We don't want it to
       * end the thread, and emit_gs_thread_end() already emits a SEND with
       * EOT at the end of the program for us.
       */
      if (stage == MESA_SHADER_GEOMETRY)
         return;

      elk_fs_reg uniform_urb_handle = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                                 ELK_REGISTER_TYPE_UD);
      elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                      ELK_REGISTER_TYPE_UD);

      bld.exec_all().MOV(uniform_urb_handle, urb_handle);

      elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
      srcs[URB_LOGICAL_SRC_DATA] = payload;
      srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);

      elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                                   srcs, ARRAY_SIZE(srcs));
      inst->eot = true;
      inst->offset = 1;
      return;
   }
}

void
elk_fs_visitor::emit_urb_fence()
{
   const fs_builder bld = fs_builder(this).at_end();
   elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
   elk_fs_inst *fence = bld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE, dst,
                                 elk_vec8_grf(0, 0),
                                 elk_imm_ud(true),
                                 elk_imm_ud(0));
   fence->sfid = ELK_SFID_URB;
   fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_LOCAL,
                                    LSC_FLUSH_TYPE_NONE, true);

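   /* Make later instructions depend on the fence response so they cannot be
    * scheduled ahead of the URB fence completing.
    */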
   bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE,
                                   bld.null_reg_ud(),
                                   &dst,
                                   1);
}

void
elk_fs_visitor::emit_cs_terminate()
{
   assert(devinfo->ver >= 7);
   const fs_builder bld = fs_builder(this).at_end();

   /* We can't directly send from g0, since sends with EOT have to use
    * g112-127. So, copy it to a virtual register. The register allocator
    * will make sure it uses the appropriate register range.
    */
   struct elk_reg g0 = retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD);
   elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(1), ELK_REGISTER_TYPE_UD);
   bld.group(8, 0).exec_all().MOV(payload, g0);

   /* Send a message to the thread spawner to terminate the thread. */
   elk_fs_inst *inst = bld.exec_all()
      .emit(ELK_CS_OPCODE_CS_TERMINATE, reg_undef, payload);
   inst->eot = true;
}

elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
                               const struct elk_compile_params *params,
                               const elk_base_prog_key *key,
                               struct elk_stage_prog_data *prog_data,
                               const nir_shader *shader,
                               unsigned dispatch_width,
                               bool needs_register_pressure,
                               bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, prog_data, debug_enabled),
     key(key), gs_compile(NULL), prog_data(prog_data),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(dispatch_width),
     api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
}

elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
                               const struct elk_compile_params *params,
                               const elk_wm_prog_key *key,
                               struct elk_wm_prog_data *prog_data,
                               const nir_shader *shader,
                               unsigned dispatch_width,
                               bool needs_register_pressure,
                               bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, &prog_data->base,
                        debug_enabled),
     key(&key->base), gs_compile(NULL), prog_data(&prog_data->base),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(dispatch_width),
     api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
   assert(api_subgroup_size == 0 ||
          api_subgroup_size == 8 ||
          api_subgroup_size == 16 ||
          api_subgroup_size == 32);
}

elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
                               const struct elk_compile_params *params,
                               struct elk_gs_compile *c,
                               struct elk_gs_prog_data *prog_data,
                               const nir_shader *shader,
                               bool needs_register_pressure,
                               bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, &prog_data->base.base,
                        debug_enabled),
     key(&c->key.base), gs_compile(c),
     prog_data(&prog_data->base.base),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(8),
     api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
   assert(api_subgroup_size == 0 ||
          api_subgroup_size == 8 ||
          api_subgroup_size == 16 ||
          api_subgroup_size == 32);
}

void
elk_fs_visitor::init()
{
   if (key)
      this->key_tex = &key->tex;
   else
      this->key_tex = NULL;

   this->max_dispatch_width = 32;
   this->prog_data = this->stage_prog_data;

   this->failed = false;
   this->fail_msg = NULL;

   this->payload_ = NULL;
   this->source_depth_to_render_target = false;
   this->runtime_check_aads_emit = false;
   this->first_non_payload_grf = 0;
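   /* On gfx7+ there are no MRF registers; message payloads are built in
    * GRFs instead, so the top of the GRF file is reserved to stand in for
    * them (hence GFX7_MRF_HACK_START rather than ELK_MAX_GRF).
    */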
   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF;

   this->uniforms = 0;
   this->last_scratch = 0;
   this->push_constant_loc = NULL;

   memset(&this->shader_stats, 0, sizeof(this->shader_stats));

   this->grf_used = 0;
   this->spilled_any_registers = false;
}

elk_fs_visitor::~elk_fs_visitor()
{
   delete this->payload_;
}