/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_eu.h"
#include "brw_fs.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
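    *
    * A typical usage sketch (illustrative only; "v" stands for an existing
    * fs_visitor and "src" for a previously created register):
    *
    *    const fs_builder bld = fs_builder(v).at_end();
    *    const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
    *    bld.ADD(tmp, src, brw_imm_f(1.0f));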
    */
   class fs_builder {
   public:
      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
#ifndef NDEBUG
         annotation.str = inst->annotation;
#else
         annotation.str = NULL;
#endif
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
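       *
       * For instance (illustrative), on a SIMD16 builder group(8, 1) returns
       * a SIMD8 builder whose instructions apply to channels 8..15 of the
       * parent builder's 16-channel group.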
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder. That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions whose channel group is not aligned to
             * their own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
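       *
       * A common (illustrative) pattern is bld.exec_all().group(1, 0) to
       * build scalar instructions that execute regardless of the dispatch
       * mask.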
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
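       *
       * E.g. (illustrative) vgrf(BRW_TYPE_F, 2) allocates enough contiguous
       * space for two dispatch_width-wide single-precision components.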
       */
      brw_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return brw_vgrf(shader->alloc.allocate(
                               DIV_ROUND_UP(n * brw_type_size_bytes(type) * dispatch_width(),
                                            unit * REG_SIZE) * unit),
                            type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      brw_reg
      null_reg_f() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_F));
      }

      brw_reg
      null_reg_df() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      brw_reg
      null_reg_d() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      brw_reg
      null_reg_ud() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      fs_inst *
      emit(const fs_inst &inst) const
      {
         return emit(new(shader->mem_ctx) fs_inst(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode) const
      {
         return emit(fs_inst(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst, src0));
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             src0, src1));
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1, const brw_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                fix_3src_operand(src0),
                                fix_3src_operand(src1),
                                fix_3src_operand(src2)));

         default:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(fs_inst(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      fs_inst *
      emit(fs_inst *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
#ifndef NDEBUG
         inst->annotation = annotation.str;
#endif

         if (block)
            static_cast<fs_inst *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
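       *
       * For example (illustrative), emit_minmax(dst, a, b, BRW_CONDITIONAL_GE)
       * computes the maximum of the two sources, while BRW_CONDITIONAL_L
       * computes the minimum.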
       */
      fs_inst *
      emit_minmax(const brw_reg &dst, const brw_reg &src0,
                  const brw_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
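       *
       * Illustrative use: reduce a potentially divergent value (e.g. a
       * surface index) to a single scalar before using it in a SEND:
       *
       *    const brw_reg index = bld.emit_uniformize(divergent_index);
       *
       * where "divergent_index" is a placeholder for some per-channel value.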
       */
      brw_reg
      emit_uniformize(const brw_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const brw_reg chan_index = vgrf(BRW_TYPE_UD);
         const brw_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return brw_reg(component(dst, 0));
      }

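      /**
       * Copy \p num_components dispatch_width-wide components starting at
       * \p src into a newly allocated contiguous VGRF and return it.
       */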
      brw_reg
      move_to_vgrf(const brw_reg &src, unsigned num_components) const
      {
         brw_reg *const src_comps = new brw_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const brw_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return brw_reg(dst);
      }

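      /**
       * Emit a single combining step of a scan: apply \p opcode (with
       * conditional mod \p mod where applicable) between the region of
       * \p tmp selected by (\p left_offset, \p left_stride) and the region
       * selected by (\p right_offset, \p right_stride), writing the result
       * into the right-hand region. 64-bit integer types are lowered
       * manually where the hardware lacks native support.
       */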
      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const brw_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         brw_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) &&
             (!shader->devinfo->has_64bit_int || shader->devinfo->ver >= 20)) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               brw_reg right_low = subscript(right, BRW_TYPE_UD, 0);
               brw_reg left_low = subscript(left, BRW_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_type_with_size(tmp.type, 32);
               brw_reg right_high = subscript(right, type32, 1);
               brw_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                * l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_TYPE_UD),
                   retype(right_low, BRW_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

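      /**
       * Perform an in-place inclusive scan of \p tmp across the dispatch
       * width using \p opcode (and \p mod for SEL-based min/max scans),
       * combining channels within clusters of \p cluster_size channels.
       */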
      void
      emit_scan(enum opcode opcode, const brw_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            brw_reg left = tmp;
            brw_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (brw_type_size_bytes(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle. Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }

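      /**
       * Emit an UNDEF instruction covering the same register region as the
       * destination of \p old_inst.
       */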
      fs_inst *
      emit_undef_for_dst(const fs_inst *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                              retype(old_inst->dst, BRW_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
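       *
       * Each op comes in two flavors (illustrative usage):
       *
       *    bld.ADD(dst, a, b);                // write to an explicit dst
       *    const brw_reg sum = bld.ADD(a, b); // allocate a VGRF for the result
       *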
       * @{
       */
#define _ALU1(prefix, op)                                   \
      fs_inst *                                             \
      op(const brw_reg &dst, const brw_reg &src0) const     \
      {                                                     \
         assert(_dispatch_width == 1 ||                     \
                (dst.file >= VGRF && dst.stride != 0) ||    \
                (dst.file < VGRF && dst.hstride != 0));     \
         return emit(prefix##op, dst, src0);                \
      }                                                     \
      brw_reg                                               \
      op(const brw_reg &src0, fs_inst **out = NULL) const   \
      {                                                     \
         fs_inst *inst = op(vgrf(src0.type), src0);         \
         if (out) *out = inst;                              \
         return inst->dst;                                  \
      }
#define ALU1(op) _ALU1(BRW_OPCODE_, op)
#define VIRT1(op) _ALU1(SHADER_OPCODE_, op)

      fs_inst *
      alu2(opcode op, const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return emit(op, dst, src0, src1);
      }
      brw_reg
      alu2(opcode op, const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         enum brw_reg_type inferred_dst_type =
            brw_type_larger_of(src0.type, src1.type);
         fs_inst *inst = alu2(op, vgrf(inferred_dst_type), src0, src1);
         if (out) *out = inst;
         return inst->dst;
      }

#define _ALU2(prefix, op)                                                     \
      fs_inst *                                                               \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const  \
      {                                                                       \
         return alu2(prefix##op, dst, src0, src1);                            \
      }                                                                       \
      brw_reg                                                                 \
      op(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const \
      {                                                                       \
         return alu2(prefix##op, src0, src1, out);                            \
      }
#define ALU2(op) _ALU2(BRW_OPCODE_, op)
#define VIRT2(op) _ALU2(SHADER_OPCODE_, op)

#define ALU2_ACC(op)                                                          \
      fs_inst *                                                               \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const  \
      {                                                                       \
         fs_inst *inst = emit(BRW_OPCODE_##op, dst, src0, src1);              \
         inst->writes_accumulator = true;                                     \
         return inst;                                                         \
      }

#define ALU3(op)                                                              \
      fs_inst *                                                               \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,        \
         const brw_reg &src2) const                                           \
      {                                                                       \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);                 \
      }

      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

      VIRT1(RCP)
      VIRT1(RSQ)
      VIRT1(SQRT)
      VIRT1(EXP2)
      VIRT1(LOG2)
      VIRT2(POW)
      VIRT2(INT_QUOTIENT)
      VIRT2(INT_REMAINDER)
      VIRT1(SIN)
      VIRT1(COS)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef VIRT2
#undef _ALU2
#undef ALU1
#undef VIRT1
#undef _ALU1
      /** @} */

      fs_inst *
      ADD(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return alu2(BRW_OPCODE_ADD, dst, src0, src1);
      }

      brw_reg
      ADD(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         if (src1.file == IMM && src1.ud == 0 && !out)
            return src0;

         return alu2(BRW_OPCODE_ADD, src0, src1, out);
      }

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
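       *
       * A typical (illustrative) use updates only the flag register ("a" and
       * "b" are placeholders for previously built registers):
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_GE);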
       */
      fs_inst *
      CMP(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      fs_inst *
      CMPN(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      fs_inst *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      fs_inst *
      CSEL(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           const brw_reg &src2, brw_conditional_mod condition) const
      {
         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, src2.type),
                                 retype(src0, src2.type),
                                 retype(src1, src2.type),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      fs_inst *
      LRP(const brw_reg &dst, const brw_reg &x, const brw_reg &y,
          const brw_reg &a) const
      {
         if (shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const brw_reg y_times_a = vgrf(dst.type);
            const brw_reg one_minus_a = vgrf(dst.type);
            const brw_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, brw_reg(one_minus_a));
            return ADD(dst, brw_reg(x_times_one_minus_a), brw_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
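       *
       * The first \p header_size sources are treated as a message header.
       * Illustrative use for a two-component payload with no header ("c0"
       * and "c1" stand for previously built registers):
       *
       *    brw_reg comps[2] = { c0, c1 };
       *    bld.LOAD_PAYLOAD(dst, comps, 2, 0);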
       */
      fs_inst *
      LOAD_PAYLOAD(const brw_reg &dst, const brw_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         fs_inst *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * brw_type_size_bytes(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }

      fs_inst *
      VEC(const brw_reg &dst, const brw_reg *src, unsigned sources) const
      {
         return sources == 1 ? MOV(dst, src[0])
                             : LOAD_PAYLOAD(dst, src, sources, 0);
      }

      fs_inst *
      SYNC(enum tgl_sync_function sync) const
      {
         return emit(BRW_OPCODE_SYNC, null_reg_ud(), brw_imm_ud(sync));
      }

      fs_inst *
      UNDEF(const brw_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                              retype(dst, BRW_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      fs_inst *
      DPAS(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,
           unsigned sdepth, unsigned rcount) const
      {
         assert(_dispatch_width == 8 * reg_unit(shader->devinfo));
         assert(sdepth == 8);
         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

         fs_inst *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
         inst->sdepth = sdepth;
         inst->rcount = rcount;

         if (dst.type == BRW_TYPE_HF) {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE / 2;
         } else {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE;
         }

         return inst;
      }

      void
      VARYING_PULL_CONSTANT_LOAD(const brw_reg &dst,
                                 const brw_reg &surface,
                                 const brw_reg &surface_handle,
                                 const brw_reg &varying_offset,
                                 uint32_t const_offset,
                                 uint8_t alignment,
                                 unsigned components) const
      {
         assert(components <= 4);

         /* We have our constant surface use a pitch of 4 bytes, so our index can
          * be any component of a vector, and then we load 4 contiguous
          * components starting from that. TODO: Support loading fewer than 4.
          */
         brw_reg total_offset = ADD(varying_offset, brw_imm_ud(const_offset));

         /* The pull load message will load a vec4 (16 bytes). If we are loading
          * a double this means we are only loading 2 elements worth of data.
          * We also want to use a 32-bit data type for the dst of the load operation
          * so other parts of the driver don't get confused about the size of the
          * result.
          */
         brw_reg vec4_result = vgrf(BRW_TYPE_F, 4);

         brw_reg srcs[PULL_VARYING_CONSTANT_SRCS];
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
         srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
         srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment);

         fs_inst *inst = emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                              vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
         inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

         shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
      }

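      /**
       * Load the per-channel subgroup invocation index into a newly
       * allocated VGRF. A 16-bit type is used at SIMD16 and wider to halve
       * the register footprint of the result.
       */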
      brw_reg
      LOAD_SUBGROUP_INVOCATION() const
      {
         brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
         exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
         return reg;
      }

      fs_visitor *shader;

      fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
      fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
      fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      brw_reg
      fix_unsigned_negate(const brw_reg &src) const
      {
         if (src.type == BRW_TYPE_UD &&
             src.negate) {
            brw_reg temp = vgrf(BRW_TYPE_UD);
            MOV(temp, src);
            return brw_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      brw_reg
      fix_3src_operand(const brw_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         brw_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
      } annotation;
   };
}

static inline brw_reg
offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}

#endif