/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_eu.h"
#include "brw_fs.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
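    *
    * A minimal usage sketch (illustrative only; "v" stands for some
    * fs_visitor and the register names are not from this file):
    *
    *    const fs_builder bld = fs_builder(&v).at_end();
    *    const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
    *    bld.MOV(tmp, brw_imm_f(1.0f));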
    */
   class fs_builder {
   public:
      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
#ifndef NDEBUG
         annotation.str = inst->annotation;
#else
         annotation.str = NULL;
#endif
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
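       *
       * For instance (an illustrative sketch), on a SIMD16 builder "bld",
       * bld.group(8, 1) yields a SIMD8 builder whose instructions apply to
       * channels 8..15.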
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with a channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
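       *
       * For instance (an illustrative sketch), vgrf(BRW_TYPE_F, 2) allocates
       * enough space for two dispatch_width-wide float components, which
       * offset() can then address one component at a time.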
       */
      brw_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return brw_vgrf(shader->alloc.allocate(
                               DIV_ROUND_UP(n * brw_type_size_bytes(type) * dispatch_width(),
                                            unit * REG_SIZE) * unit),
                            type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      brw_reg
      null_reg_f() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_F));
      }

      brw_reg
      null_reg_df() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      brw_reg
      null_reg_d() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      brw_reg
      null_reg_ud() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      fs_inst *
      emit(const fs_inst &inst) const
      {
         return emit(new(shader->mem_ctx) fs_inst(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode) const
      {
         return emit(fs_inst(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst, src0));
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst,
                                 src0, src1));
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1, const brw_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(fs_inst(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      fs_inst *
      emit(fs_inst *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
#ifndef NDEBUG
         inst->annotation = annotation.str;
#endif

         if (block)
            static_cast<fs_inst *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
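       *
       * For example (a sketch), emit_minmax(dst, a, b, BRW_CONDITIONAL_GE)
       * yields the maximum of a and b, and BRW_CONDITIONAL_L the minimum.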
       */
      fs_inst *
      emit_minmax(const brw_reg &dst, const brw_reg &src0,
                  const brw_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
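       *
       * For example (a sketch), a divergent surface index can be passed
       * through emit_uniformize() before being used by a send message that
       * requires a scalar value.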
       */
      brw_reg
      emit_uniformize(const brw_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const brw_reg chan_index = vgrf(BRW_TYPE_UD);
         const brw_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return brw_reg(component(dst, 0));
      }

      brw_reg
      move_to_vgrf(const brw_reg &src, unsigned num_components) const
      {
         brw_reg *const src_comps = new brw_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const brw_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return brw_reg(dst);
      }

      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const brw_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         brw_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) &&
             (!shader->devinfo->has_64bit_int || shader->devinfo->ver >= 20)) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               brw_reg right_low = subscript(right, BRW_TYPE_UD, 0);
               brw_reg left_low = subscript(left, BRW_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_type_with_size(tmp.type, 32);
               brw_reg right_high = subscript(right, type32, 1);
               brw_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_TYPE_UD),
                                  retype(right_low, BRW_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

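      /**
       * Perform an in-place inclusive scan of \p tmp, combining channels
       * with \p opcode and \p mod and restarting every \p cluster_size
       * channels (a descriptive summary; the precise stepping is given by
       * the emit_scan_step() calls below).
       */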
      void
      emit_scan(enum opcode opcode, const brw_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            brw_reg left = tmp;
            brw_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (brw_type_size_bytes(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }

      fs_inst *
      emit_undef_for_dst(const fs_inst *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, BRW_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define _ALU1(prefix, op)                                \
      fs_inst *                                          \
      op(const brw_reg &dst, const brw_reg &src0) const    \
      {                                                  \
         assert(_dispatch_width == 1 ||                  \
                (dst.file >= VGRF && dst.stride != 0) || \
                (dst.file < VGRF && dst.hstride != 0));  \
         return emit(prefix##op, dst, src0);             \
      }                                                  \
      brw_reg                                             \
      op(const brw_reg &src0, fs_inst **out = NULL) const \
      {                                                  \
         fs_inst *inst = op(vgrf(src0.type), src0);      \
         if (out) *out = inst;                           \
         return inst->dst;                               \
      }
#define ALU1(op) _ALU1(BRW_OPCODE_, op)
#define VIRT1(op) _ALU1(SHADER_OPCODE_, op)

      fs_inst *
      alu2(opcode op, const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return emit(op, dst, src0, src1);
      }
      brw_reg
      alu2(opcode op, const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         enum brw_reg_type inferred_dst_type =
            brw_type_larger_of(src0.type, src1.type);
         fs_inst *inst = alu2(op, vgrf(inferred_dst_type), src0, src1);
         if (out) *out = inst;
         return inst->dst;
      }

#define _ALU2(prefix, op)                                                    \
      fs_inst *                                                              \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const    \
      {                                                                      \
         return alu2(prefix##op, dst, src0, src1);                           \
      }                                                                      \
      brw_reg                                                                 \
      op(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const \
      {                                                                      \
         return alu2(prefix##op, src0, src1, out);                           \
      }
#define ALU2(op) _ALU2(BRW_OPCODE_, op)
#define VIRT2(op) _ALU2(SHADER_OPCODE_, op)

#define ALU2_ACC(op)                                                    \
      fs_inst *                                                     \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const \
      {                                                                 \
         fs_inst *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      fs_inst *                                                     \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,  \
         const brw_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

      VIRT1(RCP)
      VIRT1(RSQ)
      VIRT1(SQRT)
      VIRT1(EXP2)
      VIRT1(LOG2)
      VIRT2(POW)
      VIRT2(INT_QUOTIENT)
      VIRT2(INT_REMAINDER)
      VIRT1(SIN)
      VIRT1(COS)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef VIRT2
#undef _ALU2
#undef ALU1
#undef VIRT1
#undef _ALU1
      /** @} */
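      /* Each helper generated above comes in two forms (a usage sketch;
       * "a", "b" and "dst" stand for arbitrary registers):
       *
       *    bld.ADD(dst, a, b);           // write to a caller-provided dst
       *    brw_reg sum = bld.ADD(a, b);  // allocate and return a new dst
       */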

      fs_inst *
      ADD(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return alu2(BRW_OPCODE_ADD, dst, src0, src1);
      }

      brw_reg
      ADD(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         if (src1.file == IMM && src1.ud == 0 && !out)
            return src0;

         return alu2(BRW_OPCODE_ADD, src0, src1, out);
      }

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
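       *
       * For example (a sketch), CMP(null_reg_d(), x, y, BRW_CONDITIONAL_L)
       * only updates the flag register, which a following instruction can
       * then consume via set_predicate().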
       */
      fs_inst *
      CMP(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      fs_inst *
      CMPN(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      fs_inst *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      fs_inst *
      CSEL(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           const brw_reg &src2, brw_conditional_mod condition) const
      {
         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, src2.type),
                                 retype(src0, src2.type),
                                 retype(src1, src2.type),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
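       * Computes dst = x * (1 - a) + y * a, either with the hardware LRP
       * instruction or, on platforms without it, as a MUL/ADD sequence.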
       */
      fs_inst *
      LRP(const brw_reg &dst, const brw_reg &x, const brw_reg &y,
          const brw_reg &a) const
      {
         if (shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const brw_reg y_times_a = vgrf(dst.type);
            const brw_reg one_minus_a = vgrf(dst.type);
            const brw_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, brw_reg(one_minus_a));
            return ADD(dst, brw_reg(x_times_one_minus_a), brw_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
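       *
       * The first \p header_size sources are treated as a message header
       * occupying one full register each, independent of the SIMD width (a
       * note inferred from the size_written computation below).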
       */
      fs_inst *
      LOAD_PAYLOAD(const brw_reg &dst, const brw_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         fs_inst *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * brw_type_size_bytes(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }

      fs_inst *
      VEC(const brw_reg &dst, const brw_reg *src, unsigned sources) const
      {
         return sources == 1 ? MOV(dst, src[0])
                             : LOAD_PAYLOAD(dst, src, sources, 0);
      }

      fs_inst *
      SYNC(enum tgl_sync_function sync) const
      {
         return emit(BRW_OPCODE_SYNC, null_reg_ud(), brw_imm_ud(sync));
      }

      fs_inst *
      UNDEF(const brw_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

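      /**
       * Emit a DPAS (dot product accumulate systolic) instruction with
       * systolic depth \p sdepth and repeat count \p rcount.
       */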
      fs_inst *
      DPAS(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,
           unsigned sdepth, unsigned rcount) const
      {
         assert(_dispatch_width == 8 * reg_unit(shader->devinfo));
         assert(sdepth == 8);
         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

         fs_inst *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
         inst->sdepth = sdepth;
         inst->rcount = rcount;

         if (dst.type == BRW_TYPE_HF) {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE / 2;
         } else {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE;
         }

         return inst;
      }

      void
      VARYING_PULL_CONSTANT_LOAD(const brw_reg &dst,
                                 const brw_reg &surface,
                                 const brw_reg &surface_handle,
                                 const brw_reg &varying_offset,
                                 uint32_t const_offset,
                                 uint8_t alignment,
                                 unsigned components) const
      {
         assert(components <= 4);

         /* We have our constant surface use a pitch of 4 bytes, so our index can
          * be any component of a vector, and then we load 4 contiguous
          * components starting from that.  TODO: Support loading fewer than 4.
          */
         brw_reg total_offset = ADD(varying_offset, brw_imm_ud(const_offset));

         /* The pull load message will load a vec4 (16 bytes). If we are loading
          * a double this means we are only loading 2 elements worth of data.
          * We also want to use a 32-bit data type for the dst of the load operation
          * so other parts of the driver don't get confused about the size of the
          * result.
          */
         brw_reg vec4_result = vgrf(BRW_TYPE_F, 4);

         brw_reg srcs[PULL_VARYING_CONSTANT_SRCS];
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
         srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
         srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = brw_imm_ud(alignment);

         fs_inst *inst = emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                              vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
         inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

         shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
      }

      brw_reg
      LOAD_SUBGROUP_INVOCATION() const
      {
         brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
         exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
         return reg;
      }

      fs_visitor *shader;

      fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
      fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
      fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      brw_reg
      fix_unsigned_negate(const brw_reg &src) const
      {
         if (src.type == BRW_TYPE_UD &&
             src.negate) {
            brw_reg temp = vgrf(BRW_TYPE_UD);
            MOV(temp, src);
            return brw_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      brw_reg
      fix_3src_operand(const brw_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         brw_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
      } annotation;
   };
}

static inline brw_reg
offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}

#endif