/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef ELK_FS_BUILDER_H
#define ELK_FS_BUILDER_H

#include "elk_ir_fs.h"
#include "elk_shader.h"
#include "elk_eu.h"
#include "elk_fs.h"

namespace elk {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * elk::vec4_builder.  They cannot be fully interchangeable because
    * elk::fs_builder generates scalar code while elk::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef elk_fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef elk_fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef elk_fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(elk_fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(elk_fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

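      /* Illustrative sketch (not part of the original header): typical use
       * is to construct a builder positioned at the end of the program and
       * emit instructions through it.  "v", "dst" and "src" below are
       * hypothetical:
       *
       *    const fs_builder bld = fs_builder(v).at_end();
       *    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F);
       *    bld.MOV(tmp, elk_imm_f(1.0f));
       *    bld.ADD(dst, tmp, src);
       *
       * Every emitted instruction inherits the builder's execution width,
       * channel group, writemask override and debug annotation.
       */
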
      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(elk_fs_visitor *shader, elk_bblock_t *block, elk_fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(elk_bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
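
      /* Illustrative sketch (not part of the original header): splitting a
       * SIMD16 builder into its two SIMD8 halves.  The second half executes
       * with channel group 8..15:
       *
       *    const fs_builder lo = bld.group(8, 0);   // channels 0..7
       *    const fs_builder hi = bld.group(8, 1);   // channels 8..15
       *    lo.MOV(dst_lo, src_lo);
       *    hi.MOV(dst_hi, src_hi);
       *
       * "dst_lo", "src_lo", etc. are hypothetical registers.  As the assert
       * above enforces, a group outside the parent builder's channel group
       * is only legal together with exec_all().
       */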

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
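
      /* Illustrative sketch (not part of the original header): exec_all()
       * is commonly chained with group() to emit a scalar computation that
       * runs regardless of the current control flow mask, e.g. for a
       * message header or address calculation.  "addr" and elk_imm_ud() are
       * assumptions here:
       *
       *    const fs_builder ubld = bld.exec_all().group(1, 0);
       *    ubld.MOV(addr, elk_imm_ud(0));
       *
       * Without exec_all(), emit() would assert because the SIMD1 execution
       * size no longer matches the dispatch width.
       */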

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum elk_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           unit * REG_SIZE) * unit),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
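
      /* Worked example (not part of the original header): assuming 32-byte
       * GRFs (REG_SIZE == 32) and reg_unit() == 1, a SIMD16 builder's
       * vgrf(ELK_REGISTER_TYPE_F) allocates
       * DIV_ROUND_UP(1 * 4 * 16, 32) = 2 registers, i.e. one logical SIMD16
       * float component spans two physical registers.
       */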

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_RCP:
         case ELK_SHADER_OPCODE_RSQ:
         case ELK_SHADER_OPCODE_SQRT:
         case ELK_SHADER_OPCODE_EXP2:
         case ELK_SHADER_OPCODE_LOG2:
         case ELK_SHADER_OPCODE_SIN:
         case ELK_SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_POW:
         case ELK_SHADER_OPCODE_INT_QUOTIENT:
         case ELK_SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case ELK_OPCODE_BFE:
         case ELK_OPCODE_BFI2:
         case ELK_OPCODE_MAD:
         case ELK_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, elk_conditional_mod mod) const
      {
         assert(mod == ELK_CONDITIONAL_GE || mod == ELK_CONDITIONAL_L);

400           * same type for both operand.
401           */
402          return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
403                                      fix_unsigned_negate(src1)));
404       }
405 
406       /**
407        * Copy any live channel from \p src to the first channel of the result.
408        */
409       src_reg
emit_uniformize(const src_reg & src)410       emit_uniformize(const src_reg &src) const
411       {
412          /* FIXME: We use a vector chan_index and dst to allow constant and
413           * copy propagration to move result all the way into the consuming
414           * instruction (typically a surface index or sampler index for a
415           * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
416           * dispatch. Once we teach const/copy propagation about scalars we
417           * should go back to scalar destinations here.
418           */
419          const fs_builder ubld = exec_all();
420          const dst_reg chan_index = vgrf(ELK_REGISTER_TYPE_UD);
421          const dst_reg dst = vgrf(src.type);
422 
423          ubld.emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
424          ubld.emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
425 
426          return src_reg(component(dst, 0));
427       }
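
      /* Illustrative sketch (not part of the original header): a typical
       * consumer is a divergent surface index that a send message needs in
       * scalar (uniform) form.  "index" is a hypothetical per-channel
       * register; the result reads the same value in every channel:
       *
       *    const src_reg surface = bld.emit_uniformize(index);
       */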

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum elk_opcode opcode, elk_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == ELK_REGISTER_TYPE_Q ||
              tmp.type == ELK_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case ELK_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case ELK_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == ELK_CONDITIONAL_L || mod == ELK_CONDITIONAL_GE);
               if (mod == ELK_CONDITIONAL_GE)
                  mod = ELK_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, ELK_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, ELK_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               elk_reg_type type32 = elk_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, ELK_REGISTER_TYPE_UD),
                                  retype(right_low, ELK_REGISTER_TYPE_UD), mod);
               set_predicate(ELK_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 ELK_CONDITIONAL_EQ));
               set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum elk_opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, elk_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
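
      /* Worked example (not part of the original header): for a SIMD8
       * inclusive scan with cluster_size == 8 and 32-bit data, the steps
       * above combine channels as follows (written as ch[i] += ch[j] for an
       * ADD scan):
       *
       *    step 1 (pairs):     ch[1] += ch[0];  ch[3] += ch[2];
       *                        ch[5] += ch[4];  ch[7] += ch[6];
       *    step 2 (quads):     ch[2] += ch[1];  ch[3] += ch[1];
       *                        ch[6] += ch[5];  ch[7] += ch[5];
       *    step 3 (i == 4):    ch[4..7] += ch[3];
       *
       * leaving each channel with the combination of itself and all lower
       * channels in its cluster.
       */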

      instruction *
      emit_undef_for_dst(const instruction *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(ELK_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(ELK_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1

      instruction *
      F32TO16(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_HF);
         assert(src.type == ELK_REGISTER_TYPE_F);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F32TO16,
                        retype(dst, ELK_REGISTER_TYPE_W), src);
         }
      }

      instruction *
      F16TO32(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_F);
         assert(src.type == ELK_REGISTER_TYPE_HF);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F16TO32,
                        dst, retype(src, ELK_REGISTER_TYPE_W));
         }
      }
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
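
      /* Illustrative sketch (not part of the original header): CMP against
       * a null register is the usual way to set the flag register for a
       * subsequent predicated instruction.  "a", "b" and "dst" are
       * hypothetical float registers:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, ELK_CONDITIONAL_GE);
       *    set_predicate(ELK_PREDICATE_NORMAL, bld.MOV(dst, a));
       */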

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(elk_predicate predicate) const
      {
         return set_predicate(predicate, emit(ELK_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, elk_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == ELK_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(ELK_OPCODE_CSEL,
                                 retype(dst, ELK_REGISTER_TYPE_F),
                                 retype(src0, ELK_REGISTER_TYPE_F),
                                 retype(src1, ELK_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(ELK_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), elk_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
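
      /* Worked equation (not part of the original header): both paths above
       * compute dst = x * (1 - a) + y * a, i.e. dst == x when a == 0.0 and
       * dst == y when a == 1.0.  On ver >= 6 the hardware LRP evaluates
       * op1 * op0 + op2 * (1 - op0), so passing (a, y, x) yields
       * y * a + x * (1 - a), the same result as the ver < 6 expansion.
       */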

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(ELK_SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * type_sz(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      elk_fs_visitor *shader;

      elk_fs_inst *BREAK()    { return emit(ELK_OPCODE_BREAK); }
      elk_fs_inst *DO()       { return emit(ELK_OPCODE_DO); }
      elk_fs_inst *ENDIF()    { return emit(ELK_OPCODE_ENDIF); }
      elk_fs_inst *NOP()      { return emit(ELK_OPCODE_NOP); }
      elk_fs_inst *WHILE()    { return emit(ELK_OPCODE_WHILE); }
      elk_fs_inst *CONTINUE() { return emit(ELK_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * elk_fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == ELK_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(ELK_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != ELK_VERTICAL_STRIDE_8 ||
                src.width != ELK_WIDTH_8 ||
                src.hstride != ELK_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      elk_bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

static inline elk_fs_reg
offset(const elk_fs_reg &reg, const elk::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}
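
/* Illustrative sketch (not part of the original header): this overload lets
 * callers step through the logical components of a register without spelling
 * out the SIMD width, e.g.
 *
 *    const elk_fs_reg second = offset(reg, bld, 1);
 *
 * which with a SIMD16 builder is equivalent to offset(reg, 16, 1).
 */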

#endif