/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef ELK_FS_BUILDER_H
#define ELK_FS_BUILDER_H

#include "elk_ir_fs.h"
#include "elk_shader.h"
#include "elk_eu.h"
#include "elk_fs.h"

namespace elk {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * elk::vec4_builder.  They cannot be fully interchangeable because
    * elk::fs_builder generates scalar code while elk::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef elk_fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef elk_fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef elk_fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(elk_fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(elk_fs_visitor *s) : fs_builder(s, s->dispatch_width) {}
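      /* Typical construction (illustrative sketch, assuming a visitor \p v
       * with a valid dispatch_width):
       *
       *    const fs_builder bld(v);       // native SIMD width of \p v
       *    const fs_builder bld8(v, 8);   // explicit SIMD8 builder
       */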

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(elk_fs_visitor *shader, elk_bblock_t *block, elk_fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(elk_bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
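
      /* A common pattern (illustrative, names hypothetical): append a
       * sequence of instructions at the end of the program with a shared
       * annotation:
       *
       *    const fs_builder abld = bld.at_end().annotate("epilogue");
       *    abld.MOV(dst, src);
       */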

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
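
      /* For example (illustrative), splitting a SIMD16 builder into its two
       * SIMD8 halves:
       *
       *    const fs_builder lo = bld.group(8, 0);   // channels 0..7
       *    const fs_builder hi = bld.group(8, 1);   // channels 8..15
       */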

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
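
      /* exec_all() combined with group() is the usual way to emit scalar
       * bookkeeping code regardless of which channels are enabled, e.g.
       * (illustrative, operands hypothetical):
       *
       *    bld.exec_all().group(1, 0).MOV(tmp, value);
       */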

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum elk_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           unit * REG_SIZE) * unit),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
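
      /* E.g. (illustrative) a two-component float temporary, with the
       * second component addressed via the offset() helper at the end of
       * this file:
       *
       *    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
       *    bld.MOV(offset(tmp, bld, 1), src);
       */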

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_RCP:
         case ELK_SHADER_OPCODE_RSQ:
         case ELK_SHADER_OPCODE_SQRT:
         case ELK_SHADER_OPCODE_EXP2:
         case ELK_SHADER_OPCODE_LOG2:
         case ELK_SHADER_OPCODE_SIN:
         case ELK_SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_POW:
         case ELK_SHADER_OPCODE_INT_QUOTIENT:
         case ELK_SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case ELK_OPCODE_BFE:
         case ELK_OPCODE_BFI2:
         case ELK_OPCODE_MAD:
         case ELK_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }
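
      /* Emission normally goes through these overloads so the
       * opcode-specific operand fixups apply, e.g. (illustrative):
       *
       *    bld.emit(ELK_SHADER_OPCODE_SQRT, dst, src);
       *    // src is run through fix_math_operand() before emission
       */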

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, elk_conditional_mod mod) const
      {
         assert(mod == ELK_CONDITIONAL_GE || mod == ELK_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
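
      /* For instance (illustrative), computing a maximum:
       *
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_GE);   // dst = max(a, b)
       */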

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(ELK_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
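
      /* Typical use (illustrative, name hypothetical): make a possibly
       * divergent surface index uniform before feeding it to a send:
       *
       *    const src_reg surface = bld.emit_uniformize(surface_index);
       */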

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum elk_opcode opcode, elk_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == ELK_REGISTER_TYPE_Q ||
              tmp.type == ELK_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case ELK_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case ELK_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == ELK_CONDITIONAL_L || mod == ELK_CONDITIONAL_GE);
               if (mod == ELK_CONDITIONAL_GE)
                  mod = ELK_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, ELK_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, ELK_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               elk_reg_type type32 = elk_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, ELK_REGISTER_TYPE_UD),
                   retype(right_low, ELK_REGISTER_TYPE_UD), mod);
               set_predicate(ELK_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 ELK_CONDITIONAL_EQ));
               set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum elk_opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, elk_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
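
      /* Sketch of what the steps above produce (illustrative) for a SIMD8
       * inclusive ADD scan with cluster_size == 8:
       *
       *    tmp[1] += tmp[0]; tmp[3] += tmp[2]; ...   // stride-2 step
       *    tmp[2] += tmp[1]; tmp[6] += tmp[5];       // stride-4 steps
       *    tmp[3] += tmp[1]; tmp[7] += tmp[5];
       *    tmp[4..7] += tmp[3];                      // final i == 4 step
       */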

      instruction *
      emit_undef_for_dst(const instruction *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(ELK_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1

      instruction *
      F32TO16(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_HF);
         assert(src.type == ELK_REGISTER_TYPE_F);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F32TO16,
                        retype(dst, ELK_REGISTER_TYPE_W), src);
         }
      }

      instruction *
      F16TO32(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_F);
         assert(src.type == ELK_REGISTER_TYPE_HF);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F16TO32,
                        dst, retype(src, ELK_REGISTER_TYPE_W));
         }
      }
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(elk_predicate predicate) const
      {
         return set_predicate(predicate, emit(ELK_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, elk_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == ELK_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(ELK_OPCODE_CSEL,
                                 retype(dst, ELK_REGISTER_TYPE_F),
                                 retype(src0, ELK_REGISTER_TYPE_F),
                                 retype(src1, ELK_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0),
             * so we need to reorder the operands.
             */
            return emit(ELK_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), elk_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(ELK_SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * type_sz(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }
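
      /* E.g. (illustrative) gathering two components into one contiguous
       * payload with no header, as move_to_vgrf() does:
       *
       *    const src_reg comps[] = { a, b };
       *    const dst_reg payload = bld.vgrf(a.type, 2);
       *    bld.LOAD_PAYLOAD(payload, comps, 2, 0);
       */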

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      elk_fs_visitor *shader;

      elk_fs_inst *BREAK()    { return emit(ELK_OPCODE_BREAK); }
      elk_fs_inst *DO()       { return emit(ELK_OPCODE_DO); }
      elk_fs_inst *ENDIF()    { return emit(ELK_OPCODE_ENDIF); }
      elk_fs_inst *NOP()      { return emit(ELK_OPCODE_NOP); }
      elk_fs_inst *WHILE()    { return emit(ELK_OPCODE_WHILE); }
      elk_fs_inst *CONTINUE() { return emit(ELK_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * elk_fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == ELK_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(ELK_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != ELK_VERTICAL_STRIDE_8 ||
                src.width != ELK_WIDTH_8 ||
                src.hstride != ELK_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      elk_bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

static inline elk_fs_reg
offset(const elk_fs_reg &reg, const elk::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}

#endif