/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_fs.h"
#include "elk_cfg.h"
#include "elk_fs_builder.h"

using namespace elk;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const elk_fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == ELK_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

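   /* Illustrative example (not from the PRM): by the definition above,
    *
    *    mov(16)  g10<1>UB  g20<16,16,1>UB
    *
    * is a raw move and may legally write a packed byte destination, while
    *
    *    mov(16)  g10<1>UB  g20<16,16,1>UW
    *
    * performs a type conversion and has to be lowered to a legal
    * destination stride first.
    */
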
   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const elk_fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

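   /* Worked example (illustrative, not from the original source): for
    * mov(8) dst<1>:D src<2>:D with a non-uniform source, the final branch
    * above computes max_stride = MAX2(1 * 4, 2 * 4) = 8 and min_size =
    * max_size = 4, so the required destination byte stride is
    * MIN2(8, 16) = 8, i.e. a stride of two D-type channels.
    */
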
   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

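   /* Illustrative example (not from the original source): for
    * add(8) dst.4<1>UW src0.4<8,8,1>UW src1.4<8,8,1>UW all operands share
    * the same sub-register byte offset of 8, so that offset is acceptable
    * for the destination; if either source offset differed, the function
    * would require offset 0 instead.
    */
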
   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   elk_reg_type
   required_exec_type(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      const elk_reg_type t = get_exec_type(inst);
      const bool has_64bit = elk_reg_type_is_floating_point(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case ELK_SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int || devinfo->platform == INTEL_PLATFORM_CHV) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      case ELK_SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return t;

      case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->platform == INTEL_PLATFORM_CHV) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return elk_int_type(type_sz(t), false);

      case ELK_SHADER_OPCODE_BROADCAST:
      case ELK_SHADER_OPCODE_MOV_INDIRECT:
         if ((devinfo->verx10 == 70 || devinfo->platform == INTEL_PLATFORM_CHV) &&
             type_sz(inst->src[0].type) > 4)
            return elk_int_type(type_sz(t), false);
         else
            return t;

      default:
         return t;
      }
   }

   /*
    * Return the stride between channels of the specified register in
    * byte units, or ~0u if the region cannot be represented with a
    * single one-dimensional stride.
    */
   unsigned
   byte_stride(const elk_fs_reg &reg)
   {
      switch (reg.file) {
      case BAD_FILE:
      case UNIFORM:
      case IMM:
      case VGRF:
      case MRF:
      case ATTR:
         return reg.stride * type_sz(reg.type);
      case ARF:
      case FIXED_GRF:
         if (reg.is_null()) {
            return 0;
         } else {
            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
            const unsigned width = 1 << reg.width;

            if (width == 1) {
               return vstride * type_sz(reg.type);
            } else if (hstride * width == vstride) {
               return hstride * type_sz(reg.type);
            } else {
               return ~0u;
            }
         }
      default:
         unreachable("Invalid register file");
      }
   }

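   /* Illustrative examples (not from the original source): a <4,4,1>:F
    * fixed-GRF region decodes to vstride = 4, width = 4, hstride = 1;
    * since hstride * width == vstride, the region collapses to a single
    * one-dimensional stride of 1 * type_sz(F) = 4 bytes per channel.  A
    * <4,2,1>:F region (hstride * width = 2 != vstride = 4) has no such
    * representation, so byte_stride() returns ~0u for it.
    */
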
   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const elk_fs_inst *inst,
                          unsigned i)
   {
      if (is_send(inst) || inst->is_math() || inst->is_control_source(i)) {
         return false;
      }

      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of its sources has a non-zero offset, such
       * as:
       *
       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions, packing
       * components Y and W of a vector at offset 16B of a SIMD register.
       * The problem doesn't occur if the stride of the source is 0.
       */
      if (devinfo->ver == 8 &&
          inst->opcode == ELK_OPCODE_MAD &&
          inst->src[i].type == ELK_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const elk_fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const elk_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case ELK_SHADER_OPCODE_SHUFFLE:
         case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
         case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
         case ELK_SHADER_OPCODE_BROADCAST:
         case ELK_SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case ELK_SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const elk_fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      switch (inst->opcode) {
      case ELK_OPCODE_MOV:
         return false;
      case ELK_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const elk_fs_inst *inst)
   {
      return inst->opcode == ELK_OPCODE_SEL ||
             inst->opcode == ELK_OPCODE_CSEL ||
             inst->opcode == ELK_OPCODE_IF ||
             inst->opcode == ELK_OPCODE_WHILE;
   }

   bool
   lower_instruction(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst);
}

namespace elk {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != ELK_OPCODE_MUL ||
             elk_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const elk_fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

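/* Illustrative example (not from the original source): given an instruction
 * that cannot take source modifiers, a negated operand such as
 * -g20<8,8,1>F is lowered by emitting
 *
 *    mov(8)  tmp<1>F  -g20<8,8,1>F
 *
 * ahead of the instruction and reading the unmodified tmp in its place.
 */
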
namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const elk_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      elk_fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      elk_fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != ELK_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = ELK_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

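   /* Illustrative example (not from the original source): a saturating
    * instruction whose destination modifiers are unsupported, e.g.
    * add.sat(8) dst ..., becomes add(8) tmp ... followed by
    * mov.sat(8) dst tmp, with any conditional mod and predication carried
    * over to the trailing MOV as appropriate.
    */
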
   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      elk_fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const elk_reg_type raw_type = elk_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      elk_fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      elk_fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

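   /* Illustrative example (not from the original source): for a 64-bit
    * (e.g. DF-typed) source region, raw_type above is UD and n = 2, so the
    * region is copied into the temporary as two raw dword MOVs, one for the
    * low and one for the high half of each channel, before the original
    * instruction reads it.
    */
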
   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != ELK_OPCODE_MUL || !inst->dst.is_accumulator() ||
             elk_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      elk_fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const elk_reg_type raw_type = elk_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != ELK_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

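   /* Worked example (illustrative, not from the original source): a
    * narrowing conversion such as mov(8) dst<1>:W src:D requires a
    * destination byte stride of 4, so it is rewritten as
    *
    *    mov(8)  tmp<2>W   src<8,8,1>D
    *    mov(8)  dst<1>UW  tmp<2>UW
    *
    * where the second, raw UW copy legally packs the strided temporary
    * back into the original packed destination.
    */
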
   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const elk_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
      const fs_builder ibld(v, block, inst);

      elk_fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

      for (unsigned j = 0; j < n; j++) {
         elk_fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         elk_fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                     subscript(tmp, raw_type, j));
         if (inst->opcode != ELK_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

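   /* Illustrative example (not from the original source): on a platform
    * without native 64-bit integer support, a Q-typed
    * ELK_SHADER_OPCODE_SEL_EXEC has raw_type UD and n = 2 here, so it is
    * split into two dword-wide SELs, each writing one half of the
    * temporary, followed by raw dword copies into the original
    * destination.
    */
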
   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

bool
elk_fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
665