/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return brw_type_size_bytes(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the specified source of an
    * instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return MAX2(brw_type_size_bytes(inst->dst.type),
                     byte_stride(inst->dst));

      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
                 brw_type_size_bytes(inst->src[i].type) < 4 &&
                 byte_stride(inst->src[i]) >= 4) {
         /* Use a stride of 32bits if possible, since that will guarantee that
          * the copy emitted to lower this region won't be affected by the
          * sub-dword integer region restrictions.  This may not be possible
          * for the second source of an instruction if we're required to use
          * packed data due to Wa_16012383669.
          */
         return (i == 1 ? brw_type_size_bytes(inst->src[i].type) : 4);

      } else {
         return byte_stride(inst->src[i]);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the specified source
    * of an instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);

      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
                 brw_type_size_bytes(inst->src[i].type) < 4 &&
                 byte_stride(inst->src[i]) >= 4) {
         const unsigned dst_byte_stride =
            MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type));
         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
         const unsigned dst_byte_offset =
            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const unsigned src_byte_offset =
            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

         if (src_byte_stride > brw_type_size_bytes(inst->src[i].type)) {
            assert(src_byte_stride >= dst_byte_stride);
            /* The source is affected by the Xe2+ sub-dword integer regioning
             * restrictions.  For the case of source 0, BSpec#56640 specifies a
             * number of equations relating the source and destination
             * sub-register numbers in all cases where a source stride of
             * 32bits is allowed.  These equations have the form:
             *
             *   k * Dst.SubReg % m = Src.SubReg / l
             *
             * for some constants k, l and m, different for each combination of
             * source and destination types and strides.  The expression in
             * the return statement below computes a valid source offset by
             * inverting the equation like:
             *
             *   Src.SubReg = l * k * (Dst.SubReg % m)
             *
             * and then scaling by the element type sizes in order to get an
             * expression in terms of byte offsets instead of sub-register
             * numbers.  It can be easily verified that in all cases listed on
             * the hardware spec where the source has a well-defined uniform
             * stride the product l*k is equal to the ratio between the source
             * and destination strides.
             */
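            /* Illustrative example (values assumed for this sketch, not taken
             * from the PRM): with a packed word destination
             * (dst_byte_stride == 2) and a sub-dword source strided out to
             * 32 bits (src_byte_stride == 4), m is 64 * 2 / 4 == 32, so a
             * destination byte offset of 10 maps to a source byte offset of
             * (10 % 32) * 4 / 2 == 20.
             */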
            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
         } else {
            assert(src_byte_stride == brw_type_size_bytes(inst->src[i].type));
            /* A packed source is required, likely due to the stricter
             * requirements of the second source region.  The source being
             * packed guarantees that the region of the original instruction
             * will be valid, but the copy may break the regioning
             * restrictions.  Do our best to try to prevent that from
             * happening by making sure the offset of the temporary matches
             * the original source based on the same equation above.  However,
             * that may not be sufficient if the source had a stride larger
             * than 32bits, in which case lowering the copy recursively may
             * be necessary.
             */
            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
         }

      } else {
         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
      }
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.hstride * brw_type_size_bytes(inst->dst.type);
      } else if (brw_type_size_bytes(inst->dst.type) < get_exec_type_size(inst) &&
          !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * brw_type_size_bytes(inst->dst.type);
         unsigned min_size = brw_type_size_bytes(inst->dst.type);
         unsigned max_size = brw_type_size_bytes(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = brw_type_size_bytes(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
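         /* For instance (hypothetical operand mix), a dword destination fed
          * by a word source with a four-element stride gives max_stride == 8
          * and min_size == 2, so the destination temporary gets a byte
          * stride of MIN2(8, 4 * 2) == 8.
          */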
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_type_is_float(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return brw_int_type(brw_type_size_bytes(t), false);

      default:
         return t;
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      /* Wa_22016140776:
       *
       *    Scalar broadcast on HF math (packed or unpacked) must not be used.
       *    Compiler must use a mov instruction to expand the scalar value to
       *    a vector before using in a HF (packed or unpacked) math operation.
       */
      if (inst->is_math() && intel_needs_workaround(devinfo, 22016140776) &&
          is_uniform(inst->src[i]) && inst->src[i].type == BRW_TYPE_HF) {
         return true;
      }

      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return (has_dst_aligned_region_restriction(devinfo, inst) &&
              !is_uniform(inst->src[i]) &&
              (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
               src_byte_offset != dst_byte_offset)) ||
             (has_subdword_integer_region_restriction(devinfo, inst) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            brw_type_size_bytes(inst->dst.type) < brw_type_size_bytes(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /**
    * Return whether the instruction has an unsupported type conversion
    * that must be handled by expanding the source operand.
    */
   bool
   has_invalid_src_conversion(const intel_device_info *devinfo,
                              const fs_inst *inst)
   {
      /* Scalar byte to float conversion is not allowed on DG2+ */
      return devinfo->verx10 >= 125 &&
             inst->opcode == BRW_OPCODE_MOV &&
             brw_type_is_float(inst->dst.type) &&
             brw_type_size_bits(inst->src[0].type) == 8 &&
             is_uniform(inst->src[0]);
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst))) ||
             has_invalid_src_conversion(devinfo, inst);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_type_is_float(get_exec_type(inst)) ||
             MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4 ||
             brw_type_size_bytes(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const brw_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
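      /* For example (values assumed for illustration): with a Q execution
       * type, a D destination of stride 2 already spans 8 bytes per channel,
       * so the temporary can be a packed Q register (stride 1), while a D
       * destination of stride 4 yields a Q temporary of stride 16 / 8 == 2.
       */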
      const unsigned stride =
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride <= brw_type_size_bytes(type) ? 1 :
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride / brw_type_size_bytes(type);
      brw_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const intel_device_info *devinfo = v->devinfo;
      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
                              brw_type_size_bytes(inst->src[i].type);
      assert(stride > 0);
      /* Calculate the size of the temporary allocation manually instead of
       * relying on the builder, since we may have to add some amount of
       * padding mandated by the hardware for Xe2+ instructions with sub-dword
       * integer regions.
       */
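      /* As a rough illustration (assuming a Xe2-like 64-byte GRF, i.e.
       * reg_unit() == 2): a SIMD16 UB source lowered to a 4-byte stride with
       * zero byte offset needs 16 * 4 * 1 == 64 bytes, which rounds up to
       * two 32-byte allocation units, i.e. a single physical register.
       */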
      const unsigned size =
         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
                      inst->exec_size * stride *
                      brw_type_size_bytes(inst->src[i].type),
                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
      brw_reg tmp = brw_vgrf(v->alloc.allocate(size), inst->src[i].type);
      ibld.UNDEF(tmp);
      tmp = byte_offset(horiz_stride(tmp, stride),
                        required_src_byte_offset(devinfo, inst, i));

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
      brw_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++) {
         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
                                  subscript(raw_src, raw_type, j));
         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
            /* The copy isn't guaranteed to comply with all subdword integer
             * regioning restrictions in some cases.  Lower it recursively.
             */
            lower_instruction(v, block, jnst);
         }
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      brw_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_type_is_float(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              brw_type_size_bytes(inst->dst.type);
      assert(stride > 0);
      brw_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* If the destination was an accumulator, after lowering it will be a
       * GRF. Clear writes_accumulator for the instruction.
       */
      if (inst->dst.is_accumulator())
         inst->writes_accumulator = false;

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / brw_type_size_bytes(raw_type);
      const fs_builder ibld(v, block, inst);

      brw_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

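      /* Emit n copies of the instruction, one per raw sub-component of the
       * execution type.  E.g. (illustration only) a 64-bit SEL_EXEC on a
       * platform without 64-bit integer support is rewritten as two UD
       * operations acting on the low and high dwords of each channel.
       */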
      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

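/**
 * Entry point of the regioning lowering pass.  Walk every instruction in the
 * program and legalize any destination modifiers, destination regions,
 * source modifiers, source regions and execution types that the hardware
 * cannot encode directly, by rewriting the offending operation in terms of
 * temporary registers and additional MOVs.
 */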
bool
brw_fs_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}