/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return brw_type_size_bytes(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the specified source of an
    * instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return MAX2(brw_type_size_bytes(inst->dst.type),
                     byte_stride(inst->dst));

      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
                 brw_type_size_bytes(inst->src[i].type) < 4 &&
                 byte_stride(inst->src[i]) >= 4) {
         /* Use a stride of 32bits if possible, since that will guarantee that
          * the copy emitted to lower this region won't be affected by the
          * sub-dword integer region restrictions. This may not be possible
          * for the second source of an instruction if we're required to use
          * packed data due to Wa_16012383669.
          */
         return (i == 1 ? brw_type_size_bytes(inst->src[i].type) : 4);

      } else {
         return byte_stride(inst->src[i]);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the specified source
    * of an instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);

      } else if (has_subdword_integer_region_restriction(devinfo, inst) &&
                 brw_type_size_bytes(inst->src[i].type) < 4 &&
                 byte_stride(inst->src[i]) >= 4) {
         const unsigned dst_byte_stride =
            MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type));
         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
         const unsigned dst_byte_offset =
            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const unsigned src_byte_offset =
            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

         if (src_byte_stride > brw_type_size_bytes(inst->src[i].type)) {
            assert(src_byte_stride >= dst_byte_stride);
            /* The source is affected by the Xe2+ sub-dword integer regioning
             * restrictions. For the case of source 0 BSpec#56640 specifies a
             * number of equations relating the source and destination
             * sub-register numbers in all cases where a source stride of
             * 32bits is allowed. These equations have the form:
             *
             *   k * Dst.SubReg % m = Src.SubReg / l
             *
             * For some constants k, l and m different for each combination of
             * source and destination types and strides. The expression in
             * the return statement below computes a valid source offset by
             * inverting the equation like:
             *
             *   Src.SubReg = l * k * (Dst.SubReg % m)
             *
             * and then scaling by the element type sizes in order to get an
             * expression in terms of byte offsets instead of sub-register
             * numbers. It can be easily verified that in all cases listed on
             * the hardware spec where the source has a well-defined uniform
             * stride the product l*k is equal to the ratio between the source
             * and destination strides.
             */
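            /* Illustrative example (numbers chosen for illustration only,
             * not taken from the BSpec tables): a B-type source read at a
             * 32-bit stride (src_byte_stride == 4) feeding a packed W-type
             * destination (dst_byte_stride == 2) gives m == 64 * 2 / 4 == 32,
             * so a destination byte offset of 34 maps to a source byte
             * offset of (34 % 32) * 4 / 2 == 4.
             */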
            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
         } else {
            assert(src_byte_stride == brw_type_size_bytes(inst->src[i].type));
            /* A packed source is required, likely due to the stricter
             * requirements of the second source region. The source being
             * packed guarantees that the region of the original instruction
             * will be valid, but the copy may break the regioning
             * restrictions. Do our best to prevent that from happening by
             * making sure the offset of the temporary matches the original
             * source based on the same equation above. However, that may not
             * be sufficient if the source had a stride larger than 32 bits,
             * in which case the copy may need to be lowered recursively.
             */
            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
         }

      } else {
         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
      }
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone. We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.hstride * brw_type_size_bytes(inst->dst.type);
      } else if (brw_type_size_bytes(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * brw_type_size_bytes(inst->dst.type);
         unsigned min_size = brw_type_size_bytes(inst->dst.type);
         unsigned max_size = brw_type_size_bytes(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = brw_type_size_bytes(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
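         /* E.g. (illustrative case): a packed W-type destination fed by a
          * D-type source of stride 1 gives max_stride == 4 and min_size == 2,
          * so the destination gets a byte stride of MIN2(4, 8) == 4, i.e. an
          * element stride of 2.
          */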
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_type_is_float(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return brw_int_type(brw_type_size_bytes(t), false);

      default:
         return t;
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      /* Wa_22016140776:
       *
       *    Scalar broadcast on HF math (packed or unpacked) must not be used.
       *    Compiler must use a mov instruction to expand the scalar value to
       *    a vector before using in a HF (packed or unpacked) math operation.
       */
      if (inst->is_math() && intel_needs_workaround(devinfo, 22016140776) &&
          is_uniform(inst->src[i]) && inst->src[i].type == BRW_TYPE_HF) {
         return true;
      }

      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return (has_dst_aligned_region_restriction(devinfo, inst) &&
              !is_uniform(inst->src[i]) &&
              (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
               src_byte_offset != dst_byte_offset)) ||
             (has_subdword_integer_region_restriction(devinfo, inst) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            brw_type_size_bytes(inst->dst.type) < brw_type_size_bytes(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported. The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /**
    * Return whether the instruction has an unsupported type conversion
    * that must be handled by expanding the source operand.
    */
   bool
   has_invalid_src_conversion(const intel_device_info *devinfo,
                              const fs_inst *inst)
   {
      /* Scalar byte to float conversion is not allowed on DG2+ */
      return devinfo->verx10 >= 125 &&
             inst->opcode == BRW_OPCODE_MOV &&
             brw_type_is_float(inst->dst.type) &&
             brw_type_size_bits(inst->src[0].type) == 8 &&
             is_uniform(inst->src[0]);
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst))) ||
             has_invalid_src_conversion(devinfo, inst);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type. Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_type_is_float(get_exec_type(inst)) ||
             MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4 ||
             brw_type_size_bytes(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const brw_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type. Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
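      /* E.g. (illustrative case): a D execution type with a W-type
       * destination of stride 2 has a destination byte stride of 4, equal to
       * the execution type size, so a packed temporary (stride 1) is used; a
       * W-type destination of stride 4 would instead get a temporary of
       * stride 8 / 4 == 2.
       */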
      const unsigned stride =
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride <= brw_type_size_bytes(type) ? 1 :
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride / brw_type_size_bytes(type);
      brw_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction. Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const intel_device_info *devinfo = v->devinfo;
      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
                              brw_type_size_bytes(inst->src[i].type);
      assert(stride > 0);
      /* Calculate the size of the temporary allocation manually instead of
       * relying on the builder, since we may have to add some amount of
       * padding mandated by the hardware for Xe2+ instructions with sub-dword
       * integer regions.
       */
      const unsigned size =
         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
                      inst->exec_size * stride *
                      brw_type_size_bytes(inst->src[i].type),
                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
      brw_reg tmp = brw_vgrf(v->alloc.allocate(size), inst->src[i].type);
      ibld.UNDEF(tmp);
      tmp = byte_offset(horiz_stride(tmp, stride),
                        required_src_byte_offset(devinfo, inst, i));

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
      brw_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++) {
         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
                                  subscript(raw_src, raw_type, j));
         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
            /* The copy isn't guaranteed to comply with all subdword integer
             * regioning restrictions in some cases. Lower it recursively.
             */
            lower_instruction(v, block, jnst);
         }
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      brw_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction. Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_type_is_float(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              brw_type_size_bytes(inst->dst.type);
      assert(stride > 0);
      brw_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself. Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* If the destination was an accumulator, after lowering it will be a
       * GRF. Clear writes_accumulator for the instruction.
       */
      if (inst->dst.is_accumulator())
         inst->writes_accumulator = false;

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / brw_type_size_bytes(raw_type);
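      /* For example (hypothetical case), a Q-type SHADER_OPCODE_SEL_EXEC on a
       * platform without 64-bit integer support gets raw_type == BRW_TYPE_UD
       * and n == 2, so the loop below emits one UD-typed copy of the
       * instruction per dword half of each operand, each followed by a MOV
       * into the corresponding half of the original destination.
       */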
      const fs_builder ibld(v, block, inst);

      brw_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

bool
brw_fs_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}