/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_fs.h"
#include "elk_cfg.h"
#include "elk_fs_builder.h"

using namespace elk;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const elk_fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == ELK_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const elk_fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone. We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
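         /* Narrowing conversions (other than raw byte moves) require each
          * destination element to be aligned with the corresponding
          * intermediate result, so use the size of the execution type as
          * the destination byte stride.
          */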
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required
          * to lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present
          * operands, but never exceed a stride of 4 since that would lead to
          * illegal destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo,
                            const elk_fs_inst *inst)
   {
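      /* If any non-uniform data source disagrees with the destination about
       * the sub-register offset, no single offset can satisfy all of them,
       * so require the destination to be register-aligned.
       */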
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   elk_reg_type
   required_exec_type(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      const elk_reg_type t = get_exec_type(inst);
      const bool has_64bit = elk_reg_type_is_floating_point(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case ELK_SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int || devinfo->platform == INTEL_PLATFORM_CHV) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      case ELK_SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return t;

      case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->platform == INTEL_PLATFORM_CHV) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return elk_int_type(type_sz(t), false);

      case ELK_SHADER_OPCODE_BROADCAST:
      case ELK_SHADER_OPCODE_MOV_INDIRECT:
         if ((devinfo->verx10 == 70 || devinfo->platform == INTEL_PLATFORM_CHV) &&
             type_sz(inst->src[0].type) > 4)
            return elk_int_type(type_sz(t), false);
         else
            return t;

      default:
         return t;
      }
   }

   /*
    * Return the stride between channels of the specified register in
    * byte units, or ~0u if the region cannot be represented with a
    * single one-dimensional stride.
    */
   unsigned
   byte_stride(const elk_fs_reg &reg)
   {
      switch (reg.file) {
      case BAD_FILE:
      case UNIFORM:
      case IMM:
      case VGRF:
      case MRF:
      case ATTR:
         return reg.stride * type_sz(reg.type);
      case ARF:
      case FIXED_GRF:
         if (reg.is_null()) {
            return 0;
         } else {
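            /* The hardware encodes region strides logarithmically: an
             * encoding of zero denotes a stride of zero and an encoding of
             * n denotes a stride of 2^(n-1) elements, while the width field
             * holds the base-2 logarithm of the region width.
             */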
            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
            const unsigned width = 1 << reg.width;

            if (width == 1) {
               return vstride * type_sz(reg.type);
            } else if (hstride * width == vstride) {
               return hstride * type_sz(reg.type);
            } else {
               return ~0u;
            }
         }
      default:
         unreachable("Invalid register file");
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo,
                          const elk_fs_inst *inst, unsigned i)
   {
      if (is_send(inst) || inst->is_math() || inst->is_control_source(i)) {
         return false;
      }

      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of their sources has a non-zero offset, such
       * as:
       *
       *    mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions where we
       * would pack components Y and W of a vector at offset 16B of a SIMD
       * register. The problem doesn't occur if the stride of the source is 0.
       */
      if (devinfo->ver == 8 &&
          inst->opcode == ELK_OPCODE_MAD &&
          inst->src[i].type == ELK_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const elk_fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const elk_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported. The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case ELK_SHADER_OPCODE_SHUFFLE:
         case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
         case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
         case ELK_SHADER_OPCODE_BROADCAST:
         case ELK_SHADER_OPCODE_MOV_INDIRECT:
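            /* Only the first source carries per-channel data subject to the
             * execution type; the remaining sources are index or control
             * sources with a fixed integer type, so only the destination and
             * the first source need to be bit-cast.
             */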
            return 0x1;

         case ELK_SHADER_OPCODE_SEL_EXEC:
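            /* Both sources carry per-channel data and need to be bit-cast
             * along with the destination.
             */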
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const elk_fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      switch (inst->opcode) {
      case ELK_OPCODE_MOV:
         return false;
      case ELK_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo,
                             const elk_fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const elk_fs_inst *inst)
   {
      return inst->opcode == ELK_OPCODE_SEL ||
             inst->opcode == ELK_OPCODE_CSEL ||
             inst->opcode == ELK_OPCODE_IF ||
             inst->opcode == ELK_OPCODE_WHILE;
   }

   bool
   lower_instruction(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst);
}

namespace elk {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type. Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(elk_fs_visitor *v, elk_bblock_t *block,
                       elk_fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
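      /* Guard against lowering the source modifiers of a sub-dword integer
       * MUL on hardware without DWord integer multiply support, since the
       * MOV emitted below would widen the source to the DWord execution
       * type and turn it into an unsupported DWord multiply.
       */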
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != ELK_OPCODE_MUL ||
             elk_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const elk_fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type. Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const elk_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      elk_fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      elk_fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != ELK_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = ELK_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction. Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(elk_fs_visitor *v, elk_bblock_t *block,
                    elk_fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      elk_fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const elk_reg_type raw_type = elk_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      elk_fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      elk_fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction. Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != ELK_OPCODE_MUL || !inst->dst.is_accumulator() ||
             elk_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      elk_fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const elk_reg_type raw_type = elk_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != ELK_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself. Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const elk_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
      const fs_builder ibld(v, block, inst);

      elk_fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

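      /* Emit a copy of the original instruction for each subword of its
       * execution type, with the data sources and the destination bit-cast
       * to the legal raw integer type.
       */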
      for (unsigned j = 0; j < n; j++) {
         elk_fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         elk_fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                     subscript(tmp, raw_type, j));
         if (inst->opcode != ELK_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

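      /* Lower the destination modifiers and region first, then the source
       * modifiers and regions, and finally the execution type, which
       * assumes that any invalid modifiers have already been split off.
       */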
      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

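/**
 * Legalize the regioning controls of every instruction in the program.
 *
 * As an illustration (register numbers hypothetical), a narrowing D-to-B
 * conversion with a packed byte destination such as:
 *
 *    mov(8) g2<1>B g3<8,8,1>D
 *
 * violates the packed byte destination restriction quoted at the top of
 * this file, and would be rewritten along the lines of:
 *
 *    mov(8) g4<4>B g3<8,8,1>D
 *    mov(8) g2<1>B g4<32,8,4>B
 *
 * where the conversion now uses a legal destination stride and the final
 * packed byte write is a raw move, which is allowed.
 */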
bool
elk_fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}