1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_vec4.h"
25 #include "elk_fs.h"
26 #include "elk_eu.h"
27 #include "elk_cfg.h"
28 #include "elk_nir.h"
29 #include "elk_vec4_builder.h"
30 #include "elk_vec4_vs.h"
31 #include "elk_dead_control_flow.h"
32 #include "elk_private.h"
33 #include "dev/intel_debug.h"
34 #include "util/u_math.h"
35
36 #define MAX_INSTRUCTION (1 << 30)
37
38 using namespace elk;
39
40 namespace elk {
41
42 void
43 src_reg::init()
44 {
45 memset((void*)this, 0, sizeof(*this));
46 this->file = BAD_FILE;
47 this->type = ELK_REGISTER_TYPE_UD;
48 }
49
50 src_reg::src_reg(enum elk_reg_file file, int nr, const glsl_type *type)
51 {
52 init();
53
54 this->file = file;
55 this->nr = nr;
56 if (type && (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)))
57 this->swizzle = elk_swizzle_for_size(type->vector_elements);
58 else
59 this->swizzle = ELK_SWIZZLE_XYZW;
60 if (type)
61 this->type = elk_type_for_base_type(type);
62 }
63
64 /** Generic unset register constructor. */
65 src_reg::src_reg()
66 {
67 init();
68 }
69
70 src_reg::src_reg(struct ::elk_reg reg) :
71 elk_backend_reg(reg)
72 {
73 this->offset = 0;
74 this->reladdr = NULL;
75 }
76
77 src_reg::src_reg(const dst_reg &reg) :
78 elk_backend_reg(reg)
79 {
80 this->reladdr = reg.reladdr;
81 this->swizzle = elk_swizzle_for_mask(reg.writemask);
82 }
83
84 void
85 dst_reg::init()
86 {
87 memset((void*)this, 0, sizeof(*this));
88 this->file = BAD_FILE;
89 this->type = ELK_REGISTER_TYPE_UD;
90 this->writemask = WRITEMASK_XYZW;
91 }
92
93 dst_reg::dst_reg()
94 {
95 init();
96 }
97
98 dst_reg::dst_reg(enum elk_reg_file file, int nr)
99 {
100 init();
101
102 this->file = file;
103 this->nr = nr;
104 }
105
106 dst_reg::dst_reg(enum elk_reg_file file, int nr, const glsl_type *type,
107 unsigned writemask)
108 {
109 init();
110
111 this->file = file;
112 this->nr = nr;
113 this->type = elk_type_for_base_type(type);
114 this->writemask = writemask;
115 }
116
117 dst_reg::dst_reg(enum elk_reg_file file, int nr, elk_reg_type type,
118 unsigned writemask)
119 {
120 init();
121
122 this->file = file;
123 this->nr = nr;
124 this->type = type;
125 this->writemask = writemask;
126 }
127
128 dst_reg::dst_reg(struct ::elk_reg reg) :
129 elk_backend_reg(reg)
130 {
131 this->offset = 0;
132 this->reladdr = NULL;
133 }
134
135 dst_reg::dst_reg(const src_reg &reg) :
136 elk_backend_reg(reg)
137 {
138 this->writemask = elk_mask_for_swizzle(reg.swizzle);
139 this->reladdr = reg.reladdr;
140 }
141
142 bool
143 dst_reg::equals(const dst_reg &r) const
144 {
145 return (this->elk_backend_reg::equals(r) &&
146 (reladdr == r.reladdr ||
147 (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
148 }
149
150 bool
151 vec4_instruction::is_send_from_grf() const
152 {
153 switch (opcode) {
154 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
155 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
156 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
157 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
158 case ELK_VEC4_OPCODE_URB_READ:
159 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
160 case ELK_TCS_OPCODE_RELEASE_INPUT:
161 case ELK_SHADER_OPCODE_BARRIER:
162 return true;
163 default:
164 return false;
165 }
166 }
167
168 /**
169 * Returns true if this instruction's sources and destinations cannot
170 * safely be the same register.
171 *
172 * In most cases, a register can be written over safely by the same
173 * instruction that is its last use. For a single instruction, the
174 * sources are dereferenced before writing of the destination starts
175 * (naturally).
176 *
177 * However, there are a few cases where this can be problematic:
178 *
179 * - Virtual opcodes that translate to multiple instructions in the
180 * code generator: if src == dst and one instruction writes the
181 * destination before a later instruction reads the source, then
182 * src will have been clobbered.
183 *
184 * The register allocator uses this information to set up conflicts between
185 * GRF sources and the destination.
186 */
187 bool
188 vec4_instruction::has_source_and_destination_hazard() const
189 {
190 switch (opcode) {
191 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
192 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
193 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
194 return true;
195 default:
196 /* 8-wide compressed DF operations are executed as two 4-wide operations,
197 * so we have a src/dst hazard if the first half of the instruction
198 * overwrites the source of the second half. Prevent this by marking
199 * compressed instructions as having src/dst hazards, so the register
200 * allocator assigns safe register regions for dst and srcs.
201 */
202 return size_written > REG_SIZE;
203 }
204 }
205
206 unsigned
207 vec4_instruction::size_read(unsigned arg) const
208 {
209 switch (opcode) {
210 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
211 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
212 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
213 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
214 if (arg == 0)
215 return mlen * REG_SIZE;
216 break;
217 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
218 if (arg == 1)
219 return mlen * REG_SIZE;
220 break;
221 default:
222 break;
223 }
224
225 switch (src[arg].file) {
226 case BAD_FILE:
227 return 0;
228 case IMM:
229 case UNIFORM:
230 return 4 * type_sz(src[arg].type);
231 default:
232 /* XXX - Represent actual vertical stride. */
233 return exec_size * type_sz(src[arg].type);
234 }
235 }
236
237 bool
238 vec4_instruction::can_do_source_mods(const struct intel_device_info *devinfo)
239 {
240 if (devinfo->ver == 6 && is_math())
241 return false;
242
243 if (is_send_from_grf())
244 return false;
245
246 if (!elk_backend_instruction::can_do_source_mods())
247 return false;
248
249 return true;
250 }
251
252 bool
253 vec4_instruction::can_do_cmod()
254 {
255 if (!elk_backend_instruction::can_do_cmod())
256 return false;
257
258 /* The accumulator result appears to get used for the conditional modifier
259 * generation. When negating a UD value, there is a 33rd bit generated for
260 * the sign in the accumulator value, so now you can't check, for example,
261 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
262 */
263 for (unsigned i = 0; i < 3; i++) {
264 if (src[i].file != BAD_FILE &&
265 elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
266 return false;
267 }
268
269 return true;
270 }
271
272 bool
273 vec4_instruction::can_do_writemask(const struct intel_device_info *devinfo)
274 {
275 switch (opcode) {
276 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
277 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
278 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
279 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
280 case ELK_VEC4_OPCODE_TO_DOUBLE:
281 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
282 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
283 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
284 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
285 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
286 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
287 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
288 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
289 case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
290 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
291 case ELK_VEC4_OPCODE_URB_READ:
292 case ELK_SHADER_OPCODE_MOV_INDIRECT:
293 case ELK_SHADER_OPCODE_TEX:
294 case ELK_FS_OPCODE_TXB:
295 case ELK_SHADER_OPCODE_TXD:
296 case ELK_SHADER_OPCODE_TXF:
297 case ELK_SHADER_OPCODE_TXF_LZ:
298 case ELK_SHADER_OPCODE_TXF_CMS:
299 case ELK_SHADER_OPCODE_TXF_CMS_W:
300 case ELK_SHADER_OPCODE_TXF_UMS:
301 case ELK_SHADER_OPCODE_TXF_MCS:
302 case ELK_SHADER_OPCODE_TXL:
303 case ELK_SHADER_OPCODE_TXL_LZ:
304 case ELK_SHADER_OPCODE_TXS:
305 case ELK_SHADER_OPCODE_LOD:
306 case ELK_SHADER_OPCODE_TG4:
307 case ELK_SHADER_OPCODE_TG4_OFFSET:
308 case ELK_SHADER_OPCODE_SAMPLEINFO:
309 return false;
310 default:
311 /* The MATH instruction on Gfx6 only executes in align1 mode, which does
312 * not support writemasking.
313 */
314 if (devinfo->ver == 6 && is_math())
315 return false;
316
317 return true;
318 }
319 }
320
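/**
 * Returns true if the destination and source types of this instruction could
 * be changed without altering its behavior: only raw MOVs, and predicated
 * SELs whose second source already matches the destination type, with no
 * source modifiers or saturate, qualify.
 */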
321 bool
322 vec4_instruction::can_change_types() const
323 {
324 return dst.type == src[0].type &&
325 !src[0].abs && !src[0].negate && !saturate &&
326 (opcode == ELK_OPCODE_MOV ||
327 (opcode == ELK_OPCODE_SEL &&
328 dst.type == src[1].type &&
329 predicate != ELK_PREDICATE_NONE &&
330 !src[1].abs && !src[1].negate));
331 }
332
333 /**
334 * Returns how many MRFs an opcode will write over.
335 *
336 * Note that this is not the 0 or 1 implied writes in an actual gen
337 * instruction -- the generate_* functions generate additional MOVs
338 * for setup.
339 */
340 unsigned
341 vec4_instruction::implied_mrf_writes() const
342 {
343 if (mlen == 0 || is_send_from_grf())
344 return 0;
345
346 switch (opcode) {
347 case ELK_SHADER_OPCODE_RCP:
348 case ELK_SHADER_OPCODE_RSQ:
349 case ELK_SHADER_OPCODE_SQRT:
350 case ELK_SHADER_OPCODE_EXP2:
351 case ELK_SHADER_OPCODE_LOG2:
352 case ELK_SHADER_OPCODE_SIN:
353 case ELK_SHADER_OPCODE_COS:
354 return 1;
355 case ELK_SHADER_OPCODE_INT_QUOTIENT:
356 case ELK_SHADER_OPCODE_INT_REMAINDER:
357 case ELK_SHADER_OPCODE_POW:
358 case ELK_TCS_OPCODE_THREAD_END:
359 return 2;
360 case ELK_VEC4_VS_OPCODE_URB_WRITE:
361 return 1;
362 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
363 return 2;
364 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
365 return 2;
366 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
367 return 3;
368 case ELK_VEC4_GS_OPCODE_URB_WRITE:
369 case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
370 case ELK_GS_OPCODE_THREAD_END:
371 return 0;
372 case ELK_GS_OPCODE_FF_SYNC:
373 return 1;
374 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
375 return 0;
376 case ELK_SHADER_OPCODE_TEX:
377 case ELK_SHADER_OPCODE_TXL:
378 case ELK_SHADER_OPCODE_TXD:
379 case ELK_SHADER_OPCODE_TXF:
380 case ELK_SHADER_OPCODE_TXF_CMS:
381 case ELK_SHADER_OPCODE_TXF_CMS_W:
382 case ELK_SHADER_OPCODE_TXF_MCS:
383 case ELK_SHADER_OPCODE_TXS:
384 case ELK_SHADER_OPCODE_TG4:
385 case ELK_SHADER_OPCODE_TG4_OFFSET:
386 case ELK_SHADER_OPCODE_SAMPLEINFO:
387 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
388 return header_size;
389 default:
390 unreachable("not reached");
391 }
392 }
393
394 bool
395 src_reg::equals(const src_reg &r) const
396 {
397 return (this->elk_backend_reg::equals(r) &&
398 !reladdr && !r.reladdr);
399 }
400
401 bool
402 src_reg::negative_equals(const src_reg &r) const
403 {
404 return this->elk_backend_reg::negative_equals(r) &&
405 !reladdr && !r.reladdr;
406 }
407
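/**
 * Combines sequences of unconditional MOVs of immediates with partial
 * writemasks to the same vec4 destination into a single MOV of a packed
 * vector-float (VF) immediate, when every value is representable as a VF.
 */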
408 bool
409 vec4_visitor::opt_vector_float()
410 {
411 bool progress = false;
412
413 foreach_block(block, cfg) {
414 unsigned last_reg = ~0u, last_offset = ~0u;
415 enum elk_reg_file last_reg_file = BAD_FILE;
416
417 uint8_t imm[4] = { 0 };
418 int inst_count = 0;
419 vec4_instruction *imm_inst[4];
420 unsigned writemask = 0;
421 enum elk_reg_type dest_type = ELK_REGISTER_TYPE_F;
422
423 foreach_inst_in_block_safe(vec4_instruction, inst, block) {
424 int vf = -1;
425 enum elk_reg_type need_type = ELK_REGISTER_TYPE_LAST;
426
427 /* Look for unconditional MOVs from an immediate with a partial
428 * writemask. Skip type-conversion MOVs other than integer 0,
429 * where the type doesn't matter. See if the immediate can be
430 * represented as a VF.
431 */
432 if (inst->opcode == ELK_OPCODE_MOV &&
433 inst->src[0].file == IMM &&
434 inst->predicate == ELK_PREDICATE_NONE &&
435 inst->dst.writemask != WRITEMASK_XYZW &&
436 type_sz(inst->src[0].type) < 8 &&
437 (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
438
439 vf = elk_float_to_vf(inst->src[0].d);
440 need_type = ELK_REGISTER_TYPE_D;
441
442 if (vf == -1) {
443 vf = elk_float_to_vf(inst->src[0].f);
444 need_type = ELK_REGISTER_TYPE_F;
445 }
446 } else {
447 last_reg = ~0u;
448 }
449
450 /* If this wasn't a MOV, or the destination register doesn't match,
451 * or we have to switch destination types, then this breaks our
452 * sequence. Combine anything we've accumulated so far.
453 */
454 if (last_reg != inst->dst.nr ||
455 last_offset != inst->dst.offset ||
456 last_reg_file != inst->dst.file ||
457 (vf > 0 && dest_type != need_type)) {
458
459 if (inst_count > 1) {
460 unsigned vf;
461 memcpy(&vf, imm, sizeof(vf));
462 vec4_instruction *mov = MOV(imm_inst[0]->dst, elk_imm_vf(vf));
463 mov->dst.type = dest_type;
464 mov->dst.writemask = writemask;
465 inst->insert_before(block, mov);
466
467 for (int i = 0; i < inst_count; i++) {
468 imm_inst[i]->remove(block);
469 }
470
471 progress = true;
472 }
473
474 inst_count = 0;
475 last_reg = ~0u;
476 writemask = 0;
477 dest_type = ELK_REGISTER_TYPE_F;
478
479 for (int i = 0; i < 4; i++) {
480 imm[i] = 0;
481 }
482 }
483
484 /* Record this instruction's value (if it was representable). */
485 if (vf != -1) {
486 if ((inst->dst.writemask & WRITEMASK_X) != 0)
487 imm[0] = vf;
488 if ((inst->dst.writemask & WRITEMASK_Y) != 0)
489 imm[1] = vf;
490 if ((inst->dst.writemask & WRITEMASK_Z) != 0)
491 imm[2] = vf;
492 if ((inst->dst.writemask & WRITEMASK_W) != 0)
493 imm[3] = vf;
494
495 writemask |= inst->dst.writemask;
496 imm_inst[inst_count++] = inst;
497
498 last_reg = inst->dst.nr;
499 last_offset = inst->dst.offset;
500 last_reg_file = inst->dst.file;
501 if (vf > 0)
502 dest_type = need_type;
503 }
504 }
505 }
506
507 if (progress)
508 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
509
510 return progress;
511 }
512
513 /* Replaces unused channels of a swizzle with channels that are used.
514 *
515 * For instance, this pass transforms
516 *
517 * mov vgrf4.yz, vgrf5.wxzy
518 *
519 * into
520 *
521 * mov vgrf4.yz, vgrf5.xxzx
522 *
523 * This eliminates false uses of some channels, letting dead code elimination
524 * remove the instructions that wrote them.
525 */
526 bool
527 vec4_visitor::opt_reduce_swizzle()
528 {
529 bool progress = false;
530
531 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
532 if (inst->dst.file == BAD_FILE ||
533 inst->dst.file == ARF ||
534 inst->dst.file == FIXED_GRF ||
535 inst->is_send_from_grf())
536 continue;
537
538 unsigned swizzle;
539
540 /* Determine which channels of the sources are read. */
541 switch (inst->opcode) {
542 case ELK_VEC4_OPCODE_PACK_BYTES:
543 case ELK_OPCODE_DP4:
544 case ELK_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
545 * but all four of src1.
546 */
547 swizzle = elk_swizzle_for_size(4);
548 break;
549 case ELK_OPCODE_DP3:
550 swizzle = elk_swizzle_for_size(3);
551 break;
552 case ELK_OPCODE_DP2:
553 swizzle = elk_swizzle_for_size(2);
554 break;
555
556 case ELK_VEC4_OPCODE_TO_DOUBLE:
557 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
558 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
559 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
560 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
561 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
562 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
563 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
564 swizzle = elk_swizzle_for_size(4);
565 break;
566
567 default:
568 swizzle = elk_swizzle_for_mask(inst->dst.writemask);
569 break;
570 }
571
572 /* Update sources' swizzles. */
573 for (int i = 0; i < 3; i++) {
574 if (inst->src[i].file != VGRF &&
575 inst->src[i].file != ATTR &&
576 inst->src[i].file != UNIFORM)
577 continue;
578
579 const unsigned new_swizzle =
580 elk_compose_swizzle(swizzle, inst->src[i].swizzle);
581 if (inst->src[i].swizzle != new_swizzle) {
582 inst->src[i].swizzle = new_swizzle;
583 progress = true;
584 }
585 }
586 }
587
588 if (progress)
589 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
590
591 return progress;
592 }
593
594 void
595 vec4_visitor::split_uniform_registers()
596 {
597 /* Prior to this, uniforms have been in an array sized according to
598 * the number of vector uniforms present, sparsely filled (so an
599 * aggregate results in reg indices being skipped over). Now we're
600 * going to cut those aggregates up so each .nr index is one
601 * vector. The goal is to make elimination of unused uniform
602 * components easier later.
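* For example, a source at nr 2 reading byte offset 48 of its aggregate
* becomes nr 5 with offset 0.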
603 */
604 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
605 for (int i = 0 ; i < 3; i++) {
606 if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
607 continue;
608
609 assert(!inst->src[i].reladdr);
610
611 inst->src[i].nr += inst->src[i].offset / 16;
612 inst->src[i].offset %= 16;
613 }
614 }
615 }
616
617 /**
618 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
619 *
620 * While GLSL IR also performs this optimization, we end up with it in
621 * our instruction stream for a couple of reasons. One is that we
622 * sometimes generate silly instructions, for example in array access
623 * where we'll generate "ADD offset, index, base" even if base is 0.
624 * The other is that GLSL IR's constant propagation doesn't track the
625 * components of aggregates, so some VS patterns (initialize matrix to
626 * 0, accumulate in vertex blending factors) end up breaking down to
627 * instructions involving 0.
628 */
629 bool
630 vec4_visitor::opt_algebraic()
631 {
632 bool progress = false;
633
634 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
635 switch (inst->opcode) {
636 case ELK_OPCODE_MOV:
637 if (inst->src[0].file != IMM)
638 break;
639
640 if (inst->saturate) {
641 /* Full mixed-type saturates don't happen. However, we can end up
642 * with things like:
643 *
644 * mov.sat(8) g21<1>DF -1F
645 *
646 * Other mixed-size-but-same-base-type cases may also be possible.
647 */
648 if (inst->dst.type != inst->src[0].type &&
649 inst->dst.type != ELK_REGISTER_TYPE_DF &&
650 inst->src[0].type != ELK_REGISTER_TYPE_F)
651 assert(!"unimplemented: saturate mixed types");
652
653 if (elk_saturate_immediate(inst->src[0].type,
654 &inst->src[0].as_elk_reg())) {
655 inst->saturate = false;
656 progress = true;
657 }
658 }
659 break;
660
661 case ELK_OPCODE_OR:
662 if (inst->src[1].is_zero()) {
663 inst->opcode = ELK_OPCODE_MOV;
664 inst->src[1] = src_reg();
665 progress = true;
666 }
667 break;
668
669 case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
670 if (inst->src[0].file != UNIFORM) {
671 inst->opcode = ELK_OPCODE_MOV;
672 progress = true;
673 }
674 break;
675
676 case ELK_OPCODE_ADD:
677 if (inst->src[1].is_zero()) {
678 inst->opcode = ELK_OPCODE_MOV;
679 inst->src[1] = src_reg();
680 progress = true;
681 }
682 break;
683
684 case ELK_OPCODE_MUL:
685 if (inst->src[1].file != IMM)
686 continue;
687
688 if (elk_reg_type_is_floating_point(inst->src[1].type))
689 break;
690
691 if (inst->src[1].is_zero()) {
692 inst->opcode = ELK_OPCODE_MOV;
693 switch (inst->src[0].type) {
694 case ELK_REGISTER_TYPE_F:
695 inst->src[0] = elk_imm_f(0.0f);
696 break;
697 case ELK_REGISTER_TYPE_D:
698 inst->src[0] = elk_imm_d(0);
699 break;
700 case ELK_REGISTER_TYPE_UD:
701 inst->src[0] = elk_imm_ud(0u);
702 break;
703 default:
704 unreachable("not reached");
705 }
706 inst->src[1] = src_reg();
707 progress = true;
708 } else if (inst->src[1].is_one()) {
709 inst->opcode = ELK_OPCODE_MOV;
710 inst->src[1] = src_reg();
711 progress = true;
712 } else if (inst->src[1].is_negative_one()) {
713 inst->opcode = ELK_OPCODE_MOV;
714 inst->src[0].negate = !inst->src[0].negate;
715 inst->src[1] = src_reg();
716 progress = true;
717 }
718 break;
719 case ELK_SHADER_OPCODE_BROADCAST:
720 if (is_uniform(inst->src[0]) ||
721 inst->src[1].is_zero()) {
722 inst->opcode = ELK_OPCODE_MOV;
723 inst->src[1] = src_reg();
724 inst->force_writemask_all = true;
725 progress = true;
726 }
727 break;
728
729 default:
730 break;
731 }
732 }
733
734 if (progress)
735 invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
736 DEPENDENCY_INSTRUCTION_DETAIL);
737
738 return progress;
739 }
740
741 /* Conditions for which we want to avoid setting the dependency control bits */
742 bool
743 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
744 {
745 #define IS_DWORD(reg) \
746 (reg.type == ELK_REGISTER_TYPE_UD || \
747 reg.type == ELK_REGISTER_TYPE_D)
748
749 #define IS_64BIT(reg) (reg.file != BAD_FILE && type_sz(reg.type) == 8)
750
751 if (devinfo->ver >= 7) {
752 if (IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) ||
753 IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2]))
754 return true;
755 }
756
757 #undef IS_64BIT
758 #undef IS_DWORD
759
760 /*
761 * mlen:
762 * In the presence of send messages, totally interrupt dependency
763 * control. They're long enough that the chance of dependency
764 * control around them just doesn't matter.
765 *
766 * predicate:
767 * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
768 * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
769 * completes the scoreboard clear must have a non-zero execution mask. This
770 * means, if any kind of predication can change the execution mask or channel
771 * enable of the last instruction, the optimization must be avoided. This is
772 * to avoid instructions being shot down the pipeline when no writes are
773 * required.
774 *
775 * math:
776 * Dependency control does not work well over math instructions.
777 * NB: Discovered empirically
778 */
779 return (inst->mlen || inst->predicate || inst->is_math());
780 }
781
782 /**
783 * Sets the dependency control fields on instructions after register
784 * allocation and before the generator is run.
785 *
786 * When you have a sequence of instructions like:
787 *
788 * DP4 temp.x vertex uniform[0]
789 * DP4 temp.y vertex uniform[0]
790 * DP4 temp.z vertex uniform[0]
791 * DP4 temp.w vertex uniform[0]
792 *
793 * The hardware doesn't know that it can actually run the later instructions
794 * while the previous ones are in flight, producing stalls. However, we have
795 * manual fields we can set in the instructions that let it do so.
796 */
797 void
798 vec4_visitor::opt_set_dependency_control()
799 {
800 vec4_instruction *last_grf_write[ELK_MAX_GRF];
801 uint8_t grf_channels_written[ELK_MAX_GRF];
802 vec4_instruction *last_mrf_write[ELK_MAX_GRF];
803 uint8_t mrf_channels_written[ELK_MAX_GRF];
804
805 assert(prog_data->total_grf ||
806 !"Must be called after register allocation");
807
808 foreach_block (block, cfg) {
809 memset(last_grf_write, 0, sizeof(last_grf_write));
810 memset(last_mrf_write, 0, sizeof(last_mrf_write));
811
812 foreach_inst_in_block (vec4_instruction, inst, block) {
813 /* If we read from a register that we were doing dependency control
814 * on, don't do dependency control across the read.
815 */
816 for (int i = 0; i < 3; i++) {
817 int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
818 if (inst->src[i].file == VGRF) {
819 last_grf_write[reg] = NULL;
820 } else if (inst->src[i].file == FIXED_GRF) {
821 memset(last_grf_write, 0, sizeof(last_grf_write));
822 break;
823 }
824 assert(inst->src[i].file != MRF);
825 }
826
827 if (is_dep_ctrl_unsafe(inst)) {
828 memset(last_grf_write, 0, sizeof(last_grf_write));
829 memset(last_mrf_write, 0, sizeof(last_mrf_write));
830 continue;
831 }
832
833 /* Now, see if we can do dependency control for this instruction
834 * against a previous one writing to its destination.
835 */
836 int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
837 if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
838 if (last_grf_write[reg] &&
839 last_grf_write[reg]->dst.offset == inst->dst.offset &&
840 !(inst->dst.writemask & grf_channels_written[reg])) {
841 last_grf_write[reg]->no_dd_clear = true;
842 inst->no_dd_check = true;
843 } else {
844 grf_channels_written[reg] = 0;
845 }
846
847 last_grf_write[reg] = inst;
848 grf_channels_written[reg] |= inst->dst.writemask;
849 } else if (inst->dst.file == MRF) {
850 if (last_mrf_write[reg] &&
851 last_mrf_write[reg]->dst.offset == inst->dst.offset &&
852 !(inst->dst.writemask & mrf_channels_written[reg])) {
853 last_mrf_write[reg]->no_dd_clear = true;
854 inst->no_dd_check = true;
855 } else {
856 mrf_channels_written[reg] = 0;
857 }
858
859 last_mrf_write[reg] = inst;
860 mrf_channels_written[reg] |= inst->dst.writemask;
861 }
862 }
863 }
864 }
865
866 bool
867 vec4_instruction::can_reswizzle(const struct intel_device_info *devinfo,
868 int dst_writemask,
869 int swizzle,
870 int swizzle_mask)
871 {
872 /* Gfx6 MATH instructions can not execute in align16 mode, so swizzles
873 * are not allowed.
874 */
875 if (devinfo->ver == 6 && is_math() && swizzle != ELK_SWIZZLE_XYZW)
876 return false;
877
878 /* If we write to the flag register changing the swizzle would change
879 * what channels are written to the flag register.
880 */
881 if (writes_flag(devinfo))
882 return false;
883
884 /* We can't swizzle implicit accumulator access. We'd have to
885 * reswizzle the producer of the accumulator value in addition
886 * to the consumer (i.e. both MUL and MACH). Just skip this.
887 */
888 if (reads_accumulator_implicitly())
889 return false;
890
891 if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW)
892 return false;
893
894 /* If this instruction sets anything not referenced by swizzle, then we'd
895 * totally break it when we reswizzle.
896 */
897 if (dst.writemask & ~swizzle_mask)
898 return false;
899
900 if (mlen > 0)
901 return false;
902
903 for (int i = 0; i < 3; i++) {
904 if (src[i].is_accumulator())
905 return false;
906 }
907
908 return true;
909 }
910
911 /**
912 * For any channels in the swizzle's source that were populated by this
913 * instruction, rewrite the instruction to put the appropriate result directly
914 * in those channels.
915 *
916 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
917 */
918 void
919 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
920 {
921 /* Destination write mask doesn't correspond to source swizzle for the dot
922 * product and pack_bytes instructions.
923 */
924 if (opcode != ELK_OPCODE_DP4 && opcode != ELK_OPCODE_DPH &&
925 opcode != ELK_OPCODE_DP3 && opcode != ELK_OPCODE_DP2 &&
926 opcode != ELK_VEC4_OPCODE_PACK_BYTES) {
927 for (int i = 0; i < 3; i++) {
928 if (src[i].file == BAD_FILE)
929 continue;
930
931 if (src[i].file == IMM) {
932 assert(src[i].type != ELK_REGISTER_TYPE_V &&
933 src[i].type != ELK_REGISTER_TYPE_UV);
934
935 /* Vector immediate types need to be reswizzled. */
936 if (src[i].type == ELK_REGISTER_TYPE_VF) {
937 const unsigned imm[] = {
938 (src[i].ud >> 0) & 0x0ff,
939 (src[i].ud >> 8) & 0x0ff,
940 (src[i].ud >> 16) & 0x0ff,
941 (src[i].ud >> 24) & 0x0ff,
942 };
943
944 src[i] = elk_imm_vf4(imm[ELK_GET_SWZ(swizzle, 0)],
945 imm[ELK_GET_SWZ(swizzle, 1)],
946 imm[ELK_GET_SWZ(swizzle, 2)],
947 imm[ELK_GET_SWZ(swizzle, 3)]);
948 }
949
950 continue;
951 }
952
953 src[i].swizzle = elk_compose_swizzle(swizzle, src[i].swizzle);
954 }
955 }
956
957 /* Apply the specified swizzle and writemask to the original mask of
958 * written components.
959 */
960 dst.writemask = dst_writemask &
961 elk_apply_swizzle_to_mask(swizzle, dst.writemask);
962 }
963
964 /*
965 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
966 * just written and then MOVed into another reg and making the original write
967 * of the GRF write directly to the final destination instead.
968 */
969 bool
970 vec4_visitor::opt_register_coalesce()
971 {
972 bool progress = false;
973 int next_ip = 0;
974 const vec4_live_variables &live = live_analysis.require();
975
976 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
977 int ip = next_ip;
978 next_ip++;
979
980 if (inst->opcode != ELK_OPCODE_MOV ||
981 (inst->dst.file != VGRF && inst->dst.file != MRF) ||
982 inst->predicate ||
983 inst->src[0].file != VGRF ||
984 inst->dst.type != inst->src[0].type ||
985 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
986 continue;
987
988 /* Remove no-op MOVs */
989 if (inst->dst.file == inst->src[0].file &&
990 inst->dst.nr == inst->src[0].nr &&
991 inst->dst.offset == inst->src[0].offset) {
992 bool is_nop_mov = true;
993
994 for (unsigned c = 0; c < 4; c++) {
995 if ((inst->dst.writemask & (1 << c)) == 0)
996 continue;
997
998 if (ELK_GET_SWZ(inst->src[0].swizzle, c) != c) {
999 is_nop_mov = false;
1000 break;
1001 }
1002 }
1003
1004 if (is_nop_mov) {
1005 inst->remove(block);
1006 progress = true;
1007 continue;
1008 }
1009 }
1010
1011 bool to_mrf = (inst->dst.file == MRF);
1012
1013 /* Can't coalesce this GRF if someone else was going to
1014 * read it later.
1015 */
1016 if (live.var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
1017 continue;
1018
1019 /* We need to check interference with the final destination between this
1020 * instruction and the earliest instruction involved in writing the GRF
1021 * we're eliminating. To do that, keep track of which of our source
1022 * channels we've seen initialized.
1023 */
1024 const unsigned chans_needed =
1025 elk_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
1026 inst->dst.writemask);
1027 unsigned chans_remaining = chans_needed;
1028
1029 /* Now walk up the instruction stream trying to see if we can rewrite
1030 * everything writing to the temporary to write into the destination
1031 * instead.
1032 */
1033 vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
1034 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
1035 inst) {
1036 _scan_inst = scan_inst;
1037
1038 if (regions_overlap(inst->src[0], inst->size_read(0),
1039 scan_inst->dst, scan_inst->size_written)) {
1040 /* Found something writing to the reg we want to coalesce away. */
1041 if (to_mrf) {
1042 /* SEND instructions can't have MRF as a destination. */
1043 if (scan_inst->mlen)
1044 break;
1045
1046 if (devinfo->ver == 6) {
1047 /* gfx6 math instructions must have the destination be
1048 * VGRF, so no compute-to-MRF for them.
1049 */
1050 if (scan_inst->is_math()) {
1051 break;
1052 }
1053 }
1054 }
1055
1056 /* ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2 generates a bunch of mov(1)
1057 * instructions, and this optimization pass is not capable of
1058 * handling that. Bail on these instructions and hope that some
1059 * later optimization pass can do the right thing after they are
1060 * expanded.
1061 */
1062 if (scan_inst->opcode == ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
1063 break;
1064
1065 /* This doesn't handle saturation on the instruction we
1066 * want to coalesce away if the register types do not match.
1067 * But if scan_inst is a non type-converting 'mov', we can fix
1068 * the types later.
1069 */
1070 if (inst->saturate &&
1071 inst->dst.type != scan_inst->dst.type &&
1072 !(scan_inst->opcode == ELK_OPCODE_MOV &&
1073 scan_inst->dst.type == scan_inst->src[0].type))
1074 break;
1075
1076 /* Only allow coalescing between registers of the same type size.
1077 * Otherwise we would need to make the pass aware of the fact that
1078 * channel sizes are different for single and double precision.
1079 */
1080 if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
1081 break;
1082
1083 /* Check that scan_inst writes the same amount of data as the
1084 * instruction, otherwise coalescing would lead to writing a
1085 * different (larger or smaller) region of the destination
1086 */
1087 if (scan_inst->size_written != inst->size_written)
1088 break;
1089
1090 /* If we can't handle the swizzle, bail. */
1091 if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
1092 inst->src[0].swizzle,
1093 chans_needed)) {
1094 break;
1095 }
1096
1097 /* This only handles coalescing writes of 8 channels (1 register
1098 * for single-precision and 2 registers for double-precision)
1099 * starting at the source offset of the copy instruction.
1100 */
1101 if (DIV_ROUND_UP(scan_inst->size_written,
1102 type_sz(scan_inst->dst.type)) > 8 ||
1103 scan_inst->dst.offset != inst->src[0].offset)
1104 break;
1105
1106 /* Mark which channels we found unconditional writes for. */
1107 if (!scan_inst->predicate)
1108 chans_remaining &= ~scan_inst->dst.writemask;
1109
1110 if (chans_remaining == 0)
1111 break;
1112 }
1113
1114 /* You can't read from an MRF, so if someone else reads our MRF's
1115 * source GRF that we wanted to rewrite, that stops us. If it's a
1116 * GRF we're trying to coalesce to, we don't actually handle
1117 * rewriting sources so bail in that case as well.
1118 */
1119 bool interfered = false;
1120 for (int i = 0; i < 3; i++) {
1121 if (regions_overlap(inst->src[0], inst->size_read(0),
1122 scan_inst->src[i], scan_inst->size_read(i)))
1123 interfered = true;
1124 }
1125 if (interfered)
1126 break;
1127
1128 /* If somebody else writes the same channels of our destination here,
1129 * we can't coalesce before that.
1130 */
1131 if (regions_overlap(inst->dst, inst->size_written,
1132 scan_inst->dst, scan_inst->size_written) &&
1133 (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
1134 break;
1135 }
1136
1137 /* Check for reads of the register we're trying to coalesce into. We
1138 * can't go rewriting instructions above that to put some other value
1139 * in the register instead.
1140 */
1141 if (to_mrf && scan_inst->mlen > 0) {
1142 unsigned start = scan_inst->base_mrf;
1143 unsigned end = scan_inst->base_mrf + scan_inst->mlen;
1144
1145 if (inst->dst.nr >= start && inst->dst.nr < end) {
1146 break;
1147 }
1148 } else {
1149 for (int i = 0; i < 3; i++) {
1150 if (regions_overlap(inst->dst, inst->size_written,
1151 scan_inst->src[i], scan_inst->size_read(i)))
1152 interfered = true;
1153 }
1154 if (interfered)
1155 break;
1156 }
1157 }
1158
1159 if (chans_remaining == 0) {
1160 /* If we've made it here, we have an MOV we want to coalesce out, and
1161 * a scan_inst pointing to the earliest instruction involved in
1162 * computing the value. Now go rewrite the instruction stream
1163 * between the two.
1164 */
1165 vec4_instruction *scan_inst = _scan_inst;
1166 while (scan_inst != inst) {
1167 if (scan_inst->dst.file == VGRF &&
1168 scan_inst->dst.nr == inst->src[0].nr &&
1169 scan_inst->dst.offset == inst->src[0].offset) {
1170 scan_inst->reswizzle(inst->dst.writemask,
1171 inst->src[0].swizzle);
1172 scan_inst->dst.file = inst->dst.file;
1173 scan_inst->dst.nr = inst->dst.nr;
1174 scan_inst->dst.offset = inst->dst.offset;
1175 if (inst->saturate &&
1176 inst->dst.type != scan_inst->dst.type) {
1177 /* If we have reached this point, scan_inst is a non
1178 * type-converting 'mov' and we can modify its register types
1179 * to match the ones in inst. Otherwise, we could have an
1180 * incorrect saturation result.
1181 */
1182 scan_inst->dst.type = inst->dst.type;
1183 scan_inst->src[0].type = inst->src[0].type;
1184 }
1185 scan_inst->saturate |= inst->saturate;
1186 }
1187 scan_inst = (vec4_instruction *)scan_inst->next;
1188 }
1189 inst->remove(block);
1190 progress = true;
1191 }
1192 }
1193
1194 if (progress)
1195 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1196
1197 return progress;
1198 }
1199
1200 /**
1201 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1202 * flow. We could probably do better here with some form of divergence
1203 * analysis.
1204 */
1205 bool
1206 vec4_visitor::eliminate_find_live_channel()
1207 {
1208 bool progress = false;
1209 unsigned depth = 0;
1210
1211 if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
1212 /* The optimization below assumes that channel zero is live on thread
1213 * dispatch, which may not be the case if the fixed function dispatches
1214 * threads sparsely.
1215 */
1216 return false;
1217 }
1218
1219 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1220 switch (inst->opcode) {
1221 case ELK_OPCODE_IF:
1222 case ELK_OPCODE_DO:
1223 depth++;
1224 break;
1225
1226 case ELK_OPCODE_ENDIF:
1227 case ELK_OPCODE_WHILE:
1228 depth--;
1229 break;
1230
1231 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
1232 if (depth == 0) {
1233 inst->opcode = ELK_OPCODE_MOV;
1234 inst->src[0] = elk_imm_d(0);
1235 inst->force_writemask_all = true;
1236 progress = true;
1237 }
1238 break;
1239
1240 default:
1241 break;
1242 }
1243 }
1244
1245 if (progress)
1246 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
1247
1248 return progress;
1249 }
1250
1251 /**
1252 * Splits virtual GRFs requesting more than one contiguous physical register.
1253 *
1254 * We initially create large virtual GRFs for temporary structures, arrays,
1255 * and matrices, so that the visitor functions can add offsets to work their
1256 * way down to the actual member being accessed. But when it comes to
1257 * optimization, we'd like to treat each register as individual storage if
1258 * possible.
1259 *
1260 * So far, the only thing that might prevent splitting is a send message from
1261 * a GRF on IVB.
1262 */
1263 void
1264 vec4_visitor::split_virtual_grfs()
1265 {
1266 int num_vars = this->alloc.count;
1267 int new_virtual_grf[num_vars];
1268 bool split_grf[num_vars];
1269
1270 memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1271
1272 /* Try to split anything > 0 sized. */
1273 for (int i = 0; i < num_vars; i++) {
1274 split_grf[i] = this->alloc.sizes[i] != 1;
1275 }
1276
1277 /* Check that the instructions are compatible with the registers we're trying
1278 * to split.
1279 */
1280 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1281 if (inst->dst.file == VGRF && regs_written(inst) > 1)
1282 split_grf[inst->dst.nr] = false;
1283
1284 for (int i = 0; i < 3; i++) {
1285 if (inst->src[i].file == VGRF && regs_read(inst, i) > 1)
1286 split_grf[inst->src[i].nr] = false;
1287 }
1288 }
1289
1290 /* Allocate new space for split regs. Note that the virtual
1291 * numbers will be contiguous.
1292 */
1293 for (int i = 0; i < num_vars; i++) {
1294 if (!split_grf[i])
1295 continue;
1296
1297 new_virtual_grf[i] = alloc.allocate(1);
1298 for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1299 unsigned reg = alloc.allocate(1);
1300 assert(reg == new_virtual_grf[i] + j - 1);
1301 (void) reg;
1302 }
1303 this->alloc.sizes[i] = 1;
1304 }
1305
1306 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1307 if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1308 inst->dst.offset / REG_SIZE != 0) {
1309 inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1310 inst->dst.offset / REG_SIZE - 1);
1311 inst->dst.offset %= REG_SIZE;
1312 }
1313 for (int i = 0; i < 3; i++) {
1314 if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1315 inst->src[i].offset / REG_SIZE != 0) {
1316 inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1317 inst->src[i].offset / REG_SIZE - 1);
1318 inst->src[i].offset %= REG_SIZE;
1319 }
1320 }
1321 }
1322 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1323 }
1324
1325 void
1326 vec4_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
1327 {
1328 const vec4_instruction *inst = (const vec4_instruction *)be_inst;
1329
1330 if (inst->predicate) {
1331 fprintf(file, "(%cf%d.%d%s) ",
1332 inst->predicate_inverse ? '-' : '+',
1333 inst->flag_subreg / 2,
1334 inst->flag_subreg % 2,
1335 elk_pred_ctrl_align16[inst->predicate]);
1336 }
1337
1338 fprintf(file, "%s(%d)", elk_instruction_name(&compiler->isa, inst->opcode),
1339 inst->exec_size);
1340 if (inst->saturate)
1341 fprintf(file, ".sat");
1342 if (inst->conditional_mod) {
1343 fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
1344 if (!inst->predicate &&
1345 (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
1346 inst->opcode != ELK_OPCODE_CSEL &&
1347 inst->opcode != ELK_OPCODE_IF &&
1348 inst->opcode != ELK_OPCODE_WHILE))) {
1349 fprintf(file, ".f%d.%d", inst->flag_subreg / 2, inst->flag_subreg % 2);
1350 }
1351 }
1352 fprintf(file, " ");
1353
1354 switch (inst->dst.file) {
1355 case VGRF:
1356 fprintf(file, "vgrf%d", inst->dst.nr);
1357 break;
1358 case FIXED_GRF:
1359 fprintf(file, "g%d", inst->dst.nr);
1360 break;
1361 case MRF:
1362 fprintf(file, "m%d", inst->dst.nr);
1363 break;
1364 case ARF:
1365 switch (inst->dst.nr) {
1366 case ELK_ARF_NULL:
1367 fprintf(file, "null");
1368 break;
1369 case ELK_ARF_ADDRESS:
1370 fprintf(file, "a0.%d", inst->dst.subnr);
1371 break;
1372 case ELK_ARF_ACCUMULATOR:
1373 fprintf(file, "acc%d", inst->dst.subnr);
1374 break;
1375 case ELK_ARF_FLAG:
1376 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1377 break;
1378 default:
1379 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1380 break;
1381 }
1382 break;
1383 case BAD_FILE:
1384 fprintf(file, "(null)");
1385 break;
1386 case IMM:
1387 case ATTR:
1388 case UNIFORM:
1389 unreachable("not reached");
1390 }
1391 if (inst->dst.offset ||
1392 (inst->dst.file == VGRF &&
1393 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
1394 const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
1395 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
1396 inst->dst.offset % reg_size);
1397 }
1398 if (inst->dst.writemask != WRITEMASK_XYZW) {
1399 fprintf(file, ".");
1400 if (inst->dst.writemask & 1)
1401 fprintf(file, "x");
1402 if (inst->dst.writemask & 2)
1403 fprintf(file, "y");
1404 if (inst->dst.writemask & 4)
1405 fprintf(file, "z");
1406 if (inst->dst.writemask & 8)
1407 fprintf(file, "w");
1408 }
1409 fprintf(file, ":%s", elk_reg_type_to_letters(inst->dst.type));
1410
1411 if (inst->src[0].file != BAD_FILE)
1412 fprintf(file, ", ");
1413
1414 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1415 if (inst->src[i].negate)
1416 fprintf(file, "-");
1417 if (inst->src[i].abs)
1418 fprintf(file, "|");
1419 switch (inst->src[i].file) {
1420 case VGRF:
1421 fprintf(file, "vgrf%d", inst->src[i].nr);
1422 break;
1423 case FIXED_GRF:
1424 fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr);
1425 break;
1426 case ATTR:
1427 fprintf(file, "attr%d", inst->src[i].nr);
1428 break;
1429 case UNIFORM:
1430 fprintf(file, "u%d", inst->src[i].nr);
1431 break;
1432 case IMM:
1433 switch (inst->src[i].type) {
1434 case ELK_REGISTER_TYPE_F:
1435 fprintf(file, "%fF", inst->src[i].f);
1436 break;
1437 case ELK_REGISTER_TYPE_DF:
1438 fprintf(file, "%fDF", inst->src[i].df);
1439 break;
1440 case ELK_REGISTER_TYPE_D:
1441 fprintf(file, "%dD", inst->src[i].d);
1442 break;
1443 case ELK_REGISTER_TYPE_UD:
1444 fprintf(file, "%uU", inst->src[i].ud);
1445 break;
1446 case ELK_REGISTER_TYPE_VF:
1447 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1448 elk_vf_to_float((inst->src[i].ud >> 0) & 0xff),
1449 elk_vf_to_float((inst->src[i].ud >> 8) & 0xff),
1450 elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1451 elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1452 break;
1453 default:
1454 fprintf(file, "???");
1455 break;
1456 }
1457 break;
1458 case ARF:
1459 switch (inst->src[i].nr) {
1460 case ELK_ARF_NULL:
1461 fprintf(file, "null");
1462 break;
1463 case ELK_ARF_ADDRESS:
1464 fprintf(file, "a0.%d", inst->src[i].subnr);
1465 break;
1466 case ELK_ARF_ACCUMULATOR:
1467 fprintf(file, "acc%d", inst->src[i].subnr);
1468 break;
1469 case ELK_ARF_FLAG:
1470 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1471 break;
1472 default:
1473 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1474 break;
1475 }
1476 break;
1477 case BAD_FILE:
1478 fprintf(file, "(null)");
1479 break;
1480 case MRF:
1481 unreachable("not reached");
1482 }
1483
1484 if (inst->src[i].offset ||
1485 (inst->src[i].file == VGRF &&
1486 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
1487 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
1488 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
1489 inst->src[i].offset % reg_size);
1490 }
1491
1492 if (inst->src[i].file != IMM) {
1493 static const char *chans[4] = {"x", "y", "z", "w"};
1494 fprintf(file, ".");
1495 for (int c = 0; c < 4; c++) {
1496 fprintf(file, "%s", chans[ELK_GET_SWZ(inst->src[i].swizzle, c)]);
1497 }
1498 }
1499
1500 if (inst->src[i].abs)
1501 fprintf(file, "|");
1502
1503 if (inst->src[i].file != IMM) {
1504 fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
1505 }
1506
1507 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1508 fprintf(file, ", ");
1509 }
1510
1511 if (inst->force_writemask_all)
1512 fprintf(file, " NoMask");
1513
1514 if (inst->exec_size != 8)
1515 fprintf(file, " group%d", inst->group);
1516
1517 fprintf(file, "\n");
1518 }
1519
1520
1521 int
1522 vec4_vs_visitor::setup_attributes(int payload_reg)
1523 {
1524 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1525 for (int i = 0; i < 3; i++) {
1526 if (inst->src[i].file == ATTR) {
1527 assert(inst->src[i].offset % REG_SIZE == 0);
1528 int grf = payload_reg + inst->src[i].nr +
1529 inst->src[i].offset / REG_SIZE;
1530
1531 struct elk_reg reg = elk_vec8_grf(grf, 0);
1532 reg.swizzle = inst->src[i].swizzle;
1533 reg.type = inst->src[i].type;
1534 reg.abs = inst->src[i].abs;
1535 reg.negate = inst->src[i].negate;
1536 inst->src[i] = reg;
1537 }
1538 }
1539 }
1540
1541 return payload_reg + vs_prog_data->nr_attribute_slots;
1542 }
1543
1544 void
1545 vec4_visitor::setup_push_ranges()
1546 {
1547 /* Only allow 32 registers (256 uniform components) as push constants,
1548 * which is the limit on gfx6.
1549 *
1550 * If changing this value, note the limitation about total_regs in
1551 * elk_curbe.c.
1552 */
1553 const unsigned max_push_length = 32;
1554
1555 push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
1556 push_length = MIN2(push_length, max_push_length);
1557
1558 /* Shrink UBO push ranges so it all fits in max_push_length */
1559 for (unsigned i = 0; i < 4; i++) {
1560 struct elk_ubo_range *range = &prog_data->base.ubo_ranges[i];
1561
1562 if (push_length + range->length > max_push_length)
1563 range->length = max_push_length - push_length;
1564
1565 push_length += range->length;
1566 }
1567 assert(push_length <= max_push_length);
1568 }
1569
1570 int
1571 vec4_visitor::setup_uniforms(int reg)
1572 {
1573 /* It's possible that uniform compaction will shrink further than expected
1574 * so we re-compute the layout and set up our UBO push starts.
1575 */
1576 ASSERTED const unsigned old_push_length = push_length;
1577 push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
1578 for (unsigned i = 0; i < 4; i++) {
1579 ubo_push_start[i] = push_length;
1580 push_length += stage_prog_data->ubo_ranges[i].length;
1581 }
1582 assert(push_length == old_push_length);
1583
1584 /* The pre-gfx6 VS requires that some push constants get loaded no
1585 * matter what, or the GPU would hang.
1586 */
1587 if (devinfo->ver < 6 && push_length == 0) {
1588 elk_stage_prog_data_add_params(stage_prog_data, 4);
1589 for (unsigned int i = 0; i < 4; i++) {
1590 unsigned int slot = this->uniforms * 4 + i;
1591 stage_prog_data->param[slot] = ELK_PARAM_BUILTIN_ZERO;
1592 }
1593 push_length = 1;
1594 }
1595
1596 prog_data->base.dispatch_grf_start_reg = reg;
1597 prog_data->base.curb_read_length = push_length;
1598
1599 return reg + push_length;
1600 }
1601
1602 void
1603 vec4_vs_visitor::setup_payload(void)
1604 {
1605 int reg = 0;
1606
1607 /* The payload always contains important data in g0, which contains
1608 * the URB handles that are passed on to the URB write at the end
1609 * of the thread. So, we always start push constants at g1.
1610 */
1611 reg++;
1612
1613 reg = setup_uniforms(reg);
1614
1615 reg = setup_attributes(reg);
1616
1617 this->first_non_payload_grf = reg;
1618 }
1619
1620 bool
1621 vec4_visitor::lower_minmax()
1622 {
1623 assert(devinfo->ver < 6);
1624
1625 bool progress = false;
1626
1627 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1628 const vec4_builder ibld(this, block, inst);
1629
1630 if (inst->opcode == ELK_OPCODE_SEL &&
1631 inst->predicate == ELK_PREDICATE_NONE) {
1632 /* If src1 is an immediate value that is not NaN, then it can't be
1633 * NaN. In that case, emit CMP because it is much better for cmod
1634 * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
1635 * support HF or DF, so it is not necessary to check for those.
1636 */
1637 if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
1638 (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
1639 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
1640 inst->conditional_mod);
1641 } else {
1642 ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
1643 inst->conditional_mod);
1644 }
1645 inst->predicate = ELK_PREDICATE_NORMAL;
1646 inst->conditional_mod = ELK_CONDITIONAL_NONE;
1647
1648 progress = true;
1649 }
1650 }
1651
1652 if (progress)
1653 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1654
1655 return progress;
1656 }
1657
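/* Reads the TIMESTAMP architecture register into a fresh uvec4 temporary,
 * using force_writemask_all so the read happens even in channels that are
 * not enabled by the dispatch.
 */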
1658 src_reg
1659 vec4_visitor::get_timestamp()
1660 {
1661 assert(devinfo->ver == 7);
1662
1663 src_reg ts = src_reg(elk_reg(ELK_ARCHITECTURE_REGISTER_FILE,
1664 ELK_ARF_TIMESTAMP,
1665 0,
1666 0,
1667 0,
1668 ELK_REGISTER_TYPE_UD,
1669 ELK_VERTICAL_STRIDE_0,
1670 ELK_WIDTH_4,
1671 ELK_HORIZONTAL_STRIDE_4,
1672 ELK_SWIZZLE_XYZW,
1673 WRITEMASK_XYZW));
1674
1675 dst_reg dst = dst_reg(this, glsl_uvec4_type());
1676
1677 vec4_instruction *mov = emit(MOV(dst, ts));
1678 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1679 * even if it's not enabled in the dispatch.
1680 */
1681 mov->force_writemask_all = true;
1682
1683 return src_reg(dst);
1684 }
1685
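/* Virtual opcodes that the generator implements as align1 double-precision
 * operations; they need the vertical-stride fixup in convert_to_hw_regs().
 */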
1686 static bool
1687 is_align1_df(vec4_instruction *inst)
1688 {
1689 switch (inst->opcode) {
1690 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
1691 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
1692 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
1693 case ELK_VEC4_OPCODE_TO_DOUBLE:
1694 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
1695 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
1696 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
1697 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
1698 return true;
1699 default:
1700 return false;
1701 }
1702 }
1703
1704 /**
1705 * Three source instruction must have a GRF/MRF destination register.
1706 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
1707 */
1708 void
1709 vec4_visitor::fixup_3src_null_dest()
1710 {
1711 bool progress = false;
1712
1713 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
1714 if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
1715 const unsigned size_written = type_sz(inst->dst.type);
1716 const unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
1717
1718 inst->dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
1719 inst->dst.type);
1720 progress = true;
1721 }
1722 }
1723
1724 if (progress)
1725 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
1726 DEPENDENCY_VARIABLES);
1727 }
1728
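/**
 * Translates the visitor's virtual register files (VGRF, UNIFORM, MRF, ...)
 * into the hardware elk_reg encodings expected by the generator, including
 * uniform push-constant addressing, swizzle-to-subnr conversion for scalar
 * sources of 3-src instructions, and the align1 DF vertical-stride fixup.
 */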
1729 void
1730 vec4_visitor::convert_to_hw_regs()
1731 {
1732 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1733 for (int i = 0; i < 3; i++) {
1734 class src_reg &src = inst->src[i];
1735 struct elk_reg reg;
1736 switch (src.file) {
1737 case VGRF: {
1738 reg = byte_offset(elk_vecn_grf(4, src.nr, 0), src.offset);
1739 reg.type = src.type;
1740 reg.abs = src.abs;
1741 reg.negate = src.negate;
1742 break;
1743 }
1744
1745 case UNIFORM: {
1746 if (src.nr >= UBO_START) {
1747 reg = byte_offset(elk_vec4_grf(
1748 prog_data->base.dispatch_grf_start_reg +
1749 ubo_push_start[src.nr - UBO_START] +
1750 src.offset / 32, 0),
1751 src.offset % 32);
1752 } else {
1753 reg = byte_offset(elk_vec4_grf(
1754 prog_data->base.dispatch_grf_start_reg +
1755 src.nr / 2, src.nr % 2 * 4),
1756 src.offset);
1757 }
1758 reg = stride(reg, 0, 4, 1);
1759 reg.type = src.type;
1760 reg.abs = src.abs;
1761 reg.negate = src.negate;
1762
1763 /* This should have been moved to pull constants. */
1764 assert(!src.reladdr);
1765 break;
1766 }
1767
1768 case FIXED_GRF:
1769 if (type_sz(src.type) == 8) {
1770 reg = src.as_elk_reg();
1771 break;
1772 }
1773 FALLTHROUGH;
1774 case ARF:
1775 case IMM:
1776 continue;
1777
1778 case BAD_FILE:
1779 /* Probably unused. */
1780 reg = elk_null_reg();
1781 reg = retype(reg, src.type);
1782 break;
1783
1784 case MRF:
1785 case ATTR:
1786 unreachable("not reached");
1787 }
1788
1789 apply_logical_swizzle(&reg, inst, i);
1790 src = reg;
1791
1792 /* From IVB PRM, vol4, part3, "General Restrictions on Regioning
1793 * Parameters":
1794 *
1795 * "If ExecSize = Width and HorzStride ≠ 0, VertStride must be set
1796 * to Width * HorzStride."
1797 *
1798 * We can break this rule with DF sources on DF align1
1799 * instructions, because the exec_size would be 4 and width is 4.
1800 * As we know we are not accessing to next GRF, it is safe to
1801 * set vstride to the formula given by the rule itself.
1802 */
1803 if (is_align1_df(inst) && (cvt(inst->exec_size) - 1) == src.width)
1804 src.vstride = src.width + src.hstride;
1805 }
1806
1807 if (inst->elk_is_3src(compiler)) {
1808 /* 3-src instructions with scalar sources support arbitrary subnr,
1809 * but don't actually use swizzles. Convert swizzle into subnr.
1810 * Skip this for double-precision instructions: RepCtrl=1 is not
1811 * allowed for them and needs special handling.
1812 */
1813 for (int i = 0; i < 3; i++) {
1814 if (inst->src[i].vstride == ELK_VERTICAL_STRIDE_0 &&
1815 type_sz(inst->src[i].type) < 8) {
1816 assert(elk_is_single_value_swizzle(inst->src[i].swizzle));
1817 inst->src[i].subnr += 4 * ELK_GET_SWZ(inst->src[i].swizzle, 0);
1818 }
1819 }
1820 }
1821
1822 dst_reg &dst = inst->dst;
1823 struct elk_reg reg;
1824
1825 switch (inst->dst.file) {
1826 case VGRF:
1827 reg = byte_offset(elk_vec8_grf(dst.nr, 0), dst.offset);
1828 reg.type = dst.type;
1829 reg.writemask = dst.writemask;
1830 break;
1831
1832 case MRF:
1833 reg = byte_offset(elk_message_reg(dst.nr), dst.offset);
1834 assert((reg.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
1835 reg.type = dst.type;
1836 reg.writemask = dst.writemask;
1837 break;
1838
1839 case ARF:
1840 case FIXED_GRF:
1841 reg = dst.as_elk_reg();
1842 break;
1843
1844 case BAD_FILE:
1845 reg = elk_null_reg();
1846 reg = retype(reg, dst.type);
1847 break;
1848
1849 case IMM:
1850 case ATTR:
1851 case UNIFORM:
1852 unreachable("not reached");
1853 }
1854
1855 dst = reg;
1856 }
1857 }
1858
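/* Whether the stage reads its input attributes in the interleaved layout:
 * always for TES, and for GS in any dispatch mode other than 4x2 dual-object.
 */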
1859 static bool
1860 stage_uses_interleaved_attributes(unsigned stage,
1861 enum intel_shader_dispatch_mode dispatch_mode)
1862 {
1863 switch (stage) {
1864 case MESA_SHADER_TESS_EVAL:
1865 return true;
1866 case MESA_SHADER_GEOMETRY:
1867 return dispatch_mode != INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
1868 default:
1869 return false;
1870 }
1871 }
1872
1873 /**
1874 * Get the closest native SIMD width supported by the hardware for instruction
1875 * \p inst. The instruction will be left untouched by
1876 * vec4_visitor::lower_simd_width() if the returned value matches the
1877 * instruction's original execution size.
1878 */
1879 static unsigned
1880 get_lowered_simd_width(const struct intel_device_info *devinfo,
1881 enum intel_shader_dispatch_mode dispatch_mode,
1882 unsigned stage, const vec4_instruction *inst)
1883 {
1884 /* Do not split some instructions that require special handling */
1885 switch (inst->opcode) {
1886 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1887 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1888 return inst->exec_size;
1889 default:
1890 break;
1891 }
1892
1893 unsigned lowered_width = MIN2(16, inst->exec_size);
1894
1895 /* We need to split some cases of double-precision instructions that write
1896 * 2 registers. We only need to care about this in gfx7 because that is the
1897 * only hardware that implements fp64 in Align16.
1898 */
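   /* For example, a SIMD4x2 double-precision instruction writes
    * 8 channels * 8 bytes = 64 bytes, i.e. two 32-byte GRFs, which is what
    * the size_written > REG_SIZE check below matches.
    */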
1899 if (devinfo->ver == 7 && inst->size_written > REG_SIZE) {
1900 /* Align16 8-wide double-precision SEL does not work well. Verified
1901 * empirically.
1902 */
1903 if (inst->opcode == ELK_OPCODE_SEL && type_sz(inst->dst.type) == 8)
1904 lowered_width = MIN2(lowered_width, 4);
1905
1906 /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
1907 * Register Addressing:
1908 *
1909 * "When destination spans two registers, the source MUST span two
1910 * registers."
1911 */
1912 for (unsigned i = 0; i < 3; i++) {
1913 if (inst->src[i].file == BAD_FILE)
1914 continue;
1915 if (inst->size_read(i) <= REG_SIZE)
1916 lowered_width = MIN2(lowered_width, 4);
1917
1918 /* Interleaved attribute setups use a vertical stride of 0, which
1919 * makes them hit the associated instruction decompression bug in gfx7.
1920 * Split them to prevent this.
1921 */
1922 if (inst->src[i].file == ATTR &&
1923 stage_uses_interleaved_attributes(stage, dispatch_mode))
1924 lowered_width = MIN2(lowered_width, 4);
1925 }
1926 }
1927
1928 /* IvyBridge can manage a maximum of 4 DFs per SIMD4x2 instruction, since
1929    * it doesn't support compression in Align16 mode, regardless of whether
1930    * force_writemask_all is enabled or disabled (the latter is affected by the
1931 * compressed instruction bug in gfx7, which is another reason to enforce
1932 * this limit).
1933 */
1934 if (devinfo->verx10 == 70 &&
1935 (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8))
1936 lowered_width = MIN2(lowered_width, 4);
1937
1938 return lowered_width;
1939 }
1940
1941 static bool
1942 dst_src_regions_overlap(vec4_instruction *inst)
1943 {
1944 if (inst->size_written == 0)
1945 return false;
1946
1947 unsigned dst_start = inst->dst.offset;
1948 unsigned dst_end = dst_start + inst->size_written - 1;
1949 for (int i = 0; i < 3; i++) {
1950 if (inst->src[i].file == BAD_FILE)
1951 continue;
1952
1953 if (inst->dst.file != inst->src[i].file ||
1954 inst->dst.nr != inst->src[i].nr)
1955 continue;
1956
1957 unsigned src_start = inst->src[i].offset;
1958 unsigned src_end = src_start + inst->size_read(i) - 1;
1959
1960 if ((dst_start >= src_start && dst_start <= src_end) ||
1961 (dst_end >= src_start && dst_end <= src_end) ||
1962 (dst_start <= src_start && dst_end >= src_end)) {
1963 return true;
1964 }
1965 }
1966
1967 return false;
1968 }
1969
1970 bool
1971 vec4_visitor::lower_simd_width()
1972 {
1973 bool progress = false;
1974
1975 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1976 const unsigned lowered_width =
1977 get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst);
1978 assert(lowered_width <= inst->exec_size);
1979 if (lowered_width == inst->exec_size)
1980 continue;
1981
1982 /* We need to deal with source / destination overlaps when splitting.
1983 * The hardware supports reading from and writing to the same register
1984 * in the same instruction, but we need to be careful that each split
1985 * instruction we produce does not corrupt the source of the next.
1986 *
1987 * The easiest way to handle this is to make the split instructions write
1988 * to temporaries if there is an src/dst overlap and then move from the
1989 * temporaries to the original destination. We also need to consider
1990 * instructions that do partial writes via align1 opcodes, in which case
1991       * we need to make sure that we initialize the temporary with the
1992 * value of the instruction's dst.
1993 */
1994 bool needs_temp = dst_src_regions_overlap(inst);
1995 for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
1996 unsigned channel_offset = lowered_width * n;
1997
1998 unsigned size_written = lowered_width * type_sz(inst->dst.type);
1999
2000 /* Create the split instruction from the original so that we copy all
2001 * relevant instruction fields, then set the width and calculate the
2002 * new dst/src regions.
2003 */
2004 vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
2005 linst->exec_size = lowered_width;
2006 linst->group = channel_offset;
2007 linst->size_written = size_written;
2008
2009 /* Compute split dst region */
2010 dst_reg dst;
2011 if (needs_temp) {
2012 unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
2013 dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
2014 inst->dst.type);
2015 if (inst->is_align1_partial_write()) {
2016 vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
2017 copy->exec_size = lowered_width;
2018 copy->group = channel_offset;
2019 copy->size_written = size_written;
2020 inst->insert_before(block, copy);
2021 }
2022 } else {
2023 dst = horiz_offset(inst->dst, channel_offset);
2024 }
2025 linst->dst = dst;
2026
2027 /* Compute split source regions */
2028 for (int i = 0; i < 3; i++) {
2029 if (linst->src[i].file == BAD_FILE)
2030 continue;
2031
2032 bool is_interleaved_attr =
2033 linst->src[i].file == ATTR &&
2034 stage_uses_interleaved_attributes(stage,
2035 prog_data->dispatch_mode);
2036
2037 if (!is_uniform(linst->src[i]) && !is_interleaved_attr)
2038 linst->src[i] = horiz_offset(linst->src[i], channel_offset);
2039 }
2040
2041 inst->insert_before(block, linst);
2042
2043 /* If we used a temporary to store the result of the split
2044 * instruction, copy the result to the original destination
2045 */
2046 if (needs_temp) {
2047 vec4_instruction *mov =
2048 MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
2049 mov->exec_size = lowered_width;
2050 mov->group = channel_offset;
2051 mov->size_written = size_written;
2052 mov->predicate = inst->predicate;
2053 inst->insert_before(block, mov);
2054 }
2055 }
2056
2057 inst->remove(block);
2058 progress = true;
2059 }
2060
2061 if (progress)
2062 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2063
2064 return progress;
2065 }
2066
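/* Map a NORMAL predicate onto the Align16 replicate predicate for the single
 * channel selected by \p writemask, so that each per-channel instruction
 * emitted by scalarize_df() is predicated on the flag bit of the channel it
 * writes. For example, scalarize_predicate(ELK_PREDICATE_NORMAL, WRITEMASK_Z)
 * returns ELK_PREDICATE_ALIGN16_REPLICATE_Z.
 */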
2067 static elk_predicate
2068 scalarize_predicate(elk_predicate predicate, unsigned writemask)
2069 {
2070 if (predicate != ELK_PREDICATE_NORMAL)
2071 return predicate;
2072
2073 switch (writemask) {
2074 case WRITEMASK_X:
2075 return ELK_PREDICATE_ALIGN16_REPLICATE_X;
2076 case WRITEMASK_Y:
2077 return ELK_PREDICATE_ALIGN16_REPLICATE_Y;
2078 case WRITEMASK_Z:
2079 return ELK_PREDICATE_ALIGN16_REPLICATE_Z;
2080 case WRITEMASK_W:
2081 return ELK_PREDICATE_ALIGN16_REPLICATE_W;
2082 default:
2083 unreachable("invalid writemask");
2084 }
2085 }
2086
2087 /* Gfx7 has a hardware decompression bug that we can exploit to represent
2088  * a handful of additional swizzles natively.
2089 */
2090 static bool
2091 is_gfx7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg)
2092 {
2093 switch (inst->src[arg].swizzle) {
2094 case ELK_SWIZZLE_XXXX:
2095 case ELK_SWIZZLE_YYYY:
2096 case ELK_SWIZZLE_ZZZZ:
2097 case ELK_SWIZZLE_WWWW:
2098 case ELK_SWIZZLE_XYXY:
2099 case ELK_SWIZZLE_YXYX:
2100 case ELK_SWIZZLE_ZWZW:
2101 case ELK_SWIZZLE_WZWZ:
2102 return true;
2103 default:
2104 return false;
2105 }
2106 }
2107
2108 /* 64-bit sources use regions with a width of 2. These 2 elements in each row
2109 * can be addressed using 32-bit swizzles (which is what the hardware supports)
2110 * but it also means that the swizzle we apply on the first two components of a
2111 * dvec4 is coupled with the swizzle we use for the last 2. In other words,
2112 * only some specific swizzle combinations can be natively supported.
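 *
 * For example (an illustration of the swizzle list accepted below): XXZZ is
 * representable because both 2-wide rows use the same relative pattern (the
 * row's first element replicated), whereas something like XXYY would require
 * the second row to read from the first row's dvec2, which a single per-row
 * 32-bit swizzle cannot express.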
2113 *
2114  * FIXME: we can go a step further and implement even more swizzle
2115 * variations using only partial scalarization.
2116 *
2117 * For more details see:
2118 * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
2119 */
2120 bool
2121 vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg)
2122 {
2123 const src_reg &src = inst->src[arg];
2124 assert(type_sz(src.type) == 8);
2125
2126    /* Uniform regions have a vstride of 0. Because we use 2-wide rows for
2127     * 64-bit regions, this means that we cannot access components Z/W, so
2128 * return false for any such case. Interleaved attributes will also be
2129 * mapped to GRF registers with a vstride of 0, so apply the same
2130 * treatment.
2131 */
2132 if ((is_uniform(src) ||
2133 (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) &&
2134 src.file == ATTR)) &&
2135 (elk_mask_for_swizzle(src.swizzle) & 12))
2136 return false;
2137
2138 switch (src.swizzle) {
2139 case ELK_SWIZZLE_XYZW:
2140 case ELK_SWIZZLE_XXZZ:
2141 case ELK_SWIZZLE_YYWW:
2142 case ELK_SWIZZLE_YXWZ:
2143 return true;
2144 default:
2145 return devinfo->ver == 7 && is_gfx7_supported_64bit_swizzle(inst, arg);
2146 }
2147 }
2148
2149 bool
2150 vec4_visitor::scalarize_df()
2151 {
2152 bool progress = false;
2153
2154 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2155 /* Skip DF instructions that operate in Align1 mode */
2156 if (is_align1_df(inst))
2157 continue;
2158
2159 /* Check if this is a double-precision instruction */
2160 bool is_double = type_sz(inst->dst.type) == 8;
2161 for (int arg = 0; !is_double && arg < 3; arg++) {
2162 is_double = inst->src[arg].file != BAD_FILE &&
2163 type_sz(inst->src[arg].type) == 8;
2164 }
2165
2166 if (!is_double)
2167 continue;
2168
2169 /* Skip the lowering for specific regioning scenarios that we can
2170 * support natively.
2171 */
2172 bool skip_lowering = true;
2173
2174 /* XY and ZW writemasks operate in 32-bit, which means that they don't
2175 * have a native 64-bit representation and they should always be split.
2176 */
2177 if (inst->dst.writemask == WRITEMASK_XY ||
2178 inst->dst.writemask == WRITEMASK_ZW) {
2179 skip_lowering = false;
2180 } else {
2181 for (unsigned i = 0; i < 3; i++) {
2182 if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8)
2183 continue;
2184 skip_lowering = skip_lowering && is_supported_64bit_region(inst, i);
2185 }
2186 }
2187
2188 if (skip_lowering)
2189 continue;
2190
2191 /* Generate scalar instructions for each enabled channel */
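      /* Each split instruction writes a single channel and replicates the
       * matching source channel; e.g. the .z copy of an instruction with an
       * XYZW-swizzled source reads that source with the ZZZZ swizzle.
       */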
2192 for (unsigned chan = 0; chan < 4; chan++) {
2193 unsigned chan_mask = 1 << chan;
2194 if (!(inst->dst.writemask & chan_mask))
2195 continue;
2196
2197 vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
2198
2199 for (unsigned i = 0; i < 3; i++) {
2200 unsigned swz = ELK_GET_SWZ(inst->src[i].swizzle, chan);
2201 scalar_inst->src[i].swizzle = ELK_SWIZZLE4(swz, swz, swz, swz);
2202 }
2203
2204 scalar_inst->dst.writemask = chan_mask;
2205
2206 if (inst->predicate != ELK_PREDICATE_NONE) {
2207 scalar_inst->predicate =
2208 scalarize_predicate(inst->predicate, chan_mask);
2209 }
2210
2211 inst->insert_before(block, scalar_inst);
2212 }
2213
2214 inst->remove(block);
2215 progress = true;
2216 }
2217
2218 if (progress)
2219 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2220
2221 return progress;
2222 }
2223
2224 bool
2225 vec4_visitor::lower_64bit_mad_to_mul_add()
2226 {
2227 bool progress = false;
2228
2229 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2230 if (inst->opcode != ELK_OPCODE_MAD)
2231 continue;
2232
2233 if (type_sz(inst->dst.type) != 8)
2234 continue;
2235
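      /* Rewrite MAD dst, s0, s1, s2 (dst = s0 + s1 * s2 in this IR's operand
       * order) as MUL tmp, s1, s2 followed by ADD dst, tmp, s0, using a fresh
       * dvec4 temporary for the intermediate product.
       */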
2236 dst_reg mul_dst = dst_reg(this, glsl_dvec4_type());
2237
2238 /* Use the copy constructor so we copy all relevant instruction fields
2239 * from the original mad into the add and mul instructions
2240 */
2241 vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
2242 mul->opcode = ELK_OPCODE_MUL;
2243 mul->dst = mul_dst;
2244 mul->src[0] = inst->src[1];
2245 mul->src[1] = inst->src[2];
2246 mul->src[2].file = BAD_FILE;
2247
2248 vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
2249 add->opcode = ELK_OPCODE_ADD;
2250 add->src[0] = src_reg(mul_dst);
2251 add->src[1] = inst->src[0];
2252 add->src[2].file = BAD_FILE;
2253
2254 inst->insert_before(block, mul);
2255 inst->insert_before(block, add);
2256 inst->remove(block);
2257
2258 progress = true;
2259 }
2260
2261 if (progress)
2262 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2263
2264 return progress;
2265 }
2266
2267 /* The align16 hardware can only do 32-bit swizzle channels, so we need to
2268 * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
2269 * to 32-bit swizzle channels in hardware registers.
2270 *
2271 * @inst and @arg identify the original vec4 IR source operand we need to
2272  * translate the swizzle for, and @hw_reg is the hardware register where we
2273 * will write the hardware swizzle to use.
2274 *
2275 * This pass assumes that Align16/DF instructions have been fully scalarized
2276 * previously so there is just one 64-bit swizzle channel to deal with for any
2277 * given Vec4 IR source.
2278 */
2279 void
2280 vec4_visitor::apply_logical_swizzle(struct elk_reg *hw_reg,
2281 vec4_instruction *inst, int arg)
2282 {
2283 src_reg reg = inst->src[arg];
2284
2285 if (reg.file == BAD_FILE || reg.file == ELK_IMMEDIATE_VALUE)
2286 return;
2287
2288 /* If this is not a 64-bit operand or this is a scalar instruction we don't
2289 * need to do anything about the swizzles.
2290 */
2291    if (type_sz(reg.type) < 8 || is_align1_df(inst)) {
2292 hw_reg->swizzle = reg.swizzle;
2293 return;
2294 }
2295
2296 /* Take the 64-bit logical swizzle channel and translate it to 32-bit */
2297 assert(elk_is_single_value_swizzle(reg.swizzle) ||
2298 is_supported_64bit_region(inst, arg));
2299
2300 /* Apply the region <2, 2, 1> for GRF or <0, 2, 1> for uniforms, as align16
2301 * HW can only do 32-bit swizzle channels.
2302 */
2303 hw_reg->width = ELK_WIDTH_2;
2304
2305 if (is_supported_64bit_region(inst, arg) &&
2306 !is_gfx7_supported_64bit_swizzle(inst, arg)) {
2307 /* Supported 64-bit swizzles are those such that their first two
2308 * components, when expanded to 32-bit swizzles, match the semantics
2309 * of the original 64-bit swizzle with 2-wide row regioning.
2310 */
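      /* For example (illustrative): logical YXWZ has swizzle0 == 1 and
       * swizzle1 == 0, which expands to the 32-bit swizzle ZWXY and swaps the
       * two doubles within each 2-wide row.
       */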
2311 unsigned swizzle0 = ELK_GET_SWZ(reg.swizzle, 0);
2312 unsigned swizzle1 = ELK_GET_SWZ(reg.swizzle, 1);
2313 hw_reg->swizzle = ELK_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
2314 swizzle1 * 2, swizzle1 * 2 + 1);
2315 } else {
2316 /* If we got here then we have one of the following:
2317 *
2318 * 1. An unsupported swizzle, which should be single-value thanks to the
2319 * scalarization pass.
2320 *
2321 * 2. A gfx7 supported swizzle. These can be single-value or double-value
2322 * swizzles. If the latter, they are never cross-dvec2 channels. For
2323 * these we always need to activate the gfx7 vstride=0 exploit.
2324 */
2325 unsigned swizzle0 = ELK_GET_SWZ(reg.swizzle, 0);
2326 unsigned swizzle1 = ELK_GET_SWZ(reg.swizzle, 1);
2327 assert((swizzle0 < 2) == (swizzle1 < 2));
2328
2329 /* To gain access to Z/W components we need to select the second half
2330 * of the register and then use a X/Y swizzle to select Z/W respectively.
2331 */
2332 if (swizzle0 >= 2) {
2333 *hw_reg = suboffset(*hw_reg, 2);
2334 swizzle0 -= 2;
2335 swizzle1 -= 2;
2336 }
2337
2338 /* All gfx7-specific supported swizzles require the vstride=0 exploit */
2339 if (devinfo->ver == 7 && is_gfx7_supported_64bit_swizzle(inst, arg))
2340 hw_reg->vstride = ELK_VERTICAL_STRIDE_0;
2341
2342 /* Any 64-bit source with an offset at 16B is intended to address the
2343 * second half of a register and needs a vertical stride of 0 so we:
2344 *
2345 * 1. Don't violate register region restrictions.
2346 * 2. Activate the gfx7 instruction decompression bug exploit when
2347 * execsize > 4
2348 */
2349 if (hw_reg->subnr % REG_SIZE == 16) {
2350 assert(devinfo->ver == 7);
2351 hw_reg->vstride = ELK_VERTICAL_STRIDE_0;
2352 }
2353
2354 hw_reg->swizzle = ELK_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
2355 swizzle1 * 2, swizzle1 * 2 + 1);
2356 }
2357 }
2358
2359 void
2360 vec4_visitor::invalidate_analysis(elk::analysis_dependency_class c)
2361 {
2362 elk_backend_shader::invalidate_analysis(c);
2363 live_analysis.invalidate(c);
2364 }
2365
2366 bool
2367 vec4_visitor::run()
2368 {
2369 setup_push_ranges();
2370
2371 if (prog_data->base.zero_push_reg) {
2372 /* push_reg_mask_param is in uint32 params and UNIFORM is in vec4s */
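      /* For example (illustrative): mask_param == 6 selects UNIFORM vec4 1
       * and replicates its .zw pair as (z, w, z, w), matching the 64-bit
       * alignment asserted below.
       */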
2373 const unsigned mask_param = stage_prog_data->push_reg_mask_param;
2374 src_reg mask = src_reg(dst_reg(UNIFORM, mask_param / 4));
2375 assert(mask_param % 2 == 0); /* Should be 64-bit-aligned */
2376 mask.swizzle = ELK_SWIZZLE4((mask_param + 0) % 4,
2377 (mask_param + 1) % 4,
2378 (mask_param + 0) % 4,
2379 (mask_param + 1) % 4);
2380
2381 emit(ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
2382 dst_reg(VGRF, alloc.allocate(3)), mask);
2383 }
2384
2385 emit_prolog();
2386
2387 emit_nir_code();
2388 if (failed)
2389 return false;
2390 base_ir = NULL;
2391
2392 emit_thread_end();
2393
2394 calculate_cfg();
2395 cfg->validate(_mesa_shader_stage_to_abbrev(stage));
2396
2397 /* Before any optimization, push array accesses out to scratch
2398 * space where we need them to be. This pass may allocate new
2399 * virtual GRFs, so we want to do it early. It also makes sure
2400 * that we have reladdr computations available for CSE, since we'll
2401 * often do repeated subexpressions for those.
2402 */
2403 move_grf_array_access_to_scratch();
2404 split_uniform_registers();
2405
2406 split_virtual_grfs();
2407
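/* Run an optimization pass: execute it, dump the IR to a numbered file when
 * the optimizer debug flag is set and the pass made progress, re-validate the
 * CFG, and fold the result into 'progress' while also evaluating to this
 * pass's own progress flag.
 */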
2408 #define OPT(pass, args...) ({ \
2409 pass_num++; \
2410 bool this_progress = pass(args); \
2411 \
2412 if (INTEL_DEBUG(DEBUG_OPTIMIZER) && this_progress) { \
2413 char filename[64]; \
2414 snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \
2415 _mesa_shader_stage_to_abbrev(stage), \
2416 nir->info.name, iteration, pass_num); \
2417 \
2418 elk_backend_shader::dump_instructions(filename); \
2419 } \
2420 \
2421 cfg->validate(_mesa_shader_stage_to_abbrev(stage)); \
2422 progress = progress || this_progress; \
2423 this_progress; \
2424 })
2425
2426
2427 if (INTEL_DEBUG(DEBUG_OPTIMIZER)) {
2428 char filename[64];
2429 snprintf(filename, 64, "%s-%s-00-00-start",
2430 _mesa_shader_stage_to_abbrev(stage), nir->info.name);
2431
2432 elk_backend_shader::dump_instructions(filename);
2433 }
2434
2435 bool progress;
2436 int iteration = 0;
2437 int pass_num = 0;
2438 do {
2439 progress = false;
2440 pass_num = 0;
2441 iteration++;
2442
2443 OPT(elk_opt_predicated_break, this);
2444 OPT(opt_reduce_swizzle);
2445 OPT(dead_code_eliminate);
2446 OPT(elk_dead_control_flow_eliminate, this);
2447 OPT(opt_copy_propagation);
2448 OPT(opt_cmod_propagation);
2449 OPT(opt_cse);
2450 OPT(opt_algebraic);
2451 OPT(opt_register_coalesce);
2452 OPT(eliminate_find_live_channel);
2453 } while (progress);
2454
2455 pass_num = 0;
2456
2457 if (OPT(opt_vector_float)) {
2458 OPT(opt_cse);
2459 OPT(opt_copy_propagation, false);
2460 OPT(opt_copy_propagation, true);
2461 OPT(dead_code_eliminate);
2462 }
2463
2464 if (devinfo->ver <= 5 && OPT(lower_minmax)) {
2465 OPT(opt_cmod_propagation);
2466 OPT(opt_cse);
2467 OPT(opt_copy_propagation);
2468 OPT(dead_code_eliminate);
2469 }
2470
2471 if (OPT(lower_simd_width)) {
2472 OPT(opt_copy_propagation);
2473 OPT(dead_code_eliminate);
2474 }
2475
2476 if (failed)
2477 return false;
2478
2479 OPT(lower_64bit_mad_to_mul_add);
2480
2481 /* Run this before payload setup because tessellation shaders
2482 * rely on it to prevent cross dvec2 regioning on DF attributes
2483     * that are set up so that XY are in the second half of a register and
2484 * ZW are in the first half of the next.
2485 */
2486 OPT(scalarize_df);
2487
2488 setup_payload();
2489
2490 if (INTEL_DEBUG(DEBUG_SPILL_VEC4)) {
2491 /* Debug of register spilling: Go spill everything. */
2492 const int grf_count = alloc.count;
2493 float spill_costs[alloc.count];
2494 bool no_spill[alloc.count];
2495 evaluate_spill_costs(spill_costs, no_spill);
2496 for (int i = 0; i < grf_count; i++) {
2497 if (no_spill[i])
2498 continue;
2499 spill_reg(i);
2500 }
2501
2502 /* We want to run this after spilling because 64-bit (un)spills need to
2503 * emit code to shuffle 64-bit data for the 32-bit scratch read/write
2504 * messages that can produce unsupported 64-bit swizzle regions.
2505 */
2506 OPT(scalarize_df);
2507 }
2508
2509 fixup_3src_null_dest();
2510
2511 bool allocated_without_spills = reg_allocate();
2512
2513 if (!allocated_without_spills) {
2514 elk_shader_perf_log(compiler, log_data,
2515 "%s shader triggered register spilling. "
2516 "Try reducing the number of live vec4 values "
2517 "to improve performance.\n",
2518 _mesa_shader_stage_to_string(stage));
2519
2520 while (!reg_allocate()) {
2521 if (failed)
2522 return false;
2523 }
2524
2525 /* We want to run this after spilling because 64-bit (un)spills need to
2526 * emit code to shuffle 64-bit data for the 32-bit scratch read/write
2527 * messages that can produce unsupported 64-bit swizzle regions.
2528 */
2529 OPT(scalarize_df);
2530 }
2531
2532 opt_schedule_instructions();
2533
2534 opt_set_dependency_control();
2535
2536 convert_to_hw_regs();
2537
2538 if (last_scratch > 0) {
2539 prog_data->base.total_scratch =
2540 elk_get_scratch_size(last_scratch * REG_SIZE);
2541 }
2542
2543 return !failed;
2544 }
2545
2546 } /* namespace elk */
2547
2548 extern "C" {
2549
2550 const unsigned *
2551 elk_compile_vs(const struct elk_compiler *compiler,
2552 struct elk_compile_vs_params *params)
2553 {
2554 struct nir_shader *nir = params->base.nir;
2555 const struct elk_vs_prog_key *key = params->key;
2556 struct elk_vs_prog_data *prog_data = params->prog_data;
2557 const bool debug_enabled =
2558 elk_should_print_shader(nir, params->base.debug_flag ?
2559 params->base.debug_flag : DEBUG_VS);
2560
2561 prog_data->base.base.stage = MESA_SHADER_VERTEX;
2562 prog_data->base.base.total_scratch = 0;
2563
2564 const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
2565 elk_nir_apply_key(nir, compiler, &key->base, 8);
2566
2567 const unsigned *assembly = NULL;
2568
2569 prog_data->inputs_read = nir->info.inputs_read;
2570 prog_data->double_inputs_read = nir->info.vs.double_inputs;
2571
2572 elk_nir_lower_vs_inputs(nir, params->edgeflag_is_last, key->gl_attrib_wa_flags);
2573 elk_nir_lower_vue_outputs(nir);
2574 elk_postprocess_nir(nir, compiler, debug_enabled,
2575 key->base.robust_flags);
2576
2577 prog_data->base.clip_distance_mask =
2578 ((1 << nir->info.clip_distance_array_size) - 1);
2579 prog_data->base.cull_distance_mask =
2580 ((1 << nir->info.cull_distance_array_size) - 1) <<
2581 nir->info.clip_distance_array_size;
2582
2583 unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
2584
2585 /* gl_VertexID and gl_InstanceID are system values, but arrive via an
2586 * incoming vertex attribute. So, add an extra slot.
2587 */
2588 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
2589 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
2590 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
2591 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID)) {
2592 nr_attribute_slots++;
2593 }
2594
2595    /* gl_DrawID and IsIndexedDraw share their very own vec4 */
2596 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID) ||
2597 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
2598 nr_attribute_slots++;
2599 }
2600
2601 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW))
2602 prog_data->uses_is_indexed_draw = true;
2603
2604 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX))
2605 prog_data->uses_firstvertex = true;
2606
2607 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE))
2608 prog_data->uses_baseinstance = true;
2609
2610 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE))
2611 prog_data->uses_vertexid = true;
2612
2613 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID))
2614 prog_data->uses_instanceid = true;
2615
2616 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
2617 prog_data->uses_drawid = true;
2618
2619 /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
2620 * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
2621 * vec4 mode, the hardware appears to wedge unless we read something.
2622 */
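   /* For example, 5 attribute slots give DIV_ROUND_UP(5, 2) = 3 on both
    * paths, while 0 slots give 0 in SIMD8 mode but are clamped to
    * DIV_ROUND_UP(1, 2) = 1 in vec4 mode, per the note above.
    */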
2623 if (is_scalar)
2624 prog_data->base.urb_read_length =
2625 DIV_ROUND_UP(nr_attribute_slots, 2);
2626 else
2627 prog_data->base.urb_read_length =
2628 DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
2629
2630 prog_data->nr_attribute_slots = nr_attribute_slots;
2631
2632 /* Since vertex shaders reuse the same VUE entry for inputs and outputs
2633 * (overwriting the original contents), we need to make sure the size is
2634 * the larger of the two.
2635 */
2636 const unsigned vue_entries =
2637 MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
2638
2639 if (compiler->devinfo->ver == 6) {
2640 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
2641 } else {
2642 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
2643 }
2644
2645 if (unlikely(debug_enabled)) {
2646 fprintf(stderr, "VS Output ");
2647 elk_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
2648 }
2649
2650 if (is_scalar) {
2651 const unsigned dispatch_width = 8;
2652 prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
2653
2654 elk_fs_visitor v(compiler, ¶ms->base, &key->base,
2655 &prog_data->base.base, nir, dispatch_width,
2656 params->base.stats != NULL, debug_enabled);
2657 if (!v.run_vs()) {
2658 params->base.error_str =
2659 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
2660 return NULL;
2661 }
2662
2663 assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
2664 prog_data->base.base.dispatch_grf_start_reg =
2665 v.payload().num_regs / reg_unit(compiler->devinfo);
2666
2667 elk_fs_generator g(compiler, ¶ms->base,
2668 &prog_data->base.base, v.runtime_check_aads_emit,
2669 MESA_SHADER_VERTEX);
2670 if (unlikely(debug_enabled)) {
2671 const char *debug_name =
2672 ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
2673 nir->info.label ? nir->info.label :
2674 "unnamed",
2675 nir->info.name);
2676
2677 g.enable_debug(debug_name);
2678 }
2679 g.generate_code(v.cfg, dispatch_width, v.shader_stats,
2680 v.performance_analysis.require(), params->base.stats);
2681 g.add_const_data(nir->constant_data, nir->constant_data_size);
2682 assembly = g.get_assembly();
2683 }
2684
2685 if (!assembly) {
2686 prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
2687
2688 vec4_vs_visitor v(compiler, ¶ms->base, key, prog_data,
2689 nir, debug_enabled);
2690 if (!v.run()) {
2691 params->base.error_str =
2692 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
2693 return NULL;
2694 }
2695
2696 assembly = elk_vec4_generate_assembly(compiler, ¶ms->base,
2697 nir, &prog_data->base,
2698 v.cfg,
2699 v.performance_analysis.require(),
2700 debug_enabled);
2701 }
2702
2703 return assembly;
2704 }
2705
2706 } /* extern "C" */
2707